## Task Definition

- Regression Task with RMSE as metric
- Target is median value of owner-occupied homes in $1000s 'medv'

## Imports

In [1]:
import os

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Data preprocessing
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
#from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor

import xgboost as xgb
import lightgbm as lgb

# Sagemaker
# import boto3
# import joblib
# from sagemaker.sklearn.estimator import SKLearn
# import sagemaker
# Data profiling
from ydata_profiling import ProfileReport

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Ensure plots are displayed inline in Jupyter Notebook
%matplotlib inline

# Set a consistent visual style
sns.set(style="whitegrid")

  from .autonotebook import tqdm as notebook_tqdm


## Data Collection and Loading



In [2]:
# Load the dataset
df = pd.read_csv('./data/raw/train.csv')

# drop ID
df.drop('ID', axis=1, inplace=True)

# Define numeric and categorical features

numeric_features = df.select_dtypes(include=['number']).columns
catergorical_features = df.select_dtypes(include=['object', 'category', 'bool']).columns


print(f'Shape of df: {df.shape}')
print(f'Amount of numeric features: {numeric_features.shape[0]}')
print(f'Amount of categorical features: {catergorical_features.shape[0]}')

print('----------------------------------------------------------------')

df.info()

Shape of df: (333, 14)
Amount of numeric features: 14
Amount of categorical features: 0
----------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     333 non-null    float64
 1   zn       333 non-null    float64
 2   indus    333 non-null    float64
 3   chas     333 non-null    int64  
 4   nox      333 non-null    float64
 5   rm       333 non-null    float64
 6   age      333 non-null    float64
 7   dis      333 non-null    float64
 8   rad      333 non-null    int64  
 9   tax      333 non-null    int64  
 10  ptratio  333 non-null    float64
 11  black    333 non-null    float64
 12  lstat    333 non-null    float64
 13  medv     333 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 36.5 KB


In [3]:
# Assuming 'df' is your existing DataFrame
np.random.seed(42)  # Setting a seed for reproducibility

# Define the color categories
colors = ['Red', 'Green', 'Blue']

# Add a new column 'color' with random color values
df['color'] = np.random.choice(colors, size=len(df))

# Display the first few rows to verify
print(df.head())



# Define numeric and categorical features

numeric_features = df.select_dtypes(include=['number']).columns
catergorical_features = df.select_dtypes(include=['object', 'category', 'bool']).columns


print(f'Shape of df: {df.shape}')
print(f'Amount of numeric features: {numeric_features.shape[0]}')
print(f'Amount of categorical features: {catergorical_features.shape[0]}')

print('----------------------------------------------------------------')

df.info()

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
3  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   
4  0.08829  12.5   7.87     0  0.524  6.012  66.6  5.5605    5  311     15.2   

    black  lstat  medv color  
0  396.90   4.98  24.0  Blue  
1  396.90   9.14  21.6   Red  
2  394.63   2.94  33.4  Blue  
3  396.90   5.33  36.2  Blue  
4  395.60  12.43  22.9   Red  
Shape of df: (333, 15)
Amount of numeric features: 14
Amount of categorical features: 1
----------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  

## EDA

In [4]:
# # Exploring both datasets with ydata_profiling library
# profile_df = ProfileReport(df)

# # Create directory for reports
# if not os.path.exists('./reports'):
#     os.makedirs('./reports')

# # Save as html file
# profile_df.to_file('./reports/df_report.html')

##### Notes

- No missing values
- data incosistency detected
- outlier detected

In [5]:
# Create a histogram with additional styling
fig = px.histogram(
    df, 
    x="medv", 
    title="Distribution of Median Value of Owner-Occupied Homes",
    nbins=30,  # Adjust the number of bins
    color_discrete_sequence=["#636EFA"]  # Custom color for the bars
)

# Update layout for better aesthetics
fig.update_layout(
    title_font_size=20,  # Increase the font size of the title
    title_x=0.5,  # Center the title
    xaxis_title="Median Value (in $1000s)",  # Add x-axis label
    yaxis_title="Frequency",  # Add y-axis label
    bargap=0.1,  # Adjust the gap between bars
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    xaxis=dict(
        showgrid=False  # Hide grid lines on the x-axis
    ),
    yaxis=dict(
        showgrid=True,  # Show grid lines on the y-axis
        gridcolor="rgba(200, 200, 200, 0.5)"  # Light gray grid lines
    )
)
fig.show()


## Data Cleaning

##### Missing Values

In [6]:
# Show missing values for df
missing_values = df.isnull().sum() / len(df)
missing_values_df = pd.DataFrame(missing_values, columns=['Missing Values (%)']).sort_values(by='Missing Values (%)', ascending=False)
missing_values_df.T

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,color
Missing Values (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Data Inconsistency

In [7]:
# Transfer feature 'chas' to boolean
df['chas'] = df['chas'].astype(bool)

In [8]:
numeric_features = df.select_dtypes(include=['number']).columns
catergorical_features = df.select_dtypes(include=['object', 'category', 'bool']).columns

# print categorical features

print(f'Amount of numeric features: {numeric_features.shape[0]}')
print(f'Amount of categorical features: {catergorical_features.shape[0]}')

print('----------------------------------------------------------------')


Amount of numeric features: 13
Amount of categorical features: 2
----------------------------------------------------------------


##### Outlier Detection and Treatment

In [9]:
# Step 1: Select only numeric columns
numeric_df = df.select_dtypes(include=['number'])

# Step 2: Calculate skewness for numeric columns
skewness_before = numeric_df.skew()

# Step 3: Identify columns with absolute skewness > 1.5
skewed_columns = skewness_before[abs(skewness_before) > 1.5].index

# Step 4: Apply 5% / 95% percentile capping
for col in skewed_columns:
    lower_bound = numeric_df[col].quantile(0.10)
    upper_bound = numeric_df[col].quantile(0.90)
    numeric_df[col] = numeric_df[col].clip(lower=lower_bound, upper=upper_bound)

# Step 5: Calculate skewness after capping
skewness_after = numeric_df.skew()

# Step 6: Combine numeric and non-numeric columns back together
df = pd.concat([numeric_df, df.select_dtypes(exclude=['number'])], axis=1)

# Display the skewness before and after
skewness_table = pd.DataFrame({
    'Skewness Before': skewness_before,
    'Skewness After': skewness_after
})

# Show the skewness table sorted after absolute skewness in descending order
skewness_table.sort_values(by='Skewness After', ascending=False)


Unnamed: 0,Skewness Before,Skewness After
zn,2.374052,1.483315
crim,4.598981,1.360292
medv,1.122472,1.122472
rad,0.983258,0.983258
lstat,0.978328,0.978328
dis,0.938143,0.938143
nox,0.705552,0.705552
tax,0.633027,0.633027
indus,0.290434,0.290434
rm,0.284028,0.284028


##### Duplicates

In [10]:
# Check for duplicates in df
duplicate_rows = df[df.duplicated()]
# Display the duplicate rows
print("Duplicate Rows:")
print(duplicate_rows)


Duplicate Rows:
Empty DataFrame
Columns: [crim, zn, indus, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv, chas, color]
Index: []


#### Save Cleaned Data

In [11]:
# Save cleaned dataframes to csv files
df.to_csv('./data/processed/train_cleaned.csv', index=False)

## Data Preperation


##### Train-Validation-Test Split

In [12]:
# Assuming 'df' is your DataFrame and 'MEDV' is the target variable
X = df.drop(columns=['medv'])  # Features
y = df['medv']  # Target

# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### Encoding categorical variables

In [13]:
X_train = pd.get_dummies(X_train, columns=['color'], drop_first=True)  # drop_first=True to avoid multicollinearity
X_test = pd.get_dummies(X_test, columns=['color'], drop_first=True)

##### Feature Engineering

##### Feature Selection

In [14]:
# Step 1: Fit a RandomForestRegressor to get feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Step 2: Visualize feature importances
importances = rf.feature_importances_

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Visualize the top 20 features
fig = px.bar(
    feature_importance_df.head(20),  # Display the top 20 features
    x="Importance", 
    y="Feature", 
    orientation='h',
    title="Top 20 Feature Importances from RandomForestRegressor",
    color_discrete_sequence=["#636EFA"]
)
fig.update_layout(
    title_font_size=20,
    title_x=0.5,
    xaxis_title="Importance",
    yaxis_title="Feature",
    bargap=0.1,
    plot_bgcolor="#f9f9f9",
    paper_bgcolor="#ffffff",
    font=dict(
        family="Arial, sans-serif",
        size=14,
        color="#333333"
    ),
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=True, gridcolor="rgba(200, 200, 200, 0.5)")
)
fig.update_yaxes(autorange="reversed")
fig.show()

In [15]:
def select_features(selector, X_train, X_test, y_train):
    """
    Apply a feature selection method to the training and test datasets.
    Parameters:
    - selector: The SelectFromModel object (or any feature selector)
    - X_train: The original training DataFrame before feature selection
    - X_test: The original test DataFrame before feature selection
    - y_train: The target variable for training data (used to fit the selector)
    
    Returns:
    - X_train_selected_df: DataFrame with selected features from the training data
    - X_test_selected_df: DataFrame with selected features from the test data
    """

    # Fit the selector on the training data
    selector.fit(X_train, y_train)

    # Transform both training and test data
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Get the selected feature names
    selected_feature_names = X_train.columns[selector.get_support()]

    # Convert the NumPy arrays back to DataFrames with the original column names
    X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_feature_names, index=X_train.index)
    X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_feature_names, index=X_test.index)

    return X_train_selected_df, X_test_selected_df


# Apply feature selection
selector = SelectFromModel(RandomForestRegressor(random_state=42), threshold=0.01)
X_train_selected, X_test_selected = select_features(selector, X_train, X_test, y_train)

# Get the names of the selected features
selected_features = X_train.columns[selector.get_support()]
print(f"Selected Features: {list(selected_features)}")

# Train the model using only the selected features
model_selected_features = RandomForestRegressor(n_estimators=100, random_state=42)
model_selected_features.fit(X_train_selected, y_train)
y_pred_selected = model_selected_features.predict(X_test_selected)

# Evaluate the model using RMSE with selected features
rmse_selected = np.sqrt(mean_squared_error(y_test, y_pred_selected))

# Comparing the RMSE before and after feature selection
y_pred = rf.predict(X_test) 
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'RMSE without Feature Selection: {rmse}')
print(f"RMSE with Feature Selection: {rmse_selected}")


Selected Features: ['crim', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
RMSE without Feature Selection: 2.928151750104798
RMSE with Feature Selection: 2.8346771960789736


##### Scaling

In [16]:
X_train.shape

(266, 15)

In [17]:
# Identify numeric features

numeric_features = X_train_selected.select_dtypes(include=['number']).columns

# Define the ColumnTransformer to apply scaling only to numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)  # Apply StandardScaler only to numeric features
    ],
    remainder='passthrough'  # Keep the non-numeric features as they are
)

# Step 3: Create a pipeline that includes the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Step 4: Fit and transform the training data
X_train_scaled = pipeline.fit_transform(X_train)
X_test_scaled = pipeline.transform(X_test)

# # Convert the results back to DataFrames, if necessary, using the original DataFrame's columns
columns = numeric_features.tolist() + X_train.drop(columns=numeric_features).columns.tolist()

X_train_scaled = pd.DataFrame(X_train_scaled, columns=columns, index=X_train.index)
#X_test_scaled = pd.DataFrame(X_test_scaled, columns=columns, index=X_test.index)


In [18]:
X_train_scaled.head()

Unnamed: 0,crim,nox,rm,age,dis,tax,ptratio,lstat,zn,indus,rad,black,chas,color_Green,color_Red
224,-0.642306,-0.359449,-0.407144,-0.788644,0.537799,-1.071847,0.808588,-0.376277,0.0,5.19,5,396.9,False,False,True
78,-0.614307,-0.087649,-0.35584,0.52992,-0.497329,0.15879,-0.321453,-0.295354,0.0,10.01,6,394.51,False,False,False
295,0.727547,1.322315,0.362419,0.668157,-0.653541,1.543256,0.808588,0.836116,0.0,18.1,24,289.65,False,True,False
17,-0.379615,-0.164092,-0.318787,0.749681,0.357925,-0.580776,1.185268,0.713287,0.0,8.14,4,306.38,False,True,False
24,-0.617072,-0.92853,-0.144923,-2.163922,0.99335,-1.018598,-0.274368,-0.94418,0.0,6.91,3,383.37,False,False,True


## Initial Model Comparison

##### Model Selection

In [19]:
# Regressions
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}

##### Cross-Validation:

In [20]:
# Basic model selection for Regression and Classification

# Regressions
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}

# Custom RMSE scorer
rmse_scorer = make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)))

# Perform cross-validation and store results
results = {}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    results[name] = [-scores.mean()] + list(-scores)

# Create a DataFrame for the results
results_df = pd.DataFrame(results, index=['Mean'] + [f'Fold {i+1}' for i in range(len(scores))])

# Transpose the DataFrame to get the desired format
results_df = results_df.T

# Display the results table
results_df.sort_values('Mean', ascending=True)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 541
[LightGBM] [Info] Number of data points in the train set: 212, number of used features: 14
[LightGBM] [Info] Start training from score 23.100000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 544
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 14
[LightGBM] [Info] Start training from score 22.558685
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 550
[LightGBM] [Info] Number of data points in the train set: 213

Unnamed: 0,Mean,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
Gradient Boosting,3.569471,3.240014,3.586354,4.092286,3.299349,3.629352
Random Forest,3.619518,3.128734,3.862484,4.424936,2.806168,3.875266
LightGBM,3.918672,3.362871,4.554707,4.301034,3.105963,4.268784
XGBoost,4.096307,3.199355,4.598109,4.427498,3.110659,5.145914
Decision Tree,4.5951,4.167889,3.760896,5.187849,3.571744,6.287124
Linear Regression,4.942933,3.92878,5.664248,5.891105,3.516651,5.71388


In [21]:
# Remove the 'Mean' column to focus only on folds
results_df_no_mean = results_df.drop(columns=['Mean'])

# Initialize a Plotly figure
fig = go.Figure()

# Add a line for each model
for index, row in results_df_no_mean.iterrows():
    fig.add_trace(go.Scatter(
        x=row.index,  # Fold columns
        y=row.values,  # Values for each fold
        mode='lines+markers',
        name=index  # The name of the model
    ))

# Update layout to make the plot more readable
fig.update_layout(
    title="Model Performance Across Folds",
    xaxis_title="Folds",
    yaxis_title="Score",
    legend_title="Models",
    xaxis=dict(type='category'),
)

# Display the plot
fig.show()


## Hyperparameter Tuning

##### Tuning with GridSearchCV

In [22]:
# Define the models and their respective parameter grids
models = {
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200], 
            'max_depth': [1, 3], 
            'min_samples_split': [1, 2], 
            'min_samples_leaf': [1, 2], 
            'bootstrap': [True, False]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200], 
            'learning_rate': [0.01, 0.1], 
            'max_depth': [3], 
            'min_samples_split': [2], 
            'min_samples_leaf': [1, 2], 
            'subsample': [1.0]
        }
    },
    'XGBoost': {
        'model': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
        'param_grid': {
            'n_estimators': [100, 200], 
            'learning_rate': [0.01, 0.1], 
            'max_depth': [3], 
            'subsample': [1.0],
            'colsample_bytree': [1.0]
        }
    }
}

# Define the RMSE scorer
rmse_scorer = make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False)

# Dictionary to store results
results = {}

# Define KFold with random_state
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform GridSearchCV for each model
for name, model_info in models.items():
    print(f"Performing GridSearchCV for {name}")
    
    grid_search = GridSearchCV(estimator=model_info['model'], 
                               param_grid=model_info['param_grid'], 
                               scoring=rmse_scorer, 
                               cv=kf, 
                               n_jobs=-1, 
                               verbose=2
                               )
    
    grid_search.fit(X_train, y_train)
    
    # Store the best model and its evaluation results
    best_model = grid_search.best_estimator_
    y_pred_best = best_model.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_best))
    
    results[name] = {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'best_rmse_cv': -grid_search.best_score_,
        'test_rmse': rmse_test,
        'y_pred': y_pred_best
    }
    
    results[name] = {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'best_rmse_cv': -grid_search.best_score_
    }

# Create a DataFrame from the results dictionary
results_df = pd.DataFrame({
    'Model': [name for name in results.keys()],
    'Best Parameters': [result['best_params'] for result in results.values()],
    'CV RMSE': [result['best_rmse_cv'] for result in results.values()]
})

# Display the DataFrame
results_df = results_df.set_index('Model')
results_df.sort_values(by='CV RMSE', ascending=True)


Performing GridSearchCV for RandomForest
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Performing GridSearchCV for GradientBoosting
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Performing GridSearchCV for XGBoost
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0_level_0,Best Parameters,CV RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'min_sa...",3.374359
XGBoost,"{'colsample_bytree': 1.0, 'learning_rate': 0.1...",3.720688
RandomForest,"{'bootstrap': True, 'max_depth': 3, 'min_sampl...",4.107059


In [23]:
print(X_train.shape)
print(X_test.shape)

# print X_train and X_test

X_train.head()

(266, 15)
(67, 15)


Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,ptratio,black,lstat,chas,color_Green,color_Red
224,0.05497,0.0,5.19,0.515,5.985,45.4,4.8122,5,224,20.2,396.9,9.74,False,False,True
78,0.15098,0.0,10.01,0.547,6.021,82.6,2.7474,6,432,17.8,394.51,10.3,False,False,False
295,4.75237,0.0,18.1,0.713,6.525,86.5,2.4358,24,666,20.2,289.65,18.13,False,True,False
17,0.95577,0.0,8.14,0.538,6.047,88.8,4.4534,4,307,21.0,306.38,17.28,False,True,False
24,0.1415,0.0,6.91,0.448,6.169,6.6,5.7209,3,233,17.9,383.37,5.81,False,False,True


## Evaluation

##### Test Set Evaluation


In [24]:
X_train.head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,ptratio,black,lstat,chas,color_Green,color_Red
224,0.05497,0.0,5.19,0.515,5.985,45.4,4.8122,5,224,20.2,396.9,9.74,False,False,True
78,0.15098,0.0,10.01,0.547,6.021,82.6,2.7474,6,432,17.8,394.51,10.3,False,False,False
295,4.75237,0.0,18.1,0.713,6.525,86.5,2.4358,24,666,20.2,289.65,18.13,False,True,False
17,0.95577,0.0,8.14,0.538,6.047,88.8,4.4534,4,307,21.0,306.38,17.28,False,True,False
24,0.1415,0.0,6.91,0.448,6.169,6.6,5.7209,3,233,17.9,383.37,5.81,False,False,True


In [25]:
X_test.head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,ptratio,black,lstat,chas,color_Green,color_Red
25,0.15936,0.0,6.91,0.448,6.211,6.5,5.7209,3,233,17.9,394.46,7.44,False,False,True
309,4.03841,0.0,18.1,0.532,6.229,90.7,3.0993,24,666,20.2,395.33,12.87,False,False,True
73,0.12802,0.0,8.56,0.52,6.474,97.1,2.4329,5,384,20.9,395.24,12.27,False,True,False
195,0.03615,39.0,4.95,0.411,6.63,23.4,5.1167,4,245,19.2,396.9,4.7,False,True,False
57,0.05059,0.0,4.49,0.449,6.389,48.0,4.7794,3,247,18.5,396.9,9.62,False,False,True


In [26]:
X_train_selected, X_test_selected = select_features(selector, X_train, X_test, y_train)

In [27]:

pipeline = Pipeline(steps=[
    #('feature_selection', SelectFromModel(rf, threshold=0.01, prefit=True)),
    ('preprocessor', preprocessor),  # Apply scaling after feature selection
    ('model', best_model)
])



# Step 2: Fit the pipeline on the training data
pipeline.fit(X_train_selected, y_train)

# Step 3: Predict using the pipeline on the test data
y_pred = pipeline.predict(X_test)

# Step 4: Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test Set RMSE with Feature Selection, Scaling, and Model Pipeline: {rmse}")

Test Set RMSE with Feature Selection, Scaling, and Model Pipeline: 2.522739382023388


In [28]:

# Define the models
models = {
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}

# Define the parameter grid (same for all models for simplicity)
param_grid = {
    'n_estimators': [100, 200, 500], 
    'learning_rate': [0.01, 0.1], 
    'max_depth': [3], 
    'min_samples_split': [2], 
    'min_samples_leaf': [1, 2], 
    'subsample': [1.0] 
}

# Define the RMSE scorer
rmse_scorer = make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False)

# Dictionary to store results
results = {}

# Perform GridSearchCV for each model
for name, model in models.items():
    print(f"Performing GridSearchCV for {name}")
    
    grid_search = GridSearchCV(estimator=model, 
                               param_grid=param_grid, 
                               scoring=rmse_scorer, 
                               cv=5, 
                               n_jobs=-1, 
                               verbose=2)
    
    grid_search.fit(X_train, y_train)
    
    results[name] = {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'best_rmse_cv': -grid_search.best_score_
    }

# Create a DataFrame from the results dictionary
results_df = pd.DataFrame({
    'Model': [name for name in results.keys()],
    'Best Parameters': [result['best_params'] for result in results.values()],
    'CV RMSE': [result['best_rmse_cv'] for result in results.values()]
})

# Display the DataFrame
results_df = results_df.set_index('Model')
results_df


Performing GridSearchCV for GradientBoosting
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Performing GridSearchCV for XGBoost
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Performing GridSearchCV for LightGBM
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 656
[LightGBM] [Info] Number of data points in the train set: 266, number of used features: 15
[LightGBM] [Info] Start training from score 22.868421


Unnamed: 0_level_0,Best Parameters,CV RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'min_sa...",3.514286
XGBoost,"{'learning_rate': 0.1, 'max_depth': 3, 'min_sa...",3.743392
LightGBM,"{'learning_rate': 0.1, 'max_depth': 3, 'min_sa...",3.599789


In [29]:
import plotly.express as px
import plotly.graph_objs as go

# Assuming results dictionary and results_df from your previous code

# Step 1: Fit the best models on the entire training data
for name, result in results.items():
    best_model = models[name].set_params(**result['best_params'])
    best_model.fit(X_train, y_train)
    results[name]['best_model'] = best_model

# Step 2: Make predictions on the test data
for name, result in results.items():
    y_pred = result['best_model'].predict(X_test)
    results[name]['y_pred'] = y_pred

# Step 3: Create the Plotly plot with predictions from all models
fig = go.Figure()

# Add traces for each model's predictions
for name, result in results.items():
    fig.add_trace(go.Scatter(
        x=y_test, 
        y=result['y_pred'], 
        mode='markers',
        name=f'{name} Predictions',
        marker=dict(
            size=6,
            line=dict(width=1),
        )
    ))

# Add a diagonal line to indicate where predicted = actual
fig.add_shape(
    type="line",
    x0=min(y_test), x1=max(y_test),
    y0=min(y_test), y1=max(y_test),
    line=dict(color="red", dash="dash")
)

# Update the layout for better aesthetics
fig.update_layout(
    title="Predicted vs. Actual Values",
    xaxis_title="Actual Values",
    yaxis_title="Predicted Values",
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    title_x=0.5,  # Center the title
    font=dict(
        family="Arial, sans-serif",  # Set the font family
        size=14,  # Set the font size
        color="#333333"  # Set the font color
    )
)

# Show the plot
fig.show()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 656
[LightGBM] [Info] Number of data points in the train set: 266, number of used features: 15
[LightGBM] [Info] Start training from score 22.868421


In [30]:
# Get the names of the selected features after feature selection
selected_features = X_train_selected.columns

# Get the importances of the selected features
name = 'GradientBoosting'
best_gbr = models[name].set_params(**result['best_params'])
best_gbr.fit(X_train_selected, y_train)

importances = best_gbr.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Create a bar chart for feature importances
fig = px.bar(
    feature_importance_df, 
    x='Importance', 
    y='Feature', 
    orientation='h', 
    title="Feature Importance",
    color_discrete_sequence=["#636EFA"]  # Custom color
)

# Update layout for better aesthetics
fig.update_layout(
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    title_x=0.5,  # Center the title
    font=dict(
        family="Arial, sans-serif",  # Set the font family
        size=14,  # Set the font size
        color="#333333"  # Set the font color
    ),
    yaxis=dict(
        categoryorder='total ascending'  # Order the features by importance
    )
)

fig.show()
