## Task Definition

## Imports

In [296]:
import os

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


# Data preprocessing
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, KFold

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor


import xgboost as xgb
import lightgbm as lgb

# Sagemaker
import boto3
import joblib
from sagemaker.sklearn.estimator import SKLearn
import sagemaker
# Data profiling
from ydata_profiling import ProfileReport

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Ensure plots are displayed inline in Jupyter Notebook
%matplotlib inline

# Set a consistent visual style
sns.set(style="whitegrid")

## Data Collection and Loading



In [297]:
# Load the dataset
df = pd.read_csv('data/raw/data_students.csv')

# Define numeric and categorical features

numeric_features = df.select_dtypes(include=['number']).columns
catergorical_features = df.select_dtypes(include=['object', 'category', 'bool']).columns


print(f'Shape of df: {df.shape}')
print(f'Amount of numeric features: {numeric_features.shape[0]}')
print(f'Amount of categorical features: {catergorical_features.shape[0]}')

print('----------------------------------------------------------------')

df.info()

Shape of df: (6607, 20)
Amount of numeric features: 7
Amount of categorical features: 13
----------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teache

## EDA

In [298]:
# # Exploring both datasets with ydata_profiling library
# profile_df = ProfileReport(df)

# # Create directory for reports
# if not os.path.exists('./reports'):
#     os.makedirs('./reports')

# # Save as html file
# profile_df.to_file('./reports/df_report.html')

In [299]:
# Create a histogram with additional styling
fig = px.histogram(
    df, 
    x="Exam_Score", 
    title="Distribution of Median Value of Owner-Occupied Homes",
    nbins=30,  # Adjust the number of bins
    color_discrete_sequence=["#636EFA"]  # Custom color for the bars
)

# Update layout for better aesthetics
fig.update_layout(
    title_font_size=20,  # Increase the font size of the title
    title_x=0.5,  # Center the title
    xaxis_title="Exam Score)",  # Add x-axis label
    yaxis_title="Frequency",  # Add y-axis label
    bargap=0.1,  # Adjust the gap between bars
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    xaxis=dict(
        showgrid=False  # Hide grid lines on the x-axis
    ),
    yaxis=dict(
        showgrid=True,  # Show grid lines on the y-axis
        gridcolor="rgba(200, 200, 200, 0.5)"  # Light gray grid lines
    )
)
fig.show()

## Data Cleaning

##### Missing Values

In [300]:
# Show missing values for df
missing_values = df.isnull().sum() / len(df)
missing_values_df = pd.DataFrame(missing_values, columns=['Missing Values (%)']).sort_values(by='Missing Values (%)', ascending=False)


# Just show columns with missing values

missing_values_df.style.background_gradient(cmap='Reds')

# Columns with missing values
columns_missing = missing_values_df[missing_values_df['Missing Values (%)'] > 0]


# Impute all the missing values with most occuring category values

for column in columns_missing.index:
    df[column].fillna(df[column].mode()[0], inplace=True)

##### Outlier Detection and Treatment

In [301]:
# # Step 1: Select only numeric columns
# numeric_df = df.select_dtypes(include=['number'])

# # Step 2: Calculate skewness for numeric columns
# skewness_before = numeric_df.skew()

# # Step 3: Identify columns with absolute skewness > 1.5
# skewed_columns = skewness_before[abs(skewness_before) > 1.5].index

# # Step 4: Apply 5% / 95% percentile capping
# for col in skewed_columns:
#     lower_bound = numeric_df[col].quantile(0.01)
#     upper_bound = numeric_df[col].quantile(0.99)
#     numeric_df[col] = numeric_df[col].clip(upper=upper_bound, lower=lower_bound)

# # Step 5: Calculate skewness after capping
# skewness_after = numeric_df.skew()

# # Step 6: Combine numeric and non-numeric columns back together
# df = pd.concat([numeric_df, df.select_dtypes(exclude=['number'])], axis=1)

# # Display the skewness before and after
# skewness_table = pd.DataFrame({
#     'Skewness Before': skewness_before,
#     'Skewness After': skewness_after
# })

# # Show the skewness table sorted after absolute skewness in descending order
# skewness_table.sort_values(by='Skewness After', ascending=False)


In [302]:
skewness_before[abs(skewness_before) > 0]

Hours_Studied        0.013499
Attendance           0.013666
Sleep_Hours         -0.023805
Previous_Scores     -0.003737
Tutoring_Sessions    0.815530
Physical_Activity   -0.031365
Exam_Score           1.644808
dtype: float64

In [303]:
# Create a histogram with additional styling
fig = px.histogram(
    df, 
    x="Exam_Score", 
    title="Distribution of Median Value of Owner-Occupied Homes",
    nbins=30,  # Adjust the number of bins
    color_discrete_sequence=["#636EFA"]  # Custom color for the bars
)

# Update layout for better aesthetics
fig.update_layout(
    title_font_size=20,  # Increase the font size of the title
    title_x=0.5,  # Center the title
    xaxis_title="Exam Score)",  # Add x-axis label
    yaxis_title="Frequency",  # Add y-axis label
    bargap=0.1,  # Adjust the gap between bars
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    xaxis=dict(
        showgrid=False  # Hide grid lines on the x-axis
    ),
    yaxis=dict(
        showgrid=True,  # Show grid lines on the y-axis
        gridcolor="rgba(200, 200, 200, 0.5)"  # Light gray grid lines
    )
)
fig.show()

#### Duplicates Check

In [304]:
# Check for duplicates in df
duplicate_rows = df[df.duplicated()]
# Display the duplicate rows
print("Duplicate Rows:")
print(duplicate_rows.values)

Duplicate Rows:
[]


#### Saved Cleaned Dataframe

In [305]:
# Save cleaned dataframes to csv files
df.to_csv('./data/processed/data_cleaned.csv', index=False)

## Data Preperation


##### Encoding categorical variables

In [306]:

print(catergorical_features.shape)

df.head()

(13,)


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [307]:
df = pd.get_dummies(df, columns=catergorical_features, drop_first=True)  # drop_first=True to avoid multicollinearity
df.head()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_Low,...,Teacher_Quality_Medium,School_Type_Public,Peer_Influence_Neutral,Peer_Influence_Positive,Learning_Disabilities_Yes,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Distance_from_Home_Moderate,Distance_from_Home_Near,Gender_Male
0,23,84,7,73,0,3,67,True,False,False,...,True,True,False,True,False,True,False,False,True,True
1,19,64,8,59,2,4,61,True,False,False,...,True,True,False,False,False,False,False,True,False,False
2,24,98,7,91,2,4,74,False,True,False,...,True,True,True,False,False,False,True,False,True,True
3,29,89,8,98,1,4,71,True,False,False,...,True,True,False,False,False,True,False,True,False,True
4,19,92,6,65,3,4,70,False,True,False,...,False,True,True,False,False,False,False,False,True,False


##### Train-Test Split

In [308]:
# Assuming 'df' is your DataFrame and 'MEDV' is the target variable
X = df.drop(columns=['Exam_Score'])  # Features
y = df['Exam_Score']  # Target

# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### Feature Engineering

##### Scaling

##### Feature Selection

In [309]:
# Step 1: Fit a RandomForestRegressor to get feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Step 2: Visualize feature importances
importances = rf.feature_importances_

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Visualize the top 20 features
fig = px.bar(
    feature_importance_df.head(20),  # Display the top 20 features
    x="Importance", 
    y="Feature", 
    orientation='h',
    title="Top 20 Feature Importances from RandomForestRegressor",
    color_discrete_sequence=["#636EFA"]
)
fig.update_layout(
    title_font_size=20,
    title_x=0.5,
    xaxis_title="Importance",
    yaxis_title="Feature",
    bargap=0.1,
    plot_bgcolor="#f9f9f9",
    paper_bgcolor="#ffffff",
    font=dict(
        family="Arial, sans-serif",
        size=10,
        color="#333333"
    ),
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=True, gridcolor="rgba(200, 200, 200, 0.5)")
)
fig.update_yaxes(autorange="reversed")
fig.show()

In [310]:
feature_importance_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
1,Attendance,0.380896
0,Hours_Studied,0.242241
3,Previous_Scores,0.092234
4,Tutoring_Sessions,0.039065
2,Sleep_Hours,0.027021
5,Physical_Activity,0.026876
6,Parental_Involvement_Low,0.020537
8,Access_to_Resources_Low,0.017597
9,Access_to_Resources_Medium,0.01118
23,Parental_Education_Level_Postgraduate,0.010913


In [311]:
def select_features(selector, X_train, X_test, y_train):
    """
    Apply a feature selection method to the training and test datasets.
    Parameters:
    - selector: The SelectFromModel object (or any feature selector)
    - X_train: The original training DataFrame before feature selection
    - X_test: The original test DataFrame before feature selection
    - y_train: The target variable for training data (used to fit the selector)
    
    Returns:
    - X_train_selected_df: DataFrame with selected features from the training data
    - X_test_selected_df: DataFrame with selected features from the test data
    """

    # Fit the selector on the training data
    selector.fit(X_train, y_train)

    # Transform both training and test data
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Get the selected feature names
    selected_feature_names = X_train.columns[selector.get_support()]

    # Convert the NumPy arrays back to DataFrames with the original column names
    X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_feature_names, index=X_train.index)
    X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_feature_names, index=X_test.index)

    return X_train_selected_df, X_test_selected_df


# Apply feature selection
selector = SelectFromModel(RandomForestRegressor(random_state=42), threshold=0.005)
X_train_selected, X_test_selected = select_features(selector, X_train, X_test, y_train)

# Get the names of the selected features
selected_features = X_train.columns[selector.get_support()]
print(f"Selected Features: {list(selected_features)}")

# Train the model using only the selected features
model_selected_features = RandomForestRegressor(n_estimators=100, random_state=42)
model_selected_features.fit(X_train_selected, y_train)
y_pred_selected = model_selected_features.predict(X_test_selected)

# Evaluate the model using RMSE with selected features
rmse_selected = np.sqrt(mean_squared_error(y_test, y_pred_selected))

# Comparing the RMSE before and after feature selection
y_pred = rf.predict(X_test) 
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'RMSE without Feature Selection: {rmse}')
print(f"RMSE with Feature Selection: {rmse_selected}")

Selected Features: ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity', 'Parental_Involvement_Low', 'Parental_Involvement_Medium', 'Access_to_Resources_Low', 'Access_to_Resources_Medium', 'Extracurricular_Activities_Yes', 'Motivation_Level_Low', 'Internet_Access_Yes', 'Family_Income_Low', 'Family_Income_Medium', 'Teacher_Quality_Low', 'Teacher_Quality_Medium', 'School_Type_Public', 'Peer_Influence_Neutral', 'Peer_Influence_Positive', 'Learning_Disabilities_Yes', 'Parental_Education_Level_High School', 'Parental_Education_Level_Postgraduate', 'Distance_from_Home_Moderate', 'Distance_from_Home_Near', 'Gender_Male']
RMSE without Feature Selection: 2.2332001488200435
RMSE with Feature Selection: 2.234938782511363


## Initial Model Comparison

##### Model Selection

In [312]:
# Basic model selection for Regression and Classification

# Regressions
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}


##### Cross-Validation:

In [313]:
# Perform cross-validation and store results

results_cv = {}

for name, model in models.items():
    print(f"Evaluating model: {name}")
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    results_cv[name] = [-scores.mean()] + list(-scores)
    print(f"Scores: {list(-scores)}")
    print(f"Mean RMSE: {-scores.mean()}\n")

# Create a DataFrame for the results_cv
results_df = pd.DataFrame(results_cv, index=['Mean'] + [f'Fold {i+1}' for i in range(len(scores))])

# Transpose the DataFrame to get the desired format
results_df = results_df.T

# Display the results table
results_df.sort_values('Mean', ascending=True)


Evaluating model: Linear Regression
Scores: [1.6257756731811603, 1.6246402478210895, 2.0225607783633, 2.1793381424553075, 2.8102475874824204]
Mean RMSE: 2.052512485860656

Evaluating model: Decision Tree
Scores: [3.2697981502281714, 4.001891700271273, 3.927078928671429, 3.608174257969689, 4.350099771580621]
Mean RMSE: 3.8314085617442366

Evaluating model: Random Forest
Scores: [2.082950022132549, 2.2799226945018214, 2.427292409038977, 2.594089617972709, 3.1350649381645965]
Mean RMSE: 2.503863936362131

Evaluating model: Gradient Boosting
Scores: [1.7850583010461014, 1.8446001869566917, 2.1888807096676905, 2.346957362399702, 2.9128318052618782]
Mean RMSE: 2.2156656730664124

Evaluating model: XGBoost
Scores: [2.058293170497538, 2.1583598254575644, 2.4816162950583216, 2.517352492079864, 3.0145676139167743]
Mean RMSE: 2.446037879402012

Evaluating model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_row_

Unnamed: 0,Mean,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
Linear Regression,2.052512,1.625776,1.62464,2.022561,2.179338,2.810248
Gradient Boosting,2.215666,1.785058,1.8446,2.188881,2.346957,2.912832
LightGBM,2.246601,1.840611,1.883716,2.216117,2.337501,2.955062
XGBoost,2.446038,2.058293,2.15836,2.481616,2.517352,3.014568
Random Forest,2.503864,2.08295,2.279923,2.427292,2.59409,3.135065
Decision Tree,3.831409,3.269798,4.001892,3.927079,3.608174,4.3501


In [314]:
# Remove the 'Mean' column to focus only on folds
results_df_no_mean = results_df.drop(columns=['Mean'])

# Initialize a Plotly figure
fig = go.Figure()

# Add a line for each model
for index, row in results_df_no_mean.iterrows():
    fig.add_trace(go.Scatter(
        x=row.index,  # Fold columns
        y=row.values,  # Values for each fold
        mode='lines+markers',
        name=index  # The name of the model
    ))

# Update layout to make the plot more readable
fig.update_layout(
    title="Model Performance Across Folds",
    xaxis_title="Folds",
    yaxis_title="Score",
    legend_title="Models",
    xaxis=dict(type='category'),
)

# Display the plot
fig.show()

In [315]:
results

{'RandomForest': {'best_model': RandomForestRegressor(max_depth=3, random_state=42),
  'best_params': {'bootstrap': True,
   'max_depth': 3,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 100},
  'best_rmse_cv': 2.1312266796970922,
  'test_rmse': 2.082770545298044,
  'y_pred': array([65.00001162, 66.12270301, 70.16085906, ..., 66.57371539,
         67.17938414, 69.89848234])},
 'GradientBoosting': {'best_model': GradientBoostingRegressor(min_samples_leaf=2, n_estimators=200, random_state=42),
  'best_params': {'learning_rate': 0.1,
   'max_depth': 3,
   'min_samples_leaf': 2,
   'min_samples_split': 2,
   'n_estimators': 200,
   'subsample': 1.0},
  'best_rmse_cv': 1.0804761419443305,
  'test_rmse': 0.9612574171308671,
  'y_pred': array([64.78676316, 65.31811274, 70.96112209, ..., 65.97448233,
         64.42743256, 71.38366462])},
 'XGBoost': {'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsam

## Hyperparameter Tuning

##### Tuning with GridSearchCV

In [326]:
from sklearn.linear_model import LinearRegression

# Define the models and their respective parameter grids
models = {
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200], 
            'max_depth': [1, 3], 
            'min_samples_split': [2], 
            'min_samples_leaf': [1, 2], 
            'bootstrap': [True, False]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200], 
            'learning_rate': [0.01, 0.1], 
            'max_depth': [3], 
            'min_samples_split': [2], 
            'min_samples_leaf': [1, 2], 
            'subsample': [1.0]
        }
    },
    'XGBoost': {
        'model': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
        'param_grid': {
            'n_estimators': [100, 200], 
            'learning_rate': [0.01, 0.1], 
            'max_depth': [3], 
            'subsample': [1.0],
            'colsample_bytree': [1.0]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'param_grid': {
            'fit_intercept': [True, False]
        }
    }
}


In [317]:
results = {}

# Perform GridSearchCV for each model
for name, model_info in models.items():
    print(f"Performing GridSearchCV for {name}")
    
    grid_search = GridSearchCV(estimator=model_info['model'], 
                               param_grid=model_info['param_grid'], 
                               scoring=rmse_scorer, 
                               cv=kf, 
                               n_jobs=-1, 
                               verbose=2)
    
    grid_search.fit(X_train, y_train)
    
    # Store the best model and its evaluation results
    best_model = grid_search.best_estimator_
    y_pred_best = best_model.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_best))
    
    results[name] = {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'best_rmse_cv': -grid_search.best_score_,
        'test_rmse': rmse_test
    }

# Debugging: Print the results dictionary to inspect its structure
print("Results dictionary structure:", results)


Performing GridSearchCV for RandomForest
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Performing GridSearchCV for GradientBoosting
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Performing GridSearchCV for XGBoost
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Performing GridSearchCV for LinearRegression
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Results dictionary structure: {'RandomForest': {'best_model': RandomForestRegressor(max_depth=3, min_samples_leaf=2, random_state=42), 'best_params': {'bootstrap': True, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}, 'best_rmse_cv': 2.839581793216111, 'test_rmse': 2.6046240111799244}, 'GradientBoosting': {'best_model': GradientBoostingRegressor(min_samples_leaf=2, n_estimators=200, random_state=42), 'best_params': {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}, 'best_rmse

In [318]:

# Create a DataFrame from the results dictionary
results_df = pd.DataFrame({
    'Model': [name for name in results.keys()],
    'Best Parameters': [result['best_params'] for result in results.values()],
    'CV RMSE': [result['best_rmse_cv'] for result in results.values()],
    'Test RMSE': [result['test_rmse'] for result in results.values()]
})

# Display the DataFrame
results_df = results_df.set_index('Model')
results_df.sort_values(by='CV RMSE', ascending=True)


Unnamed: 0_level_0,Best Parameters,CV RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LinearRegression,{'fit_intercept': True},2.089537,1.804445
XGBoost,"{'colsample_bytree': 1.0, 'learning_rate': 0.1...",2.174851,1.884194
GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'min_sa...",2.179645,1.921994
RandomForest,"{'bootstrap': True, 'max_depth': 3, 'min_sampl...",2.839582,2.604624


## Evaluation

##### Test Set Evaluation


In [319]:
# Step 1 & 2: Fit the best models on the entire training data, make predictions, and store RMSE
for name, result in results.items():
    best_model = models[name]['model'].set_params(**result['best_params'])
    best_model.fit(X_train, y_train)
    
    y_pred = best_model.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    
    results[name]['best_model'] = best_model
    results[name]['y_pred'] = y_pred
    results[name]['test_rmse'] = rmse_test
    
    print(f"RMSE for {name} on the test set: {rmse_test:.4f}")

# Optional: Create a summary DataFrame
results_df = pd.DataFrame({
    'Model': [name for name in results.keys()],
    'Test RMSE': [result['test_rmse'] for result in results.values()]
})

# Display the summary DataFrame
print(results_df.set_index('Model').sort_values(by='Test RMSE', ascending=True))


RMSE for RandomForest on the test set: 2.6046
RMSE for GradientBoosting on the test set: 1.9220
RMSE for XGBoost on the test set: 1.8842
RMSE for LinearRegression on the test set: 1.8044
                  Test RMSE
Model                      
LinearRegression   1.804445
XGBoost            1.884194
GradientBoosting   1.921994
RandomForest       2.604624


In [320]:
# Step 3: Create the Plotly plot with predictions from all models
fig = go.Figure()

# Add traces for each model's predictions
for name, result in results.items():
    fig.add_trace(go.Scatter(
        x=y_test, 
        y=result['y_pred'], 
        mode='markers',
        name=f'{name} Predictions',
        marker=dict(
            size=6,
            line=dict(width=1),
        )
    ))


# Add a diagonal line to indicate where predicted = actual
fig.add_shape(
    type="line",
    x0=min(y_test), x1=max(y_test),
    y0=min(y_test), y1=max(y_test),
    line=dict(color="red", dash="dash")
)

# Update the layout for better aesthetics
fig.update_layout(
    title="Predicted vs. Actual Values",
    xaxis_title="Actual Values",
    yaxis_title="Predicted Values",
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    title_x=0.5,  # Center the title
    font=dict(
        family="Arial, sans-serif",  # Set the font family
        size=14,  # Set the font size
        color="#333333"  # Set the font color
    )
)

# Show the plot
fig.show()

In [341]:
# Get the names of the selected features after feature selection
selected_features = X_train_selected.columns

# Get the importances of the selected features
name = 'GradientBoosting'
best_model = results[name]['best_model']
best_model.fit(X_train_selected, y_train)

importances = best_model.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Create a bar chart for feature importances
fig = px.bar(
    feature_importance_df, 
    x='Importance', 
    y='Feature', 
    orientation='h', 
    title="Feature Importance",
    color_discrete_sequence=["#636EFA"]  # Custom color
)

# Update layout for better aesthetics
fig.update_layout(
    plot_bgcolor="#f9f9f9",  # Set background color for the plot
    paper_bgcolor="#ffffff",  # Set background color for the entire figure
    title_x=0.5,  # Center the title
    font=dict(
        family="Arial, sans-serif",  # Set the font family
        size=14,  # Set the font size
        color="#333333"  # Set the font color
    ),
    yaxis=dict(
        categoryorder='total ascending'  # Order the features by importance
    )
)

fig.show()

In [342]:
best_model = models[name]['model']
best_model

In [338]:
results[name]['best_model']

results[name]

{'best_model': GradientBoostingRegressor(min_samples_leaf=2, n_estimators=200, random_state=42),
 'best_params': {'learning_rate': 0.1,
  'max_depth': 3,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 200,
  'subsample': 1.0},
 'best_rmse_cv': 2.1796446781510648,
 'test_rmse': 1.9219941932866427,
 'y_pred': array([64.65606853, 65.44401485, 71.45690647, ..., 65.56640831,
        65.05685611, 71.7157861 ])}

## Interpretation and Next Steps