# Model Training

In [22]:
# Set seed for randomization

SEED = 12344321

## 1.1 Import required packages

In [36]:
# Basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

# Modeling
# Preprocessing and encoding
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Data split
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# performance Metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Models - Simple
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Models - ensemble and boosting
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Misc
import warnings


## 1.2 - Import Dataset

Validations completed in EDA. No need to re verify. Can add required changes in preprocessing steps if required

In [24]:
raw_data = pd.read_csv('data/data.csv')

#### Top 10 rows

In [25]:
raw_data.head(10)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


#### Bottom 10 rows

In [26]:
raw_data.tail(10)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
990,male,group E,high school,free/reduced,completed,86,81,75
991,female,group B,some high school,standard,completed,65,82,78
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


### 1.3 Split data into features and target variable sets

Target selected is writing score

In [27]:
X = raw_data.drop(columns = ['writing_score', 'reading_score','math_score'])
y = raw_data.loc[:,['writing_score']]

X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [28]:
y.head()

Unnamed: 0,writing_score
0,74
1,88
2,93
3,44
4,75


## 2.0 - Transform columns

We will make a column transformer with 3 transformers for each type of columns available

In [29]:
num_feats = X.select_dtypes(exclude = "object").columns
cat_feats = X.select_dtypes(include = "object").columns

num_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("StandardScaler",num_transformer, num_feats),
        ("OneHotEncoder",ohe_transformer, cat_feats)
    ]
)

In [30]:
X = preprocessor.fit_transform(X)


## 3.0 - Split into train and test sets

In [31]:
# Validate if y is an array or dataframe and convert it
if type(y) is pd.DataFrame:
    y = y.to_numpy()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state = SEED)

X_train.shape, X_test.shape

((800, 17), (200, 17))

## 4.0 - Create model evalutation function

In [32]:
def eval_model(actuals, preds):
    '''
    Quick function to give a variety of scoring matrics quickly
    '''
    mae = mean_absolute_error(actuals, preds)
    mse = mean_squared_error(actuals, preds)
    rmse = np.sqrt(mse)
    r2_square = r2_score(actuals, preds)

    return mae, mse, rmse, r2_square

In [43]:
# List down all models we will check
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_score_train_list = []
r2_score_test_list = []

In [44]:
# Establish baseline using mean score
model_name = "Baseline mean predictions"
base_pred_train = np.ones(y_train.shape)*np.mean(y_train)
base_pred_test = np.ones(y_test.shape)*np.mean(y_train)

train_mae, train_mse, train_rmse, train_r2_score = eval_model(y_train,base_pred_train)
test_mae, test_mse, test_rmse, test_r2_score = eval_model(y_test,base_pred_test)

model_list.append(model_name)
r2_score_train_list.append(train_r2_score)
r2_score_test_list.append(test_r2_score)

# Check multiple models

for i,model_name in enumerate(models):
    model = models[model_name]
    # Train model
    model.fit(X_train, np.ravel(y_train))

    # Generate predictions
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # Get all metrics

    train_mae, train_mse, train_rmse, train_r2_score = eval_model(y_train,pred_train)
    test_mae, test_mse, test_rmse, test_r2_score = eval_model(y_test,pred_test)
    
    model_list.append(model_name)
    r2_score_train_list.append(train_r2_score)
    r2_score_test_list.append(test_r2_score)
    
    # Print stats for the models
    print(model_name)
    print("Training set performance:")
    print("MAE: {:.3f}".format(train_mae))
    print("RMSE: {:.3f}".format(train_rmse))
    print("R2 score: {:.3f}".format(train_r2_score))

    
    print("\nTest set performance:")
    print("MAE: {:.3f}".format(test_mae))
    print("RMSE: {:.3f}".format(test_rmse))
    print("R2 score: {:.3f}".format(test_r2_score))
    
    print("=+="*20)
    print("\n")
    

Linear Regression
Training set performance:
MAE: 9.829
RMSE: 12.196
R2 score: 0.353

Test set performance:
MAE: 10.744
RMSE: 13.273
R2 score: 0.247
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


Lasso
Training set performance:
MAE: 10.731
RMSE: 13.438
R2 score: 0.214

Test set performance:
MAE: 10.887
RMSE: 13.789
R2 score: 0.188
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


Ridge
Training set performance:
MAE: 9.829
RMSE: 12.196
R2 score: 0.353

Test set performance:
MAE: 10.739
RMSE: 13.268
R2 score: 0.248
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


K-Neighbors Regressor
Training set performance:
MAE: 9.300
RMSE: 11.649
R2 score: 0.410

Test set performance:
MAE: 11.715
RMSE: 14.608
R2 score: 0.088
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


Decision Tree
Training set performance:
MAE: 8.431
RMSE: 10.811
R2 score: 0.491

Test set performance:
MAE: 12.154
RMSE: 14.997
R2 score: 0.039
=+==+==+==+==+==+==+==+==+==+==

### 4.1 Pretty print outputs

In [45]:
results = pd.DataFrame(list(zip(model_list, r2_score_train_list, r2_score_test_list)), columns=["Model name","R2 Score - Training", "R2 Score - Testing"])
results.sort_values(by=['R2 Score'], ascending=False)

Unnamed: 0,Model name,R2 Score
2,Lasso,0.247777
0,Baseline mean predictions,0.247256
1,Linear Regression,0.187574
8,CatBoosting Regressor,0.18121
3,Ridge,0.088226
5,Decision Tree,0.07941
7,XGBRegressor,0.050002
4,K-Neighbors Regressor,0.040613
6,Random Forest Regressor,0.01733


In [None]:
list(zip(model_list, r2_score_list))
