# Model Training

## 1.1 Import required packages

In [86]:
# Basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

# Modelling
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Misc
import warnings


## 1.2 - Import Dataset

In [87]:
df = pd.read_csv('data/data.csv')

#### Top 10 records

In [88]:
df.head(10)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


#### Botton 10 entries

In [89]:
df.tail(10)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
990,male,group E,high school,free/reduced,completed,86,81,75
991,female,group B,some high school,standard,completed,65,82,78
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


### 1.3 Split data into X and y sets

Lets predict writing score

In [90]:
X = df.drop(columns = ['writing_score'])
y = df.loc[:,['writing_score']]

X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score
0,female,group B,bachelor's degree,standard,none,72,72
1,female,group C,some college,standard,completed,69,90
2,female,group B,master's degree,standard,none,90,95
3,male,group A,associate's degree,free/reduced,none,47,57
4,male,group C,some college,standard,none,76,78


In [91]:
y.head()

Unnamed: 0,writing_score
0,74
1,88
2,93
3,44
4,75


## 2.0 - Transform columns

We will make a column transformer with 3 transformers for each type of columns available

In [92]:
num_feats = X.select_dtypes(exclude = "object").columns
cat_feats = X.select_dtypes(include = "object").columns

num_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer, cat_feats),
        ("StandardScaler",num_transformer, num_feats)
    ]
)

In [93]:
X = preprocessor.fit_transform(X)


## 3.0 - Split into train and test sets

In [94]:
# Validate if y is an array or dataframe and convert it
if type(y) is pd.DataFrame:
    y = y.to_numpy()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state = 100)

X_train.shape, X_test.shape

((800, 19), (200, 19))

## 4.0 - Create model evalutation function

In [95]:
def eval_model(actuals, preds):
    '''
    Quick function to give a variety of scoring matrics quickly
    '''
    mae = mean_absolute_error(actuals, preds)
    mse = mean_squared_error(actuals, preds)
    rmse = np.sqrt(mse)
    r2_square = r2_score(actuals, preds)

    return mae, rmse, r2_square

In [96]:
# List down all models we will check
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_score_list = []

In [97]:
# Check multiple models

for i in range(len(list(models))):
    model = list(models.values())[i]

    # Train model
    model.fit(X_train, np.ravel(y_train))

    # Generate predictions
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # Get all metrics

    train_mae, train_rmse, train_r2_score = eval_model(y_train,pred_train)
    test_mae, test_rmse, test_r2_score = eval_model(y_test,pred_test)
    
    model_list.append(list(models.keys())[i])
    r2_score_list.append(test_r2_score)
    
    # Print stats for the models
    print(list(models.keys())[i])
    print("Train set performance:")
    print("MAE: {:.3f}".format(train_mae))
    print("RMSE: {:.3f}".format(train_rmse))
    print("R2 score: {:.3f}".format(train_r2_score))

    
    print("\nTest set performance:")
    print("MAE: {:.3f}".format(test_mae))
    print("RMSE: {:.3f}".format(test_rmse))
    print("R2 score: {:.3f}".format(test_r2_score))
    
    print("=+="*20)
    print("\n")
    

Linear Regression
Train set performance:
MAE: 2.816
RMSE: 3.467
R2 score: 0.948

Test set performance:
MAE: 2.880
RMSE: 3.614
R2 score: 0.942
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


Lasso
Train set performance:
MAE: 3.674
RMSE: 4.564
R2 score: 0.910

Test set performance:
MAE: 3.797
RMSE: 4.747
R2 score: 0.900
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


Ridge
Train set performance:
MAE: 2.807
RMSE: 3.451
R2 score: 0.948

Test set performance:
MAE: 2.825
RMSE: 3.550
R2 score: 0.944
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


K-Neighbors Regressor
Train set performance:
MAE: 3.392
RMSE: 4.210
R2 score: 0.923

Test set performance:
MAE: 3.900
RMSE: 4.818
R2 score: 0.897
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=


Decision Tree
Train set performance:
MAE: 0.015
RMSE: 0.224
R2 score: 1.000

Test set performance:
MAE: 4.095
RMSE: 5.233
R2 score: 0.878
=+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+==+=



### 4.1 Pretty print outputs

In [98]:
results = pd.DataFrame(list(zip(model_list, r2_score_list)), columns=["Model name","R2 Score"])
results.sort_values(by=['R2 Score'], ascending=False)

Unnamed: 0,Model name,R2 Score
2,Ridge,0.944052
0,Linear Regression,0.942013
7,CatBoosting Regressor,0.938898
5,Random Forest Regressor,0.928479
6,XGBRegressor,0.920775
8,AdaBoost Regressor,0.916015
1,Lasso,0.89997
3,K-Neighbors Regressor,0.896942
4,Decision Tree,0.878423


In [99]:
list(zip(model_list, r2_score_list))


[('Linear Regression', 0.9420134271572104),
 ('Lasso', 0.8999699679537821),
 ('Ridge', 0.944052033567275),
 ('K-Neighbors Regressor', 0.8969418084551437),
 ('Decision Tree', 0.8784226240024862),
 ('Random Forest Regressor', 0.9284790754861011),
 ('XGBRegressor', 0.9207750336129594),
 ('CatBoosting Regressor', 0.9388984118915205),
 ('AdaBoost Regressor', 0.9160150753332332)]