In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from sklearn.neighbors import KNeighborsRegressor  
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv('stud.csv')
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
# We will predict the reading score based on the other features
X = df.drop(columns=['reading_score'])
X

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,69,88
2,female,group B,master's degree,standard,none,90,93
3,male,group A,associate's degree,free/reduced,none,47,44
4,male,group C,some college,standard,none,76,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,95
996,male,group C,high school,free/reduced,none,62,55
997,female,group C,high school,free/reduced,completed,59,65
998,female,group D,some college,standard,completed,68,77


In [5]:
# Since reading score is our target variable we will make it our y
y = df['reading_score']
y

0      72
1      90
2      95
3      57
4      78
       ..
995    99
996    55
997    71
998    78
999    86
Name: reading_score, Length: 1000, dtype: int64

In [8]:
# For preprocessing we will one hote encode categorical features and scale numerical ones
num_feat = [feature for feature in X.columns if X[feature].dtype != 'O'] # features that are not object data types
cat_feat = [feature for feature in X.columns if X[feature].dtype == 'O'] # features that are object data types

In [9]:
num_feat

['math_score', 'writing_score']

In [10]:
cat_feat

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [12]:
num_trans = StandardScaler() # scale numerical features
cat_trans = OneHotEncoder() # one hot encode categorical features

preprocessor = ColumnTransformer(
    transformers=[
        ("StandardScaler", num_trans, num_feat),
        ("OneHotEncoder", cat_trans, cat_feat)
    ]
)

In [13]:
# Preprocess the data
X = preprocessor.fit_transform(X)
X

array([[ 0.39002351,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.19207553,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.57771141,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.46775108, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.12609287,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.71993682,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]], shape=(1000, 19))

In [14]:
# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train

array([[-0.2698031 ,  0.45733301,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [-0.00587246,  0.98406266,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.85190214,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.86364705, -0.99117351,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [-1.06159503, -0.99117351,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.2477981 ,  1.37910989,  1.        , ...,  1.        ,
         0.        ,  1.        ]], shape=(800, 19))

In [17]:
model_list = [
    LinearRegression(),
    Ridge(),
    Lasso(),    
    KNeighborsRegressor(),
    DecisionTreeRegressor(),    
    RandomForestRegressor()
]

In [20]:
metrics = []

for model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    metrics.append({
        'Model': f'{model}',
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r2
    })
metrics_df = pd.DataFrame(metrics)

In [21]:
metrics_df

Unnamed: 0,Model,MSE,RMSE,MAE,R2 Score
0,LinearRegression(),18.5656,4.308782,3.490803,0.917955
1,Ridge(),18.513734,4.302759,3.486115,0.918184
2,Lasso(),21.483165,4.634994,3.694106,0.905061
3,KNeighborsRegressor(),36.9074,6.075146,4.679,0.836898
4,DecisionTreeRegressor(),39.31,6.269769,4.99,0.826281
5,RandomForestRegressor(),20.216168,4.496239,3.662,0.91066


#### Since we are looking at predicting a continuous variable, we will use R2 as our metric for best accuracy/fit.
#### Higher R2 means a better fit and in our case the Ridge regressor has the highest R2 score.
#### Given that linear regression is also very close in terms of R2, and it is usually an easier model to interpret and explain to stakeholders, we will choose linear regression

In [22]:
# Linear Regression Model
selected_model = LinearRegression()
selected_model.fit(X_train, y_train)
y_pred = selected_model.predict(X_test)

In [23]:
# Deeper look at the model predictions
pred_df=pd.DataFrame({'True Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,True Value,Predicted Value,Difference
521,86,86.171818,-0.171818
737,66,71.366283,-5.366283
740,73,72.353713,0.646287
660,77,75.897002,1.102998
411,83,78.364897,4.635103
...,...,...,...
408,57,56.258870,0.741130
332,56,54.225431,1.774569
208,81,78.545888,2.454112
613,77,74.284418,2.715582


#### We can now use these predictions to tease out the students who may need more supports for their reading tests so that they can perform better 