In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#modeling
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('data/stud.csv')

In [8]:
x = df.drop(columns=['math_score'],axis=0)
x.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [10]:
y= df['math_score']
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [11]:
# create column and transform

num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_trans=StandardScaler()
oh_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_encoder,cat_features),
        ("StandardScaler",numerical_trans,num_features)
    ]
)



In [12]:
x = preprocessor.fit_transform(x)

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)


In [22]:
def evaluate_model(true, pred):
    r2 = r2_score(true, pred)
    mse = mean_squared_error(true, pred)
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mse)
    return r2, mse, mae, rmse


In [23]:
models={
    "KNN":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Random Forest":RandomForestRegressor(),
    "AdaBoost":AdaBoostRegressor(),
    "SVR":SVR(),
    "Linear Regression":LinearRegression(),
    "Ridge Regression":Ridge(),
    "Lasso Regression":Lasso(),
    "CatBoost Regressor":CatBoostRegressor(verbose=0),
    "XGBoost Regressor":XGBRegressor()
}

model_list =[]
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    model_train_mae,model_train_rmse,model_train_mse,model_train_r2 = evaluate_model(y_train,y_train_pred)


    model_test_mae,model_test_rmse,model_test_mse,model_test_r2 = evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("model performance for training set:")
    print("root mean squared error:",model_train_rmse)
    print("mean absolute error:",model_train_mae)
    print("r2 score:",model_train_r2)

    print("---------------------------------------------")
    print("model performance for testing set:")
    print("root mean squared error:",model_test_rmse)
    print("mean absolute error:",model_test_mae)
    print("r2 score:",model_test_r2)

    r2_list.append(model_test_r2)

    print("="*35)
    print("\n")

KNN
model performance for training set:
root mean squared error: 32.57765
mean absolute error: 0.8554978341651085
r2 score: 5.707683417990174
---------------------------------------------
model performance for testing set:
root mean squared error: 52.6066
mean absolute error: 0.7838129945787431
r2 score: 7.253040741647602


Decision Tree
model performance for training set:
root mean squared error: 0.078125
mean absolute error: 0.9996534669718089
r2 score: 0.2795084971874737
---------------------------------------------
model performance for testing set:
root mean squared error: 60.12
mean absolute error: 0.7529366511820575
r2 score: 7.753708789992051


Random Forest
model performance for training set:
root mean squared error: 5.327304531046272
mean absolute error: 0.9763700867681304
r2 score: 2.308095433695555
---------------------------------------------
model performance for testing set:
root mean squared error: 37.01194377246315
mean absolute error: 0.8478992885115413
r2 score: 6.08

In [28]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model','mean_squared_error']).sort_values(by='mean_squared_error',ascending=False)

Unnamed: 0,Model,mean_squared_error
4,SVR,8.126623
1,Decision Tree,7.753709
0,KNN,7.253041
7,Lasso Regression,6.519695
9,XGBoost Regressor,6.473307
2,Random Forest,6.083744
3,AdaBoost,6.028683
8,CatBoost Regressor,6.008632
5,Linear Regression,5.393994
6,Ridge Regression,5.390387


In [26]:
lin_model = LinearRegression()
lin_model.fit(x_train,y_train)
y_pred = lin_model.predict(x_test)
score = r2_score(y_test,y_pred)
print(score)

0.8804332983749565


In [30]:
pred_df = pd.DataFrame({'Actual':y_test,'Predicted':y_pred,'Difference':y_test-y_pred})
pred_df.head()

Unnamed: 0,Actual,Predicted,Difference
521,91,76.38797,14.61203
737,53,58.88597,-5.88597
740,80,76.990265,3.009735
660,74,76.851804,-2.851804
411,84,87.627378,-3.627378
