In [2]:
# Importing core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing scikit-learn libraries
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

# Supervised Learning
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier, XGBRegressor

# Importing utilities
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv("./data/CleanedStudentPerformance.csv")
df = pd.DataFrame(data)

df.head()

Unnamed: 0.1,Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average
0,0,female,group B,bachelor's degree,standard,uncompleted,72,72,74,218,72.666667
1,1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,2,female,group B,master's degree,standard,uncompleted,90,95,93,278,92.666667
3,3,male,group A,associate's degree,free/reduced,uncompleted,47,57,44,148,49.333333
4,4,male,group C,some college,standard,uncompleted,76,78,75,229,76.333333


x = independent features

y = dependent or output feature

In [None]:
x = df.drop(['math score'],axis=1)
y = df['math score']

In [None]:
categorical_features = []
numerical_features = []
for column in x.columns:
    if(df[column].dtype=='object'):
        categorical_features.append(column)
    else :
        numerical_features.append(column)

numerical_features
categorical_features

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course']

In [None]:
onehot_encoder = OneHotEncoder()
standarization = StandardScaler()

numerical_features

['reading score', 'writing score', 'total score', 'average']

In [None]:
preprocessor = ColumnTransformer([
    ('OneHotEncoder',onehot_encoder,categorical_features),
    ('standarization',standarization,numerical_features),
])

In [None]:
x=preprocessor.fit_transform(x)
x.shape

(1000, 21)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
x_test.shape

(200, 21)

In [None]:
def analyse_performance(y_test,y_pred):
    r2 = r2_score(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)

    print("R2 Score : ",r2)
    print("Mean Absolute Error : ",mae)
    print("Mean Squared Error : ",mse)
    return r2

In [None]:
models = {
    "linearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "ElasticNet":ElasticNet(),
    "logisticRegression":LogisticRegression(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "SVR":SVR(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "GaussianNB":GaussianNB(),
    "BernoulliNB":BernoulliNB(),
    "XGBRegressor":XGBRegressor(),
}

In [None]:
model_name = []
train_r2_score = []
test_r2_score = []

In [None]:
for modelname in models.keys():
    print(modelname)
    model_name.append(modelname)
    model = models[modelname]

    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    print("train data:")
    r2_train = analyse_performance(y_train, y_train_pred)
    train_r2_score.append(r2_train)
    print("\n")

    print("test data:")
    y_test_pred = model.predict(x_test)
    r2_test = analyse_performance(y_test, y_test_pred)
    test_r2_score.append(r2_test)
    print("\n")

print("Train R2 Scores:", train_r2_score)
print("Test R2 Scores:", test_r2_score)

linearRegression
train data:
R2 Score :  1.0
Mean Absolute Error :  1.2800427384718205e-13
Mean Squared Error :  2.5203285506469897e-26


test data:
R2 Score :  1.0
Mean Absolute Error :  1.2789769243681803e-13
Mean Squared Error :  2.630377802368942e-26


Ridge
train data:
R2 Score :  0.9994884476076243
Mean Absolute Error :  0.27244149420391794
Mean Squared Error :  0.11532820078639502


test data:
R2 Score :  0.9995159680022118
Mean Absolute Error :  0.26842795106647477
Mean Squared Error :  0.1177835718905797


Lasso
train data:
R2 Score :  0.9004238119887846
Mean Absolute Error :  3.743856971975057
Mean Squared Error :  22.44920124636832


test data:
R2 Score :  0.9085602868908693
Mean Absolute Error :  3.7578731421148284
Mean Squared Error :  22.250793484434897


ElasticNet
train data:
R2 Score :  0.8408375766006
Mean Absolute Error :  4.75356182118061
Mean Squared Error :  35.88276821111466


test data:
R2 Score :  0.8471677389976175
Mean Absolute Error :  4.906425886824505
Mean

In [None]:
performance = pd.DataFrame({"model_name":model_name,"train_r2_score":train_r2_score,"test_r2_score":test_r2_score})
performance.sort_values(by="test_r2_score",ascending=False)

Unnamed: 0,model_name,train_r2_score,test_r2_score
0,linearRegression,1.0,1.0
1,Ridge,0.999488,0.999516
12,XGBRegressor,0.999944,0.977101
13,CatBoostRegressor,0.996057,0.9681
6,RandomForestRegressor,0.994519,0.963913
5,DecisionTreeRegressor,1.0,0.936899
7,AdaBoostRegressor,0.932632,0.926474
2,Lasso,0.900424,0.90856
9,KNeighborsRegressor,0.918781,0.889781
3,ElasticNet,0.840838,0.847168
