In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as numpy
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [4]:
data=pd.read_csv('StudentsPerformance.csv')
data.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [5]:
data['total score']=data['math score']+data['reading score']+data['writing score']
data['avg score']=data['total score']/3

In [6]:
x=data.drop(['total score','math score','avg score'],axis=1)
y=data['math score']

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

COLUMN TRANSFORMER

In [8]:
cat=[f for f in data.columns if data[f].dtype=="object"]
num=[f for f in data.columns if data[f].dtype!="object"]

In [9]:
num.remove('total score')
num.remove('math score')
num.remove('avg score')

In [10]:
preprocessor=ColumnTransformer(
    [
        ('one hot',OneHotEncoder(),cat),
        ('scaler',StandardScaler(),num)
    ],remainder="passthrough"
)

In [11]:
x_train=preprocessor.fit_transform(x_train)
x_test=preprocessor.transform(x_test)

MODEL

In [12]:
models={
    'Linear Regression':LinearRegression(),
    'Ridge':Ridge(),
    'lasso':Lasso(),
    'SVR':SVR(),
    'K neighburs':KNeighborsRegressor(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'adaboost':AdaBoostRegressor(),
    'gradient boost':GradientBoostingRegressor(),
    'xgboost':XGBRegressor()
}

In [13]:
def evaluate_model(test,predicted):
    mae=mean_absolute_error(test,predicted)
    mse=mean_squared_error(test,predicted)
    r2=r2_score(test,predicted)
    
    return mae,mse,r2

In [14]:
for name,model in models.items():
    model.fit(x_train,y_train)
    x_ptrain=model.predict(x_train)
    x_ptest=model.predict(x_test)
    
    print(f'---------{name}---------')
    maetrain,msetrain,r2train=evaluate_model(y_train,x_ptrain)
    maetest,msetest,r2test=evaluate_model(y_test,x_ptest)
    print('Training Data Accuracy Report')
    print(f'MAE-{maetrain}')
    print(f'MSE-{msetrain}')
    print(f'R2 score-{r2train}')
    print('*'*30)
    print('Testing Data Accuracy Report')
    print(f'MAE-{maetest}')
    print(f'MSE-{msetest}')
    print(f'R2 score-{r2test}')
    print('='*30)
    print('\n')

---------Linear Regression---------
Training Data Accuracy Report
MAE-4.241875
MSE-28.089276692708335
R2 score-0.874145666862887
******************************
Testing Data Accuracy Report
MAE-4.3243125
MSE-29.7690146484375
R2 score-0.8789994007388723


---------Ridge---------
Training Data Accuracy Report
MAE-4.236869363734474
MSE-28.06398850144637
R2 score-0.874258970899952
******************************
Testing Data Accuracy Report
MAE-4.3354506524397145
MSE-30.019015966746583
R2 score-0.877983233099845


---------lasso---------
Training Data Accuracy Report
MAE-5.1795714149742285
MSE-42.862504500584855
R2 score-0.8079540466804563
******************************
Testing Data Accuracy Report
MAE-5.218411335948426
MSE-44.22324433196245
R2 score-0.8202480287428799


---------SVR---------
Training Data Accuracy Report
MAE-4.905780174943105
MSE-44.290965830373715
R2 score-0.8015538089655635
******************************
Testing Data Accuracy Report
MAE-5.589548575227139
MSE-67.8079393567

HYPERPARAMETER TUNING

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
tuning=[
    ('Ridge',Ridge(),{'alpha':[0.01,0.1,1,10,100]}),

    ('Lasso',Lasso(),{'alpha':[0.001,0.01,0.1,1,10],
                      'max_iter':[5000,10000]}),

    ('SVR',SVR(),{'C':[0.1,1,10,100],
                  'kernel':['linear','rbf'],
                  'gamma':['scale','auto'],
                  'epsilon':[0.1,0.2,0.5]}),

    ('KNN',KNeighborsRegressor(),{'n_neighbors':[3,5,7,9,11],
                                  'weights':['uniform','distance'],
                                  'metric':['euclidean','manhattan','minkowski'],
                                  'p':[1,2]}),

    ('Decision Tree',DecisionTreeRegressor(),{'criterion':['squared_error','absolute_error'],
                                              'max_depth':[None,5,10,20,30],
                                              'min_samples_leaf':[1,2,4],
                                              'min_samples_split':[2,5,10]}),

    ('Random Forest',RandomForestRegressor(),{'n_estimators':[100,200,500],
                                              'max_depth':[None,5,10,20],
                                              'min_samples_leaf':[1,2,4],
                                              'min_samples_split':[2,5,10]}),

    ('Adaboost',AdaBoostRegressor(),{'n_estimators':[50,100,200],
                                     'learning_rate':[0.01,0.1,1.0]}),

    ('Gradient Boost',GradientBoostingRegressor(),{'n_estimators':[50,100,200],
                                                   'learning_rate':[0.01,0.1,0.2],
                                                   'max_depth':[3,5],
                                                   'subsample':[0.8,1.0]}),

    ('XG Boost',XGBRegressor(),{'n_estimators':[100,300,500],
                                'learning_rate':[0.01,0.05,0.1],
                                'max_depth':[3,5,7],
                                'subsample':[0.8,1.0],
                                'colsample_bytree':[0.8,1.0],
                                'gamma':[0,0.1,0.3],
                                'reg_alpha':[0,0.1,1],
                                'reg_lambda':[1,1.5,2]})
]


In [17]:
for name,model,params in tuning:
    m=RandomizedSearchCV(estimator=model,param_distributions=params,cv=5,n_jobs=-1,n_iter=20,refit=True,verbose=True)
    m.fit(x_train,y_train)
    x_ptrain=m.predict(x_train)
    x_ptest=m.predict(x_test)
    
    print(f'-------hyperparameter tuned {name}--------')
    maetrain,msetrain,r2train=evaluate_model(y_train,x_ptrain)
    maetest,msetest,r2test=evaluate_model(y_test,x_ptest)
    print('Training Data Accuracy Report')
    print(f'MAE-{maetrain}')
    print(f'MSE-{msetrain}')
    print(f'R2 score-{r2train}')
    print('*'*30)
    print('Testing Data Accuracy Report')
    print(f'MAE-{maetest}')
    print(f'MSE-{msetest}')
    print(f'R2 score-{r2test}')
    print('='*30)
    print('\n')



Fitting 5 folds for each of 5 candidates, totalling 25 fits
-------hyperparameter tuned Ridge--------
Training Data Accuracy Report
MAE-4.236869363734474
MSE-28.06398850144637
R2 score-0.874258970899952
******************************
Testing Data Accuracy Report
MAE-4.3354506524397145
MSE-30.019015966746583
R2 score-0.877983233099845


Fitting 5 folds for each of 10 candidates, totalling 50 fits
-------hyperparameter tuned Lasso--------
Training Data Accuracy Report
MAE-4.237427585626363
MSE-28.07120380602613
R2 score-0.8742266426432936
******************************
Testing Data Accuracy Report
MAE-4.330382219090673
MSE-29.921764995926495
R2 score-0.8783785241730272


Fitting 5 folds for each of 20 candidates, totalling 100 fits




-------hyperparameter tuned SVR--------
Training Data Accuracy Report
MAE-4.221864796724045
MSE-28.259539679932068
R2 score-0.8733828015549121
******************************
Testing Data Accuracy Report
MAE-4.337817305920536
MSE-29.832101702105625
R2 score-0.8787429740015547


Fitting 5 folds for each of 20 candidates, totalling 100 fits
-------hyperparameter tuned KNN--------
Training Data Accuracy Report
MAE-0.020000080680782104
MSE-0.08333333333375345
R2 score-0.9996266240241944
******************************
Testing Data Accuracy Report
MAE-5.481829424992571
MSE-49.17645092067331
R2 score-0.8001149819297353


Fitting 5 folds for each of 20 candidates, totalling 100 fits
-------hyperparameter tuned Decision Tree--------
Training Data Accuracy Report
MAE-4.581549377540993
MSE-32.375244835823764
R2 score-0.8549423364104967
******************************
Testing Data Accuracy Report
MAE-5.080323339732309
MSE-43.15193951445192
R2 score-0.8246025069290402


Fitting 5 folds for each of 20



-------hyperparameter tuned Adaboost--------
Training Data Accuracy Report
MAE-4.627116886986091
MSE-32.286841465896615
R2 score-0.8553384287446273
******************************
Testing Data Accuracy Report
MAE-4.7588539902208975
MSE-37.57615552929785
R2 score-0.8472661124101727


Fitting 5 folds for each of 20 candidates, totalling 100 fits
-------hyperparameter tuned Gradient Boost--------
Training Data Accuracy Report
MAE-4.006534408447006
MSE-24.8086744878248
R2 score-0.888844443456552
******************************
Testing Data Accuracy Report
MAE-4.382841294460031
MSE-31.649757960096327
R2 score-0.8713548390879539


Fitting 5 folds for each of 20 candidates, totalling 100 fits
-------hyperparameter tuned XG Boost--------
Training Data Accuracy Report
MAE-3.521365338643392
MSE-19.301156631170368
R2 score-0.9135209417043662
******************************
Testing Data Accuracy Report
MAE-4.421558628082275
MSE-31.666488774163142
R2 score-0.8712868341992426




LINEAR REGRESSION IS SELECTED

In [35]:
d={
    'a':1,
    'b':11,
    'c':3
}

In [36]:
s=max(sorted(list(d.values())))

In [43]:
list(d.keys())[list(d.values()).index(s)]

'b'

In [46]:
d

{'a': 1, 'b': 11, 'c': 3}