In [3]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [7]:
# Load the student-performance dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv('stud.csv')

In [8]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Feature matrix: every column except the regression target.
X = df.drop(columns=['math_score'])

In [9]:
# Regression target: the math score.
y = df['math_score']

In [10]:
# Print the distinct values of each categorical column. Headings are kept
# verbatim (including their exact spacing) so the printed output is unchanged.
category_headings = [
    ('gender', 'categories in gender are '),
    ('race_ethnicity', 'categories in race_ethnicity are '),
    ('parental_level_of_education', 'categories in parental_level_of_education are '),
    ('lunch', 'categories in lunch are '),
    ('test_preparation_course', 'categories in test_preparation_course  are '),
]
for column, heading in category_headings:
    print(heading)
    print(df[column].unique())



categories in gender are 
['female' 'male']
categories in race_ethnicity are 
['group B' 'group C' 'group A' 'group D' 'group E']
categories in parental_level_of_education are 
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
categories in lunch are 
['standard' 'free/reduced']
categories in test_preparation_course  are 
['none' 'completed']


In [14]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

# Transformer names are kept exactly as originally spelled, since other cells
# may refer to them.
numeric_transfomer = StandardScaler()
oh_transformer = OneHotEncoder()

# One-hot encode the categorical columns and standardize the numeric ones.
# The transformed output lists the encoded columns first, then the scaled ones.
preprocessor = ColumnTransformer(
    [
        ("oneHotEncoder", oh_transformer, cat_features),
        ('standardScaler', numeric_transfomer, num_features),
    ]
)

In [15]:
# Fit the preprocessor and replace X with the encoded/scaled feature matrix.
# NOTE(review): X is rebound here, so re-running this cell on its own would
# feed the already-transformed matrix back into the preprocessor; re-run the
# cell that builds X from df first. The preprocessor is also fitted on every
# row before the train/test split further down, so held-out rows contribute
# to the scaling statistics -- consider fitting on the training split only.
X = preprocessor.fit_transform(X)

In [16]:
# Check the dimensions (rows, columns) of the transformed feature matrix.
X.shape

(1000, 19)

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffled split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [21]:
def evaluated_model(true, predicted):
    """Score a set of predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error,
        and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE is computed once here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

In [32]:
# Seed for the randomized scikit-learn estimators so that re-running this cell
# reproduces the same scores; same value as the train/test split seed.
RANDOM_STATE = 42

# Candidate regressors, keyed by the name shown in the printed log and in the
# result tables.
models = {
    'AdaBoostRegressor': AdaBoostRegressor(random_state=RANDOM_STATE),
    'CatBoostRegressor': CatBoostRegressor(verbose=False),
    'LinearRegression': LinearRegression(),
    "Ridge": Ridge(),
    'KneighborsRegressor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'XGBRegressor': XGBRegressor(),
    'Lasso': Lasso(),
}

# Collected per model, in the iteration order of `models`.
model_list = []
r2_list = []
rmse_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so the train and test scores can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Score the predictions.
    train_mae, train_rmse, train_r2 = evaluated_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluated_model(y_test, y_test_pred)

    print(name)
    print(f"model performance for training set : model: {name} , mae :{train_mae} , rmse: {train_rmse} , r2 : {train_r2}")
    print(f"model performance for test set : model: {name}, mae :{test_mae} , rmse: {test_rmse} , r2 : {test_r2}")

    # Only the test-set metrics feed the comparison tables.
    model_list.append(name)
    r2_list.append(test_r2)
    rmse_list.append(test_rmse)

AdaBoostRegressor
model performance for training set : model: AdaBoostRegressor , mae :4.8139942479681395 , rmse: 5.878283576951116 , r2 : 0.8467305249940443
mdoel performance for test set : model: AdaBoostRegressor, mae :4.8210864161379705 , rmse: 6.167885824950527 , r2 : 0.8436629173627112
CatBoostRegressor
model performance for training set : model: CatBoostRegressor , mae :2.405393926779502 , rmse: 3.042664195326799 , r2 : 0.9589358676277713
mdoel performance for test set : model: CatBoostRegressor, mae :4.612531714976557 , rmse: 6.008631956907363 , r2 : 0.8516318920747058
LinearRegression
model performance for training set : model: LinearRegression , mae :4.266711846071957 , rmse: 5.323050852720514 , r2 : 0.8743172040139593
mdoel performance for test set : model: LinearRegression, mae :4.214763142474852 , rmse: 5.393993869732845 , r2 : 0.8804332983749564
Ridge
model performance for training set : model: Ridge , mae :4.264987823725981 , rmse: 5.323324922741654 , r2 : 0.874304261521

In [30]:
# Pair each model name with its test-set R^2 and rank from best to worst.
# Iterating the `models` dict yields its keys (the model names).
result_df = (
    pd.DataFrame(list(zip(models, r2_list)), columns=['model name', 'r2 score'])
    .sort_values(by=['r2 score'], ascending=False)
)

In [31]:
# Show the models ranked by R^2 score, highest first.
result_df

Unnamed: 0,model name,r2 score
3,Ridge,0.880593
2,LinearRegression,0.880433
5,RandomForestRegressor,0.853931
0,AdaBoostRegressor,0.851899
1,CatBoostRegressor,0.851632
7,XGBRegressor,0.827797
8,Lasso,0.82532
4,KneighborsRegressor,0.78403
6,DecisionTreeRegressor,0.719691


In [34]:
# Pair each model name with its test-set RMSE and rank from lowest (best)
# to highest. Iterating the `models` dict yields its keys (the model names).
(
    pd.DataFrame(list(zip(models, rmse_list)), columns=['model names', 'rmse'])
    .sort_values(by=['rmse'], ascending=True)
)

Unnamed: 0,model names,rmse
3,Ridge,5.390387
2,LinearRegression,5.393994
5,RandomForestRegressor,5.965406
1,CatBoostRegressor,6.008632
0,AdaBoostRegressor,6.167886
7,XGBRegressor,6.473307
8,Lasso,6.519695
4,KneighborsRegressor,7.2494
6,DecisionTreeRegressor,7.732723
