In [29]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [30]:
# Load the student scores dataset; the CSV is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="student_scores_with_exception.csv")
# Preview the first five rows to sanity-check the columns and values.
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group E,some high school,free/reduced,none,37,10,55
1,female,group B,some high school,standard,none,25,34,6
2,male,group B,high school,standard,completed,39,60,94
3,male,group C,master's degree,free/reduced,completed,9,29,49
4,male,group A,bachelor's degree,free/reduced,completed,16,62,33


In [31]:
# Feature frame: every column of `df` except the regression target `math_score`.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
X = df.drop(columns=['math_score'])
X.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group E,some high school,free/reduced,none,10,55
1,female,group B,some high school,standard,none,34,6
2,male,group B,high school,standard,completed,60,94
3,male,group C,master's degree,free/reduced,completed,29,49
4,male,group A,bachelor's degree,free/reduced,completed,62,33


In [32]:
# Print the distinct values of each categorical column, one line per column.
# Each entry pairs the heading text with the column it describes.
category_headings = [
    ("Categories in 'gender' variable:     ", 'gender'),
    ("Categories in 'race_ethnicity' variable:  ", 'race_ethnicity'),
    ("Categories in'parental level of education' variable:", 'parental_level_of_education'),
    ("Categories in 'lunch' variable:     ", 'lunch'),
    ("Categories in 'test preparation course' variable:     ", 'test_preparation_course'),
]
for heading, column_name in category_headings:
    # print's default separator supplies the single space between the heading
    # and the array of unique values.
    print(heading, df[column_name].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group E' 'group B' 'group C' 'group A' 'group D']
Categories in'parental level of education' variable: ['some high school' 'high school' "master's degree" "bachelor's degree"
 "associate's degree" 'some college']
Categories in 'lunch' variable:      ['free/reduced' 'standard']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [33]:
# Regression target: the `math_score` column of the loaded frame.
y = df['math_score']
# Display the target series.
y

0      37
1      25
2      39
3       9
4      16
       ..
496    29
497    34
498    16
499    25
500    90
Name: math_score, Length: 501, dtype: int64

In [34]:
# Split the feature columns by dtype so each group can be given its own
# transformer: object-dtype columns are treated as categorical, all other
# columns as numerical.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns
print("numerical:" , num_features)
print("categorical:", cat_features)

numerical: Index(['reading_score', 'writing_score'], dtype='object')
categorical: Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [35]:
# One transformer per feature group: one-hot encode the categorical columns
# and standardise the numerical columns.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Route each column group to its transformer; the outputs are concatenated in
# the order listed (one-hot columns first, scaled numeric columns last).
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [36]:
# Display the (not yet fitted) scaler's configuration.
numeric_transformer

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [37]:
# Display the ColumnTransformer and the parameters of its two transformers.
preprocessor

0,1,2
,transformers,"[('OneHotEncoder', ...), ('StandardScaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [38]:
# Fit the preprocessor and transform the features into the model matrix.
# The raw feature frame is rebuilt from `df` rather than read from `X`, so this
# cell can be re-run safely: once `X` holds the transformed array, feeding it
# back into the ColumnTransformer would fail because its columns are selected
# by name, which requires a DataFrame.
X = preprocessor.fit_transform(df.drop(columns=['math_score']))

In [39]:
# Display the transformed feature matrix.
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -1.43236633,  0.11116838],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.59598459, -1.56516613],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.31009563,  1.44539381],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -0.66568307, -0.77831524],
       [ 1.        ,  0.        ,  1.        , ...,  1.        ,
         0.24039715,  1.20591745],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.28587432,  1.37697199]])

In [40]:

# (rows, columns) of the transformed feature matrix.
X.shape


(501, 19)

In [41]:
# Separate the dataset into train and test sets BEFORE fitting the preprocessor,
# so the scaler statistics and encoder categories are learned from the training
# rows only. Fitting on the full dataset and splitting afterwards leaks
# test-set information into training.
from sklearn.model_selection import train_test_split

# Split the raw (untransformed) features; the same seed keeps the split reproducible.
X_raw = df.drop(columns=['math_score'])
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then apply the already-fitted transform to the test rows.
# NOTE: OneHotEncoder is left at its default handle_unknown='error', so a category
# that appears only in the test rows would raise here.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
X_train.shape, X_test.shape

((400, 19), (101, 19))

## Linear model

In [None]:
# Fit an ordinary least-squares baseline on the training split and predict
# the held-out test split. `fit` returns the estimator itself, so the model
# can be created and fitted in one expression.
lin_model = LinearRegression().fit(X_train, y_train)
y_pred = lin_model.predict(X_test)

# r2_score is the coefficient of determination, not an accuracy, so the
# printed label names the metric explicitly.
score = r2_score(y_test, y_pred)
print(" R2 score of the model is %.2f" % score)

 Accuracy of the model is 0.02


In [46]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, aligned element-wise with ``true``.

    Returns:
        Tuple ``(mae, rmse, r2_square)``: the mean absolute error, the root
        mean squared error and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and reuse it for the RMSE instead of evaluating
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [47]:
# Metrics for the predictions held in `y_pred`, returned as (MAE, RMSE, R^2).
evaluate_model(y_test, y_pred)


(23.485727420851287, np.float64(27.38932000500593), 0.021679105984978464)

In [49]:

# Side-by-side view of actual vs. predicted values on the test split,
# with the residual (actual minus predicted) per row.
residuals = y_test - y_pred
pred_df = pd.DataFrame(
    {
        'Actual Value': y_test,
        'Predicted Value': y_pred,
        'Difference': residuals,
    }
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
362,94,54.189575,39.810425
73,53,48.237881,4.762119
375,12,56.961812,-44.961812
155,50,49.184991,0.815009
104,49,60.215412,-11.215412
...,...,...,...
86,62,52.261718,9.738282
75,86,44.524929,41.475071
439,23,56.907631,-33.907631
15,72,55.962875,16.037125


## Ridge model

In [53]:
# Ridge regression with regularisation strength alpha=0.5, fitted on the
# training split. `fit` returns the estimator, so construction and fitting
# are chained.
Ridge_model = Ridge(alpha=0.5).fit(X_train, y_train)
# Display the fitted estimator's parameters.
Ridge_model


0,1,2
,alpha,0.5
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [54]:
# Predict the held-out test split with the fitted ridge model and score it.
y_pred_ridge = Ridge_model.predict(X_test)

# r2_score is the coefficient of determination, not an accuracy, so the
# printed label names the metric explicitly.
score_ridge = r2_score(y_test, y_pred_ridge)
# Backward-compatible alias for the original misspelled name, in case a later
# cell still reads it.
scroe_ridge = score_ridge
print(" R2 score of the Ridge model is %.2f" % score_ridge)

 Accuracy of the Ridge model is 0.02
