In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer #for missing data
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


file_path ='D:\\ML\\student_performance.xlsx'
music_data= pd.read_excel(file_path)



music_data.dropna(subset=['GPA'], inplace=True) 
music_data.isna().sum() #check number of missing data again


x=music_data.drop(columns=['GPA'])
y=music_data['GPA']

#for missing data
cat_imputer=SimpleImputer(strategy='constant',fill_value='missing')
num_imputer=SimpleImputer(strategy='constant',fill_value=0)


#define column
cat_features=['Gender','Major','PartTimeJob','ExtraCurricularActivities']
num_features=['Age','StudyHoursPerWeek','AttendanceRate']

#create imputer
imputer=ColumnTransformer([
    ('cat_imputer',cat_imputer,cat_features),
    ('num_imputer',num_imputer,num_features)
])


filled_x=imputer.fit_transform(x)
#filled_x

#checking data got filled
filled_data=pd.DataFrame(filled_x,columns=['Gender','Major','PartTimeJob','ExtraCurricularActivities','Age','StudyHoursPerWeek','AttendanceRate'])
filled_data.isna().sum()


categorical_features=['Gender','Major','PartTimeJob','ExtraCurricularActivities']   #categorical features to convert into number
one_hot=OneHotEncoder()
transformer=ColumnTransformer([('one_hot',one_hot,categorical_features)],remainder='passthrough')
transformed_x=transformer.fit_transform(filled_data)

# Convert sparse matrix to dense
#transformed_x = transformed_x.toarray()

dummies=pd.get_dummies(music_data[['Gender','Major','PartTimeJob','ExtraCurricularActivities','Age','StudyHoursPerWeek','AttendanceRate']])
dummies



Unnamed: 0,Age,StudyHoursPerWeek,AttendanceRate,Gender_Female,Gender_Male,Major_Arts,Major_Business,Major_Education,Major_Engineering,Major_Science,PartTimeJob_No,PartTimeJob_Yes,ExtraCurricularActivities_No,ExtraCurricularActivities_Yes
0,24,37,90.75,False,True,True,False,False,False,False,False,True,True,False
1,22,37,74.90,True,False,False,False,True,False,False,True,False,True,False
2,22,10,53.36,False,True,False,True,False,False,False,True,False,True,False
3,24,10,70.26,False,True,False,False,False,False,True,False,True,True,False
4,18,19,74.87,False,True,False,False,True,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,22,37,76.61,False,True,False,False,False,False,True,True,False,True,False
496,23,11,56.29,False,True,False,False,False,False,True,True,False,True,False
497,20,6,56.64,True,False,False,False,False,False,True,True,False,False,True
498,22,18,57.18,False,True,False,True,False,False,False,True,False,False,True


In [5]:
x_train,x_test,y_train,y_test=train_test_split(transformed_x,y,test_size=0.2)


model=LinearRegression()
model.fit(x_train,y_train)
model1 = DecisionTreeRegressor()
model1.fit(x_train, y_train)
model2 = RandomForestRegressor()
model2.fit(x_train, y_train)
model3 = GradientBoostingRegressor()
model3.fit(x_train, y_train)
model4 = SVR()
model4.fit(x_train, y_train)
model5 = KNeighborsRegressor()
model5.fit(x_train, y_train)



prediction=model.predict(x_test)
prediction1=model1.predict(x_test)
prediction2=model2.predict(x_test)
prediction3=model3.predict(x_test)
prediction4=model4.predict(x_test)
prediction5=model5.predict(x_test)

mse = mean_squared_error(y_test, prediction)
mse1 = mean_squared_error(y_test, prediction1)
mse2 = mean_squared_error(y_test, prediction2)
mse3 = mean_squared_error(y_test, prediction3)
mse4 = mean_squared_error(y_test, prediction4)
mse5 = mean_squared_error(y_test, prediction5)

r2 = r2_score(y_test, prediction)
r2_1 = r2_score(y_test, prediction1)
r2_2 = r2_score(y_test, prediction2)
r2_3 = r2_score(y_test, prediction3)
r2_4 = r2_score(y_test, prediction4)
r2_5 = r2_score(y_test, prediction5)


mse_scores = [mse, mse1, mse2, mse3, mse4, mse5]
r_score=[r2,r2_1,r2_2,r2_3,r2_4,r2_5]
print([mse_scores,r_score])


[[0.3315276708223458, 0.703635, 0.3656388651999999, 0.39012083881024223, 0.3235213586821137, 0.39235428000000006], [-0.014514395621732845, -1.1532074079747763, -0.11889873754450853, -0.1938165101672702, 0.009985878825823247, -0.20065110781386597]]
