# Student grades prediction project

# Imports

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib

# Loading data

In [102]:
students = pd.read_csv('data/student-por.csv', sep=';')

In [103]:
students.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [104]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

# Preparing the data

In [108]:
y = students.G3.copy()
X = students.drop(['age', 'guardian', 'sex', 'Mjob', 'Fjob', 'G3', 'reason', 'nursery', 'romantic', 'goout', 'Dalc', 
                       'Walc', 'failures'], axis=1)

## Editing Features

### Grade scale: 0 to 10

In [110]:
y = y.apply(lambda grade: grade / 2)
y

0      5.5
1      5.5
2      6.0
3      7.0
4      6.5
      ... 
644    5.0
645    8.0
646    4.5
647    5.0
648    5.5
Name: G3, Length: 649, dtype: float64

In [111]:
X['G1'] = X['G1'].apply(lambda grade: grade / 2)
X['G2'] = X['G2'].apply(lambda grade: grade / 2)

X[['G1', 'G2']]

Unnamed: 0,G1,G2
0,0.0,5.5
1,4.5,5.5
2,6.0,6.5
3,7.0,7.0
4,5.5,6.5
...,...,...
644,5.0,5.5
645,7.5,7.5
646,5.5,6.0
647,5.0,5.0


### Normalization of absences column

In [112]:
scaler = MinMaxScaler()
absences_reshaped = np.array(X.absences).reshape(-1, 1)

In [113]:
X.drop(['absences'], axis=1, inplace=True)
X['absences'] = scaler.fit_transform(absences_reshaped)
X

Unnamed: 0,school,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,schoolsup,famsup,paid,activities,higher,internet,famrel,freetime,health,G1,G2,absences
0,GP,U,GT3,A,4,4,2,2,yes,no,no,no,yes,no,4,3,3,0.0,5.5,0.1250
1,GP,U,GT3,T,1,1,1,2,no,yes,no,no,yes,yes,5,3,3,4.5,5.5,0.0625
2,GP,U,LE3,T,1,1,1,2,yes,no,no,no,yes,yes,4,3,3,6.0,6.5,0.1875
3,GP,U,GT3,T,4,2,1,3,no,yes,no,yes,yes,yes,3,2,5,7.0,7.0,0.0000
4,GP,U,GT3,T,3,3,1,2,no,yes,no,no,yes,no,4,3,5,5.5,6.5,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,R,GT3,T,2,3,1,3,no,no,no,yes,yes,yes,5,4,5,5.0,5.5,0.1250
645,MS,U,LE3,T,3,1,1,2,no,yes,no,no,yes,yes,4,3,1,7.5,7.5,0.1250
646,MS,U,GT3,T,1,1,2,2,no,no,no,yes,yes,no,1,1,5,5.5,6.0,0.1875
647,MS,U,LE3,T,3,1,2,1,no,no,no,no,yes,yes,2,4,2,5.0,5.0,0.1875


## Fixing the data

### Binary Features

In [114]:
bin_features = ['school', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'higher', 'internet']

In [115]:
ord_enc = OrdinalEncoder()

In [116]:
X[bin_features] = ord_enc.fit_transform(X[bin_features])
X[bin_features] = X[bin_features].astype('int64')

# Building the Model

## Train and Test split data

In [None]:
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [119]:
lin_model = LinearRegression()

In [120]:
lin_model.fit(X_train, y_train)

LinearRegression()

In [121]:
lin_pred = lin_model.predict(X_test)

In [122]:
lin_score = lin_model.score(X_test, y_test)
lin_mae = mean_absolute_error(y_test, lin_pred)
lin_mse = mean_squared_error(y_test, lin_pred)

In [123]:
print(f'Score : {lin_score:.2f}%')
print(f'MAE   : {lin_mae:.2f}')
print(f'MSE   : {lin_mse:.2f}')

Score : 0.85%
MAE   : 0.38
MSE   : 0.37


## Decision Tree

In [124]:
tree_model = DecisionTreeRegressor()

In [125]:
tree_model.fit(X_train, y_train)

DecisionTreeRegressor()

In [126]:
tree_pred = tree_model.predict(X_test)

In [127]:
tree_score = tree_model.score(X_test, y_test)
tree_mae = mean_absolute_error(y_test, tree_pred)
tree_mse = mean_squared_error(y_test, tree_pred)

In [128]:
print(f'Score : {tree_score:.2f}%')
print(f'MAE   : {tree_mae:.2f}')
print(f'MSE   : {tree_mse:.2f}')

Score : 0.61%
MAE   : 0.50
MSE   : 0.96


## Random Forest

In [129]:
rf_model = RandomForestRegressor()

In [130]:
rf_model.fit(X_train, y_train)

RandomForestRegressor()

In [131]:
rf_pred = rf_model.predict(X_test)

In [132]:
rf_score = rf_model.score(X_test, y_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)

In [133]:
print(f'Score : {rf_score:.2f}%')
print(f'MAE   : {rf_mae:.2f}')
print(f'MSE   : {rf_mse:.2f}')

Score : 0.79%
MAE   : 0.42
MSE   : 0.52


## Saving the best model

In [134]:
joblib.dump(lin_model, 'linear_model_grade_prediction.pkl')

['linear_model_grade_prediction.pkl']

# Comparing predictions with real values

In [165]:
df = pd.DataFrame(y_test)
df['G3_pred'] = lin_pred.copy()
df['error'] = abs(df.G3 - df.G3_pred)
df['G3_pred'] = df['G3_pred'].apply(lambda grade: '{:.1f}'.format(grade))
df['error'] = df['error'].apply(lambda error: '{:.1f}'.format(error))
df.head()

Unnamed: 0,G3,G3_pred,error
636,9.5,9.1,0.4
220,6.0,5.8,0.2
594,9.0,9.2,0.2
429,5.5,5.4,0.1
72,5.5,5.9,0.4


In [166]:
df.to_csv('grades_comparison.csv', sep=';', index=False) # Saving the datraframe in CSV file