# STUDENT PERFORMANCE PREDICTION MODEL -- MATHS

### Steps to be taken:
- import libraries
- import data
- perform EDA and data cleaning
- handling missing values
- feature scaling -  standardise/normalise data
- outlier removal
- data encoding
- model training
- test the model
- save the model as a file 

### IMPORTING THE LIBRARIES

In [1]:
import pandas as pd 
import numpy as np 
# import seaborn as sn
import matplotlib.pyplot as plt 
import matplotlib.ticker as mticker
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)

from sklearn import preprocessing


### IMPORT THE DATA

In [2]:
maths = pd.read_csv(r"C:\Users\sjr\OneDrive\Desktop\COMP SCIENCE\sjrCodes\py\DJANGO\studentPerformance\notebook\data\student\student-mat.csv", sep=";", header=0)

In [3]:
maths.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### EDA AND DATA CLEANING

In [4]:
maths.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

it looks like there are no missing values

In [5]:
maths.address.value_counts()

address
U    307
R     88
Name: count, dtype: int64

In [6]:
maths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [7]:
maths.age.value_counts()

age
16    104
17     98
18     82
15     82
19     24
20      3
22      1
21      1
Name: count, dtype: int64

lets first drop unwanted columns:
- reason
- guardian

In [8]:
maths.drop(['reason', 'guardian'],axis= 1,inplace= True)

In [9]:
maths.drop(['school'], axis=1, inplace=True)

In [10]:
maths.drop(['higher'], axis=1, inplace=True)

In [11]:
maths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sex         395 non-null    object
 1   age         395 non-null    int64 
 2   address     395 non-null    object
 3   famsize     395 non-null    object
 4   Pstatus     395 non-null    object
 5   Medu        395 non-null    int64 
 6   Fedu        395 non-null    int64 
 7   Mjob        395 non-null    object
 8   Fjob        395 non-null    object
 9   traveltime  395 non-null    int64 
 10  studytime   395 non-null    int64 
 11  failures    395 non-null    int64 
 12  schoolsup   395 non-null    object
 13  famsup      395 non-null    object
 14  paid        395 non-null    object
 15  activities  395 non-null    object
 16  nursery     395 non-null    object
 17  internet    395 non-null    object
 18  romantic    395 non-null    object
 19  famrel      395 non-null    int64 
 20  freetime  

In [12]:
maths[['age','Medu','Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'G3']].corr()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,Dalc,Walc,absences,G1,G2,G3
age,1.0,-0.163658,-0.163438,0.070641,-0.00414,0.243665,0.05394,0.016434,0.131125,0.117276,0.17523,-0.064081,-0.143474,-0.161579
Medu,-0.163658,1.0,0.623455,-0.171639,0.064944,-0.23668,-0.003914,0.030891,0.019834,-0.047123,0.100285,0.205341,0.215527,0.217147
Fedu,-0.163438,0.623455,1.0,-0.158194,-0.009175,-0.250408,-0.00137,-0.012846,0.002386,-0.012631,0.024473,0.19027,0.164893,0.152457
traveltime,0.070641,-0.171639,-0.158194,1.0,-0.100909,0.092239,-0.016808,-0.017025,0.138325,0.134116,-0.012944,-0.09304,-0.153198,-0.117142
studytime,-0.00414,0.064944,-0.009175,-0.100909,1.0,-0.173563,0.039731,-0.143198,-0.196019,-0.253785,-0.0627,0.160612,0.13588,0.09782
failures,0.243665,-0.23668,-0.250408,0.092239,-0.173563,1.0,-0.044337,0.091987,0.136047,0.141962,0.063726,-0.354718,-0.355896,-0.360415
famrel,0.05394,-0.003914,-0.00137,-0.016808,0.039731,-0.044337,1.0,0.150701,-0.077594,-0.113397,-0.044354,0.022168,-0.018281,0.051363
freetime,0.016434,0.030891,-0.012846,-0.017025,-0.143198,0.091987,0.150701,1.0,0.209001,0.147822,-0.058078,0.012613,-0.013777,0.011307
Dalc,0.131125,0.019834,0.002386,0.138325,-0.196019,0.136047,-0.077594,0.209001,1.0,0.647544,0.111908,-0.094159,-0.06412,-0.05466
Walc,0.117276,-0.047123,-0.012631,0.134116,-0.253785,0.141962,-0.113397,0.147822,0.647544,1.0,0.136291,-0.126179,-0.084927,-0.051939


### DATA ENCODING
- we are encoding all the categorical data 

#### Lets first seperate the target data from the independent data

#### Label Encoding

In [13]:
le = preprocessing.LabelEncoder()

In [14]:
maths['schoolsup_label'] = le.fit_transform(maths.schoolsup.values)
maths['famsup_label'] = le.fit_transform(maths.famsup.values)
maths['paid_label'] = le.fit_transform(maths.paid.values)
maths['activities_label'] = le.fit_transform(maths.activities.values)
maths['nursery_label'] = le.fit_transform(maths.nursery.values)
maths['internet_label'] = le.fit_transform(maths.internet.values)
maths['romantic_label'] = le.fit_transform(maths.romantic.values)

In [15]:
maths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 36 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sex               395 non-null    object
 1   age               395 non-null    int64 
 2   address           395 non-null    object
 3   famsize           395 non-null    object
 4   Pstatus           395 non-null    object
 5   Medu              395 non-null    int64 
 6   Fedu              395 non-null    int64 
 7   Mjob              395 non-null    object
 8   Fjob              395 non-null    object
 9   traveltime        395 non-null    int64 
 10  studytime         395 non-null    int64 
 11  failures          395 non-null    int64 
 12  schoolsup         395 non-null    object
 13  famsup            395 non-null    object
 14  paid              395 non-null    object
 15  activities        395 non-null    object
 16  nursery           395 non-null    object
 17  internet        

In [16]:
labeled_droped = maths.drop(['schoolsup','famsup', 'paid', 'activities', 'nursery', 'internet', 'romantic'], axis=1, inplace=True)

In [17]:
maths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sex               395 non-null    object
 1   age               395 non-null    int64 
 2   address           395 non-null    object
 3   famsize           395 non-null    object
 4   Pstatus           395 non-null    object
 5   Medu              395 non-null    int64 
 6   Fedu              395 non-null    int64 
 7   Mjob              395 non-null    object
 8   Fjob              395 non-null    object
 9   traveltime        395 non-null    int64 
 10  studytime         395 non-null    int64 
 11  failures          395 non-null    int64 
 12  famrel            395 non-null    int64 
 13  freetime          395 non-null    int64 
 14  goout             395 non-null    int64 
 15  Dalc              395 non-null    int64 
 16  Walc              395 non-null    int64 
 17  health          

#### One Hot encoding

we first drop the target data

In [18]:
x = maths.drop(['G3'],axis='columns')
y = maths['G3']

In [19]:
df_maths = pd.get_dummies(x, drop_first=True, dtype=int)

In [30]:
df_maths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   age               395 non-null    int64
 1   Medu              395 non-null    int64
 2   Fedu              395 non-null    int64
 3   traveltime        395 non-null    int64
 4   studytime         395 non-null    int64
 5   failures          395 non-null    int64
 6   famrel            395 non-null    int64
 7   freetime          395 non-null    int64
 8   goout             395 non-null    int64
 9   Dalc              395 non-null    int64
 10  Walc              395 non-null    int64
 11  health            395 non-null    int64
 12  absences          395 non-null    int64
 13  G1                395 non-null    int64
 14  G2                395 non-null    int64
 15  schoolsup_label   395 non-null    int32
 16  famsup_label      395 non-null    int32
 17  paid_label        395 non-null    i

### FEATURE SCALING
- normalization or standardization

In [31]:
min_max = preprocessing.MinMaxScaler()
scaler = preprocessing.StandardScaler()

#### Normalization

In [32]:
# min_max.fit(df_maths)
# nomarlised_data = min_max.transform(df_maths)
# X = nomarlised_data

#### Standardization

In [33]:
# scaler.fit(df_maths)
# standardized_data = scaler.transform(df_maths)
# X = standardized_data

In [34]:
X = df_maths
X

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,famsize_LE3,Pstatus_T,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher
0,18,4,4,2,2,0,4,3,4,1,...,0,0,0,0,0,0,0,0,0,1
1,17,1,1,1,2,0,5,3,3,1,...,0,1,0,0,0,0,0,1,0,0
2,15,1,1,1,2,3,4,3,2,2,...,1,1,0,0,0,0,0,1,0,0
3,15,4,2,1,3,0,3,2,2,1,...,0,1,1,0,0,0,0,0,1,0
4,16,3,3,1,2,0,4,3,2,1,...,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,1,2,2,5,5,4,4,...,1,0,0,0,1,0,0,0,1,0
391,17,3,1,2,1,0,2,4,5,3,...,1,1,0,0,1,0,0,0,1,0
392,21,1,1,1,1,3,5,5,3,3,...,0,1,0,1,0,0,0,1,0,0
393,18,3,2,3,1,0,4,4,1,3,...,1,1,0,0,1,0,0,1,0,0


In [35]:
Y=y
Y.head()

0     6
1     6
2    10
3    15
4    10
Name: G3, dtype: int64

### MODEL TRAINING
- we will use Logistic regression

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
lr = LogisticRegression()

#### Train Test split

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=10)

#### Train the model

In [38]:
lr.fit(X_train,Y_train)
lr.score(X_test,Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.26582278481012656

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

models = {
    "Linear Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boost": GradientBoostingRegressor(),
    "Support Vector Machine": SVR()
}

param_grids = {
    "Linear Regression": {},
    "Decision Tree": {'max_depth':[None,5,10,15]},
    "Random Forest": {'n_estimators':[50,100,200]},
    "Gradient Boost": {'n_estimators':[50,100,200], 'learning_rate': [0.01, 0,1, 0.5]},
    "Support Vector Machine": {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}
}

for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, Y_train)

print("Best parameters for", model_name, ":", grid_search.best_params_)
print("Best mean squared error:", -grid_search.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for Support Vector Machine : {'C': 10, 'kernel': 'rbf'}
Best mean squared error: 3.543335643364627
