# Student Grade Prediction Models: Multiple Regression and Classification

**Objective:** Predict the final grade of Portuguese secondary-education students using regression and classification machine-learning models.

# Import Libraries

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS so output cells expand to their full height instead of scrolling.
display(HTML("<style>.output_wrapper .output {overflow-y: visible;height: fit-content;}</style>"))

In [None]:
# Importing required libraries.
import pandas as pd # Importing pandas
import numpy as np # Importing numpy
import seaborn as sns #Importing Seaborn for data visualisation
import matplotlib.pyplot as plt #Importing matplotlib for data visualisation
from sklearn.preprocessing import OrdinalEncoder,StandardScaler, LabelEncoder# Importing Ordinal encoder to encode the categorical data
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier# Importing random forest regression
from sklearn.preprocessing import MinMaxScaler # Importing to do standardisation
from sklearn.model_selection import train_test_split #Importing to split our data for training
from sklearn.model_selection import cross_val_score,RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score,confusion_matrix, classification_report,mean_squared_error
from collections import Counter 
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler
%matplotlib inline 
sns.set(color_codes=True)

In [None]:
from eda_module import * #Importing the eda_module
# Load the student data set once. `data` is an independent copy so that later
# in-place edits to `data` do not leak into `studentDataSet`.
studentDataSet = getDataSet()
data = studentDataSet.copy()

# Exploratory Data Analysis

# Data Mining 

In [None]:
# Calling the EDA function from EDA module
dataInfo()

In [None]:
#Statistics Details
#Calling the function from EDA module
statistics(studentDataSet)

# Data Visualization

In [None]:
#Calling the function to show all the data visualization
plotMain(studentDataSet)

# Checking Collinearity

In [None]:
#Checking the co-relation matrix
checkCorrelation(studentDataSet)

# Common Functions

In [None]:
#Function to divide the data into dependent& independent variable
def divideDependentNIndependent(studentDataSet):
    """Split the student frame into predictors and the G3 response.

    Parameters
    ----------
    studentDataSet : pandas.DataFrame
        Frame that contains a ``G3`` column.

    Returns
    -------
    tuple
        ``(explanatory, target)`` where ``explanatory`` is the frame without
        ``G3`` and ``target`` is the ``G3`` column. The input is not modified.
    """
    response = studentDataSet['G3']
    predictors = studentDataSet.drop(['G3'], axis=1)
    return predictors, response

In [None]:
#Function to transform categorical to numbers
def transformCategorical(explanatoryVariables):
    """Ordinal-encode the categorical predictors and return a numeric frame.

    Parameters
    ----------
    explanatoryVariables : pandas.DataFrame
        Predictor frame containing the categorical columns listed in
        ``categorical_columns`` below.

    Returns
    -------
    pandas.DataFrame
        Frame with the row index of the input and exactly the columns in
        ``column_names`` (in that order). The categorical columns are replaced
        by their integer ordinal codes; the other columns are passed through.
        Because the result is reindexed to ``column_names``, input columns not
        in that list are dropped and listed columns missing from the input
        come back as NaN.
    """
    categorical_columns = ['school', 'sex', 'address','famsize','Pstatus','Mjob','Fjob','reason','guardian','schoolsup','famsup','paid',
        'activities','nursery','higher','internet','romantic']
    column_names = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2']

    # Using OrdinalEncoder to encode the categorical variables
    encoder = OrdinalEncoder()
    encoded_values = encoder.fit_transform(explanatoryVariables[categorical_columns]).astype(int)

    # Rebuild a frame on the caller's row index. Without the explicit index the
    # encoded frame gets a fresh RangeIndex, and the index join below would
    # drop or misalign rows whenever the input index is not 0..n-1.
    encoded = pd.DataFrame(encoded_values, columns=categorical_columns,
                           index=explanatoryVariables.index)

    # Swap the original categorical columns for their encoded counterparts.
    numeric_part = explanatoryVariables.drop(categorical_columns, axis=1)
    dataSet = pd.merge(numeric_part, encoded, right_index=True, left_index=True)

    # Restore the expected column order.
    return dataSet.reindex(columns=column_names)

In [None]:
#Min-max scaling (normalisation) of the features
def scalerFn(x_train,x_test):
    """Min-max scale the train and test features.

    The ``MinMaxScaler`` is fitted on ``x_train`` only and the same fitted
    scaler is then applied to both sets.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)`` as returned by ``scaler.transform``.
    """
    min_max = MinMaxScaler()
    min_max.fit(x_train)
    scaled_train = min_max.transform(x_train)
    scaled_test = min_max.transform(x_test)
    return scaled_train, scaled_test

In [None]:
#Spliting train and test set for machine learning
def trainNtest(X,target,train_size=0.77,random_state=None):
    """Split the data into train/test sets and min-max scale the features.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    target : array-like or pandas.Series
        Response values aligned with ``X``.
    train_size : float, default 0.77
        Fraction of the rows used for training (77% train / 23% test by
        default, the split this notebook has always used).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        behaviour of a different random split on every call; pass an integer
        for a reproducible split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where the two feature sets have
        been scaled by ``scalerFn`` (scaler fitted on the training part only).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, target, train_size=train_size, random_state=random_state)
    x_train,x_test = scalerFn(X_train, X_test)
    return x_train,x_test,y_train,y_test

In [None]:
#Function to find the cross validation score
def crossValidationScore(model,x,y):
    """Print the mean 5-fold cross-validation score of ``model`` as a percentage.

    The score is whatever ``cross_val_score`` computes for the given estimator
    with its default scoring. Nothing is returned.
    """
    fold_scores = cross_val_score(model, x, y, cv=5)
    mean_percent = round(fold_scores.mean()*100,2)
    print("Accuracy cross validation :",mean_percent,"%")

In [None]:
#Function to do hypertuning using gridSearchCV
def tuningUsinggridSearchCv(model_name,param_grid,X_train,y_train):
    """Tune an estimator with an exhaustive 3-fold grid search.

    Parameters
    ----------
    model_name : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Parameter grid passed to ``GridSearchCV``.
    X_train, y_train
        Training data used for the search.

    Returns
    -------
    estimator
        The ``best_estimator_`` found by the search.
    """
    search = GridSearchCV(model_name, param_grid, cv=3, verbose=0, n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_estimator_

In [None]:
#Function to do hypertuning using randomSearchCV
def tuningUsingRandomSearchCv(model_name,param_grid,X_train,y_train):
    """Tune an estimator with a 3-fold randomised parameter search.

    Parameters
    ----------
    model_name : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Parameter distributions passed to ``RandomizedSearchCV``.
    X_train, y_train
        Training data used for the search.

    Returns
    -------
    estimator
        The ``best_estimator_`` found by the search.
    """
    search = RandomizedSearchCV(
        estimator=model_name,
        param_distributions=param_grid,
        cv=3,
        verbose=0,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

In [None]:
#Function to evaluate the model
def evaluate(model, test_features, test_labels, text="Test data"):
    """Print the mean absolute error and score of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict(X)`` and ``score(X, y)``.
    test_features : array-like
        Features to evaluate on.
    test_labels : array-like
        True values aligned with ``test_features``.
    text : str, default "Test data"
        Label printed next to the accuracy figure.

    Returns
    -------
    float
        The value of ``model.score(test_features, test_labels)``.
    """
    predictions = model.predict(test_features)
    # Compare position by position, whatever container types were passed in.
    errors = np.abs(np.asarray(predictions) - np.asarray(test_labels))
    # Score on the data handed to this function, not on notebook globals.
    modelAccuracy = model.score(test_features, test_labels)
    print('Model Performance')
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print("Accuracy ({}): {:.2f}%".format(text, modelAccuracy*100))
    return modelAccuracy

In [None]:
# Function to evaluate the accuracy of the model
def evaluate_model(model, class_balance,X_test, y_test):
    """Print the score of ``model`` on the given test data as a percentage.

    ``class_balance`` is a free-text label shown in parentheses in the printed
    line. Nothing is returned.
    """
    score_percent = model.score(X_test, y_test) * 100
    print("Accuracy ({}): {:.2f}%".format(class_balance, score_percent))

In [None]:
def prediction_model(model,class_balance,X_test,y_test):
    """Plot the confusion matrix and print the classification report.

    Predictions are made with ``model.predict(X_test)`` and compared against
    ``y_test``. ``class_balance`` is accepted for signature parity with
    ``evaluate_model`` but is not used here. Nothing is returned.
    """
    predicted = model.predict(X_test)
    counts = confusion_matrix(y_test, predicted)
    report = classification_report(y_test, predicted)

    # Heat map of the confusion matrix: rows are actual, columns predicted.
    plt.figure(figsize=(8,8))
    sns.heatmap(counts, annot=True, fmt='g', vmin=0, cbar=False, cmap='Blues')
    plt.title("Confusion matrix")
    plt.xlabel("Predicted Value")
    plt.ylabel("Actual Value")
    plt.show()

    print(report)

In [None]:
# Function to plot the feature importance graph
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        Importance value per feature.
    names : array-like
        Feature names, aligned with ``importance``.
    model_type : str
        Prefix for the chart title (concatenated directly with
        ``'FEATURE IMPORTANCE'``, so include a trailing space if wanted).
    """
    # Pair each name with its importance and rank in decreasing order.
    ranking = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(15,8))
    sns.barplot(x=ranking['feature_importance'], y=ranking['feature_names'])

    # Chart labels
    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Function to categorise the target variable
def categoriseTarget(data):
    """Bin the final grade ``G3`` into three encoded classes.

    Grades are cut into the intervals [0, 9] -> 'Fail', (9, 14] -> 'Good' and
    (14, 21] -> 'High', then label-encoded. LabelEncoder numbers classes in
    sorted label order, so the codes are Fail=0, Good=1, High=2.

    The input frame is left untouched, so the function can be called again on
    the same frame and still sees raw grades. Overwriting ``data['G3']`` in
    place made a second call re-bin the codes 0/1/2 into a single class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``G3`` column.
        NOTE(review): grades outside 0..21 fall outside every bin; confirm the
        data never contains such values.

    Returns
    -------
    pandas.Series
        Encoded class per row, named ``G3`` and indexed like ``data``.
    """
    # classification - a grade of 9 or below is a fail
    cat = (0, 9, 14, 21)
    cat_name = ['Fail','Good','High']
    grade_bands = pd.cut(data['G3'], bins=cat, labels=cat_name, include_lowest=True)
    student_grade_class = LabelEncoder()
    codes = student_grade_class.fit_transform(grade_bands)
    return pd.Series(codes, index=data.index, name='G3')

In [None]:
#Function to check class imbalance by plotting the target
def checkingImbalance(target):
    """Plot how many samples fall into each target class.

    Draws an outlined (unfilled) count plot of ``target`` and relabels the
    x ticks as Fail/Good/High.
    NOTE(review): the tick labels are fixed, so this assumes exactly three
    classes plotted in the order Fail, Good, High - confirm for new targets.
    """
    outline_colours = sns.color_palette("dark", 3)
    axis = sns.countplot(
        x=target,
        facecolor=(0, 0, 0, 0),
        linewidth=5,
        edgecolor=outline_colours,
    )
    plt.title('Y-train distribution plot')
    plt.xlabel('Y-train',fontsize=15)
    plt.ylabel('Count',fontsize=15)
    axis.set_xticklabels(['Fail','Good','High'],fontsize=12)
# checkingImbalance(targetVariable)

In [None]:
#Function to create sample using SMOTE
def applySmote(X_train,y_train):
    """Resample the training data with SMOTE using its default settings.

    Returns
    -------
    tuple
        ``(x, y)`` as returned by ``SMOTE().fit_resample``.
    """
    oversampler = SMOTE()
    resampled_x, resampled_y = oversampler.fit_resample(X_train, y_train)
    return resampled_x, resampled_y

In [None]:
#Function to create sample using Random undersampler
def applyRandomSampler(x,y):
    """Resample the data with ``RandomUnderSampler`` (fixed ``random_state=0``).

    Returns
    -------
    tuple
        ``(x, y)`` as returned by ``RandomUnderSampler.fit_resample``.
    """
    undersampler = RandomUnderSampler(random_state=0)
    resampled_x, resampled_y = undersampler.fit_resample(x, y)
    return resampled_x, resampled_y

# Data Preprocessing

In [None]:
# The data is divided to explanatory and target variables
explanatoryVariables,target = divideDependentNIndependent(studentDataSet)

In [None]:
# Checking first 5 view observations of explanatoryVariables
explanatoryVariables.head()

In [None]:
# Checking first 5 view observations of target variables
target.head()

In [None]:
# Transform the categorical explanatory variables to numerical
x = transformCategorical(explanatoryVariables)
x.head()

# Regression

In [None]:
# Function that tunes, fits and evaluates the random forest regression model
def randomForestRegressionModel(x,y):
    """Tune, fit and evaluate a random forest regressor for the final grade.

    The data is split and scaled with ``trainNtest``, and a
    ``RandomForestRegressor`` is tuned with ``tuningUsingRandomSearchCv``.
    The function then prints R², adjusted R², MSE, MAE and RMSE on the test
    split, draws a residual plot and plots the feature importances.

    Parameters
    ----------
    x : pandas.DataFrame or array-like
        Numeric feature matrix. When it has a ``columns`` attribute, those
        names label the feature-importance chart.
    y : array-like or pandas.Series
        Target values aligned with ``x``.

    Returns
    -------
    numpy.ndarray
        Predictions for the test split.
    """
    X_train,X_test,y_train,y_test = trainNtest(x,y)#After scaling and splitting
    random_grid = {
                   'max_depth': [10, 20, 30, 50, 60, 70, 80,90,100, None],
                   'min_samples_leaf': [1, 2, 4],
                   'min_samples_split': [2, 5, 10],
                   'n_estimators': [100,130,150, 180, 200],
                   'random_state':[0,15,42]
                    }
    # The search returns its best estimator already refitted on the whole
    # training split, so no extra fit is needed before predicting.
    model = tuningUsingRandomSearchCv(RandomForestRegressor(), random_grid,X_train,y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    # Adjusted R² must use the size of the sample R² was computed on, i.e. the
    # test split, not the full data set.
    n_samples = len(y_test)
    n_features = X_test.shape[1]
    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom > 0:
        Adj_r2 = 1 - (1 - r2) * (n_samples - 1) / degrees_of_freedom
    else:
        # Too few test rows for the adjustment to be defined.
        Adj_r2 = float('nan')
    mse = mean_squared_error(y_test, y_pred)

    print('R2 value  : ',round(r2,2))
    print('Adjusted r2 : ', round(Adj_r2,2))
    print('Mean Squared Error:', round(mse,2))
    print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_test, y_pred),2))
    print('Root Mean Squared Error:', round(np.sqrt(mse),2))
    # For a regressor, model.score is R², so the same value is reported here.
    print('Accuracy :',round(r2,2))

    sns.residplot (x=y_pred, y=(y_test - y_pred), lowess = True)
    plt.title( 'Residual Plot' )
    plt.xlabel('Predicted G3')
    plt.xticks(rotation=90)
    plt.ylabel ( 'Residuals')

    # Label importances with the columns the model was actually trained on.
    feature_names = getattr(x, 'columns', None)
    if feature_names is None:
        feature_names = ['feature_{}'.format(i) for i in range(n_features)]
    plot_feature_importance(model.feature_importances_,feature_names,'RANDOM FOREST ')
    return y_pred

In [None]:
# Calling the model
y_pred = randomForestRegressionModel(x,target)

# Classification

Let's categorise the G3 variable into 3 different categories based on the grade awarded

In [None]:
#Checking the first 5 entries of x to confirm the values are changed to numerical
x.head()

In [None]:
#View first 5 entries
data.head()

In [None]:
#The Y is categorising to specific group
y = categoriseTarget(data)

In [None]:
y

In [None]:
X_train,X_test,y_train,y_test = trainNtest(x,y)#Function to split train and test set

# Support Vector Machine

In [None]:
# SVM model creation
from sklearn.svm import SVC
def svmModel(x,y):
    """Grid-search an SVM classifier on the given training data.

    Parameters
    ----------
    x : array-like
        Training features.
    y : array-like
        Training labels.

    Returns
    -------
    sklearn.svm.SVC
        The best estimator found by ``tuningUsinggridSearchCv``, fitted on
        ``(x, y)``.
    """
    # kernel, C and gamma are all supplied by the grid below.
    svc = SVC()
    param_grid={'gamma':[1,1.5,2,3.5],'C': [1, 10], 'kernel': ('linear', 'rbf')}
    # Tune on the data passed in, not on the notebook-level X_train/y_train.
    # Otherwise resampled training sets would be tuned on the original
    # imbalanced data. The grid search refits the best estimator on (x, y),
    # so it is returned as is.
    return tuningUsinggridSearchCv(svc,param_grid,x,y)

# Random Forest Classification

In [None]:
# RandomForestClassifier Model
from  sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint
def randomForestClassifier(X_train,y_train):
    """Grid-search a random forest classifier on the given training data.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The best estimator found by ``tuningUsinggridSearchCv``, fitted on
        the training data.
    """
    rd = RandomForestClassifier()
    # 'auto' is omitted from max_features: for classifiers it was an alias of
    # 'sqrt', and recent scikit-learn releases reject it.
    param_grid= {'n_estimators': [100,113,150,200],'max_features': ['sqrt', 'log2']}
    # The grid search refits the best estimator on the full training data, so
    # it is returned without a second fit.
    return tuningUsinggridSearchCv(rd,param_grid,X_train,y_train)

# Multi-Layer Perceptron Neural Networks

In [None]:
#Multi-layer perceptron classifier tuned with grid search
def multiLayerPerceptionNeuralNetworksModel(X_train,y_train):
    """Grid-search an ``MLPClassifier`` and fit it on the training data.

    The search covers single hidden layers of 150, 100 and 50 units with
    ``max_iter=800`` and ``random_state=1``.

    Returns
    -------
    sklearn.neural_network.MLPClassifier
        The best estimator from the search, fitted on ``(X_train, y_train)``.
    """
    search_space = {'hidden_layer_sizes':[150,100,50], 'max_iter': [800],'random_state':[1]}
    tuned = tuningUsinggridSearchCv(MLPClassifier(), search_space, X_train, y_train)
    # Fit the selected configuration on the training data and return it.
    return tuned.fit(X_train, y_train)

# Comparing the models with class imbalanced Data

Evaluating the Svm model for imbalanced data

In [None]:
# Evaluate the svm model for the imbalanced data
model = svmModel(X_train,y_train)
evaluate_model(model,"Imbalanced Data:", X_test,y_test)
prediction_model(model,"Imbalanced Data:",X_test,y_test) 

# Random forest Classification Model

In [None]:
# Evaluate the random forest classifier on the imbalanced data
model = randomForestClassifier(X_train,y_train)
evaluate_model(model,"Imbalanced Data:", X_test,y_test)
prediction_model(model,"Imbalanced Data:",X_test,y_test) 

# Multi-Layer Perceptron Neural Networks

In [None]:
# Evaluate the multi-layer perceptron on the imbalanced data
model = multiLayerPerceptionNeuralNetworksModel(X_train,y_train)
evaluate_model(model,"Imbalanced Data:", X_test,y_test)
prediction_model(model,"Imbalanced Data:",X_test,y_test) 

# Checking the class Imbalance problem

In [None]:
checkingImbalance(y_train)

# SMOTE Oversampler to solve class imbalance issue

In [None]:
#Applying smote to solve class imbalance issue
x_samp,y_samp =  applySmote(X_train,y_train)
checkingImbalance(y_samp)

# Checking the model with sampled data

# SVM MODEL

In [None]:
# Evaluate the SVM model trained on the SMOTE-resampled data
model = svmModel(x_samp,y_samp)
evaluate_model(model,"After sampling using SMOTE", X_test,y_test)
prediction_model(model,"After sampling using SMOTE",X_test,y_test) 

# Random forest Classification Model

In [None]:
# Evaluate the random forest classifier trained on the SMOTE-resampled data
model = randomForestClassifier(x_samp,y_samp)
evaluate_model(model,"After sampling using SMOTE", X_test,y_test)
prediction_model(model,"After sampling using SMOTE",X_test,y_test) 

# Multi-Layer Perceptron Neural Networks

In [None]:
# Evaluate the multi-layer perceptron trained on the SMOTE-resampled data
model = multiLayerPerceptionNeuralNetworksModel(x_samp,y_samp)
evaluate_model(model,"After sampling using SMOTE", X_test,y_test)
prediction_model(model,"After sampling using SMOTE",X_test,y_test) 

# RandomUnderSampler to solve class imbalance issue

In [None]:
#Applying random undersampling to solve class imbalance issue
x_random,y_random =  applyRandomSampler(X_train,y_train)
checkingImbalance(y_random)

# Checking the model on resampled data

# SVM MODEL

In [None]:
# Evaluate the SVM model trained on the randomly undersampled data
model = svmModel(x_random,y_random)
evaluate_model(model,"After sampling using Random undersampler", X_test,y_test)
prediction_model(model,"After sampling using Random undersampler",X_test,y_test) 

# Random Forest Classification Model

In [None]:
# Evaluate the random forest classifier trained on the randomly undersampled data
model = randomForestClassifier(x_random,y_random)
evaluate_model(model,"After sampling using Random undersampler", X_test,y_test)
prediction_model(model,"After sampling using Random undersampler",X_test,y_test) 

# Multi-Layer Perceptron Neural Networks

In [None]:
# Evaluate the multi-layer perceptron trained on the randomly undersampled data
model = multiLayerPerceptionNeuralNetworksModel(x_random,y_random)
evaluate_model(model,"After sampling using Random undersampler", X_test,y_test)
prediction_model(model,"After sampling using Random undersampler",X_test,y_test) 