# Data Analytics Assignment 2024

## Problem Statement: How different factors affect student performance


## LOADING AND CLEANING THE DATASET

In [None]:
## Making all the necessary imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the student-performance dataset from a CSV file (path is relative to the working directory).
stdData = pd.read_csv("StudentPerformanceFactors.csv")
# Last expression of the cell, so the notebook renders the DataFrame.
stdData

In [None]:
# Summary statistics of the raw dataset.
stdData.describe()

In [None]:
# Preview the first rows of the raw dataset.
stdData.head()

In [None]:
# (rows, columns) of the raw dataset.
stdData.shape

In [None]:
# Column names available before any column is dropped.
stdData.columns

In [None]:
# Remove the columns that are not used in this analysis.
irrelevant_columns = [
    'Extracurricular_Activities',
    'Previous_Scores',
    'Motivation_Level',
    'Tutoring_Sessions',
    'Peer_Influence',
    'Physical_Activity',
    'Learning_Disabilities',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
]
stdData = stdData.drop(columns=irrelevant_columns)

In [None]:
# Preview the first rows after dropping the unused columns.
# (head must be called; without parentheses the cell only shows the bound method.)
stdData.head()

In [None]:
# Preview the first rows of the reduced dataset.
stdData.head()

In [None]:
# Count how many times each column name occurs; a count above 1 would indicate a duplicated column name.
stdData.columns.value_counts()

In [None]:
# Count the missing (NaN) cells in every column of stdData.
missing_mask = stdData.isna()
missing_values = missing_mask.sum()
missing_values

In [None]:
# Most frequent value of 'Teacher_Quality'; used below to fill the missing cells.
# mode() returns a Series (there can be ties), so [0] takes its first entry.
mode_value = stdData['Teacher_Quality'].mode()[0]

In [None]:
# Show the value that will be used as the replacement.
mode_value

In [None]:
# Fill only the missing 'Teacher_Quality' cells with that column's mode.
# A DataFrame-wide fillna would also write this label into any other column
# that contains NaN (including numeric ones), silently corrupting it.
stdData['Teacher_Quality'] = stdData['Teacher_Quality'].fillna(mode_value)

In [None]:
# Re-count the missing cells per column to confirm that none are left.
remaining_mask = stdData.isna()
missing_values = remaining_mask.sum()
missing_values

In [None]:
# Rows whose Exam_Score is missing or lies outside the closed range [0, 100].
# between() is False for NaN, so negating it also selects the missing scores.
score_is_valid = stdData['Exam_Score'].between(0, 100)
invalid_exam_scores = stdData[~score_is_valid]
print(invalid_exam_scores)

In [None]:
# Cap Exam_Score at 100: any mark above 100 is replaced by 100.
# Only the upper bound is enforced here; negative or missing scores are left unchanged.
stdData['Exam_Score'] = stdData['Exam_Score'].clip(upper=100)

In [None]:
# Repeat the validity check after capping: the printed frame should contain no rows above 100.
# between() is False for NaN, so negating it also selects the missing scores.
score_is_valid = stdData['Exam_Score'].between(0, 100)
invalid_exam_scores = stdData[~score_is_valid]
print(invalid_exam_scores)

In [None]:
# Display the cleaned DataFrame.
stdData

In [None]:
# Exam_Score takes many distinct values, which makes it a poor classification target.
# Group the scores into a small number of performance bands instead.
# right=False (below) makes every band include its lower edge and exclude its upper edge,
# e.g. a score of 50 falls in 'Pass'; the last edge is 101 so that a score of 100 is still covered.
bins = [0, 50, 60, 70, 75, 101]  # Band edges: [0,50) [50,60) [60,70) [70,75) [75,101)
labels = ['Fail', 'Pass', 'Credit', 'Merit', 'Distinction']  # One label per band, in the same order as the edges

# Create the 'Performance_Category' column from 'Exam_Score'
stdData['Performance_Category'] = pd.cut(stdData['Exam_Score'], bins=bins, labels=labels, right=False)

# Display the first few rows to verify the mapping
print(stdData[['Exam_Score', 'Performance_Category']].head())

In [None]:
# Keep an independent copy of the table before Exam_Score is removed.
# A plain assignment would only create a second name for the same DataFrame,
# so the in-place label encoding done later would also alter the "original".
stdDataOriginal = stdData.copy()

In [None]:
# Display the preserved table (still contains Exam_Score).
stdDataOriginal

In [None]:
# LabelEncoder converts string categories into integer codes.
le = LabelEncoder()

In [None]:
# Replace the text categories of each listed column with integer codes.
# The single encoder is re-fitted per column, so afterwards it only remembers the last column's classes.
# NOTE(review): LabelEncoder numbers the labels in sorted order, which is not necessarily the
# natural ranking of the categories; it would also be a poor fit for columns with no inherent
# order (e.g. 'Red', 'Green', 'Yellow') -- verify this is acceptable for the columns below.
categorical_columns = [
    'Parental_Involvement',
    'Access_to_Resources',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
]
for column in categorical_columns:
    stdData[column] = le.fit_transform(stdData[column])

In [None]:
stdData.head()  # Preview the rows after encoding to check the changes

In [None]:
# Exam_Score has been replaced by Performance_Category, so remove it from the working table.
stdData = stdData.drop(columns=['Exam_Score'])

In [None]:
# Display the table that will be used for modelling.
stdData

In [None]:
# Number of rows whose target (Performance_Category) is missing.
target_missing = stdData['Performance_Category'].isna()
print(target_missing.sum())

## EXPLORATORY DATA ANALYTICS: GRAPHS AND VISUALISATIONS

In [None]:
# Code the graphs here

## MACHINE LEARNING: VALIDATION DATASET

In [None]:
# LabelEncoder converts string categories into integer codes.
# (A fresh encoder instance; this repeats the one created in the cleaning section.)
le = LabelEncoder()

In [None]:
# Make sure the categorical feature columns are integer-coded before modelling.
# The same columns are already encoded in the cleaning section, so a column that is
# already numeric is skipped instead of being re-fitted; this keeps the cell idempotent
# while still encoding the columns if the earlier cell was not run.
# NOTE(review): LabelEncoder numbers the labels in sorted order, which is not necessarily the
# natural ranking of the categories -- verify this is acceptable for these columns.
for column in (
    'Parental_Involvement',
    'Access_to_Resources',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
):
    if not pd.api.types.is_numeric_dtype(stdData[column]):
        stdData[column] = le.fit_transform(stdData[column])

In [None]:
stdData.head()  # Preview the rows to check the encoded columns

## Machine Learning: Building our model
### With our dataset ready, we can now build our model and test various algorithms using the cross-validation method

In [None]:
# Convert the DataFrame to an array and separate the features (X) from the target (Y),
# then hold out 20% of the rows as a validation set; the rest is the training set.
array = stdData.values
# Performance_Category was added last, so the final column is the target and
# every column before it is a feature (no hard-coded column count needed).
X = array[:, :-1]
Y = array[:, -1]
validation_size = 0.20
seed = 7  # fixed seed so the split (and the CV folds that reuse it) is reproducible
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [None]:
# Metric passed to cross_val_score when comparing the models.
scoring = 'accuracy'

In [None]:
# Scale the training features: Logistic Regression on the unscaled data produced
#   ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT
# StandardScaler standardizes each feature to mean 0 and standard deviation 1,
# which can improve convergence during training.
# The scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Imported here so this cell is self-contained.
from sklearn.pipeline import make_pipeline

# Candidate classifiers, keyed by the short name used in the printed report and the plot.
# Logistic Regression needs scaled features (lbfgs did not converge on the raw data), so it
# is wrapped in a pipeline with StandardScaler. The scaler is then re-fitted on the training
# part of every fold, instead of once on the whole training set, so no statistics from the
# held-out fold leak into the score. All other models use the unscaled data, as before.
models = [
    ('LR', make_pipeline(StandardScaler(), LogisticRegression(max_iter=200))),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model
results = []
names = []

# One splitter for all models: with a fixed integer random_state it yields the
# same 10 shuffled folds on every use, so every model is scored on identical splits.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)

    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation) over the folds
    msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(msg)


In [None]:
# Box plot of the per-fold accuracies, one box per algorithm.
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(results, tick_labels=names)
ax.set_title('Algorithm Comparison')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Algorithms')
ax.grid()
plt.show()

In [None]:
# Final evaluation of Logistic Regression on the hold-out validation set.
# The model is trained on the scaled training features (the unscaled data triggered the
# lbfgs ConvergenceWarning), and the validation features are transformed with the same
# scaler that was fitted on the training split, so both sets share one scale.
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_scaled, Y_train)
lda = log_reg  # old (misleading) name kept as an alias in case other cells still use it

X_validation_scaled = scaler.transform(X_validation)
predictions = log_reg.predict(X_validation_scaled)

print(accuracy_score(Y_validation, predictions))
print("\t=============================================\n")
print(confusion_matrix(Y_validation, predictions))
print("\t=============================================\n")
print(classification_report(Y_validation, predictions))

In [None]:
# (rows, columns) of the modelling table.
print(stdData.shape)

In [None]:
# Check the shapes of the training features and training target produced by the split
print(f'X_train shape: {X_train.shape}')
print(f'Y_train shape: {Y_train.shape}')

In [None]:
# Train a Decision Tree classifier on the (unscaled) training split.
# random_state is fixed so the fitted tree is reproducible between runs.
dt_classifier = DecisionTreeClassifier(random_state=seed)
dt_classifier.fit(X_train, Y_train)

## MACHINE LEARNING: FEATURE IMPORTANCE
Still in progress...

In [None]:
# Importance score of each feature, as learned by the fitted decision tree
importances = dt_classifier.feature_importances_

# Pair every score with its column name (the first nine columns are the features)
# and order the table from most to least important
feature_importances = (
    pd.DataFrame({'Feature': stdData.columns[0:9], 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table
print(feature_importances)

# Horizontal bar chart of the same ranking
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances['Feature'], feature_importances['Importance'], color='lightgreen')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for Student Performance Prediction using Decision Tree')
plt.show()
