 Machine learning model to Predict students' dropout and academic success

In [88]:
# Importing the libraries

import pandas as pd
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error


# Reading the dataset from CSV file
student = pd.read_csv('dataset.csv')

student['Target'].unique()

# Convert the Target values to numeric
student['Target'] = student['Target'].map({
    'Dropout':0,
    'Enrolled':1,
    'Graduate':2
})



Developing model using LogisticsRegression

In [89]:
# Developing the model
x = student.drop("Target", axis=1)
y = student['Target']
# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# # Evaluate other metrics
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)
print("R^2 score on training data: {:.2f}".format(model.score(X_train, y_train)))
print("R^2 score on test data: {:.2f}".format(model.score(X_test, y_test)))
print("Mean absolute error: {:.2f}".format(mean_absolute_error(y_test, y_pred)))

print("Confusion Matrix:\n", cm)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.7548022598870057
R^2 score on training data: 0.78
R^2 score on test data: 0.75
Mean absolute error: 0.32
Confusion Matrix:
 [[249  19  48]
 [ 42  36  73]
 [ 14  21 383]]
Precision: 0.7312468576894368
Recall: 0.7548022598870057
F1-Score: 0.7328591968547924


Model using DecisionTreeClassifier

In [90]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)

# Confusion Matrix, Precision, Recall and F1-Score
confusion_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Confusion Matrix:", confusion_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.6813559322033899
Confusion Matrix: [[203  63  50]
 [ 38  61  52]
 [ 33  46 339]]
Precision: 0.688834880399436
Recall: 0.6813559322033899
F1-Score: 0.6833484994688794


Using Random Forest

In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)


Accuracy: 0.768361581920904
Confusion Matrix:
 [[237  23  56]
 [ 32  49  70]
 [  9  15 394]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.75      0.80       316
           1       0.56      0.32      0.41       151
           2       0.76      0.94      0.84       418

    accuracy                           0.77       885
   macro avg       0.72      0.67      0.68       885
weighted avg       0.76      0.77      0.75       885



using use Support Vector Machine (SVM)


In [96]:

# Importing required libraries
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Scaling the data
scaler = StandardScaler()
X = scaler.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and fitting the model
model = SVC(kernel='linear', C=1)
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model's performance
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)

Accuracy: 0.7570621468926554
