In [9]:
# import the necessary packages
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

In [10]:
# Split your training and test data

# Load the data from the file
# File is in the ../Data directory
# File name is "data_academic_ready.csv"

cleanedStudentsGradesDf = pd.read_csv("../data/data_academic_ready.csv")
cleanedStudentsGradesDf.head()

# Divide the X and y coordinates into two dataframes for training. gradingScale can be the labels dataframe
# and the rest can be the features dataframe
gradingScaleLabelsDf = cleanedStudentsGradesDf['GRADE']
gradingScaleLabelsDf.head()

featuresDf = cleanedStudentsGradesDf.drop(['GRADE', 'Unnamed: 0'], axis=1)
featuresDf.head()

# Check the shape of the features and labels dataframes
print("Features shape:", featuresDf.shape)
print("Labels shape:", gradingScaleLabelsDf.shape)

# Ensure that the features and labels dataframes have the same number of rows
if featuresDf.shape[0] != gradingScaleLabelsDf.shape[0]:
    raise ValueError("Number of rows in features and labels dataframes are not equal.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(featuresDf, gradingScaleLabelsDf, test_size=0.3)

Features shape: (39572, 85)
Labels shape: (39572,)


In [11]:
X_train = np.ascontiguousarray(X_train)
y_train = np.ascontiguousarray(y_train)
X_test = np.ascontiguousarray(X_test)
y_test = np.ascontiguousarray(y_test)

In [12]:
# Build the classifiers list
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [13]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 41.1% 
              precision    recall  f1-score   support

           A       0.57      0.69      0.62       894
          A+       0.89      0.91      0.90       910
          A-       0.43      0.40      0.41       937
           B       0.32      0.30      0.31       908
          B+       0.30      0.35      0.32       875
          B-       0.26      0.25      0.26       920
           C       0.23      0.20      0.21       928
          C+       0.25      0.24      0.25       903
          C-       0.28      0.21      0.24       940
           D       0.31      0.19      0.23       922
          D+       0.31      0.25      0.27       926
          D-       0.34      0.46      0.39       907
           F       0.62      0.93      0.74       902

    accuracy                           0.41     11872
   macro avg       0.39      0.41      0.40     11872
weighted avg       0.39      0.41      0.40     11872

Accuracy (train) for KNN classifier: 59

Classifiers Accuracy Comparision



| Classifier | Accuracy [%] |
|-----------------|-----------------|
| Linear SVC | 37.5 |
| KNN | 45.2 |
| SVC | 46.6 |
| RFST | 53.2 |
| ADA | 25.6 |

In [14]:
# hyperparameter tuning for a Random Forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Create a Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize a Random Forest classifier with the best parameters
best_rf_clf = RandomForestClassifier(random_state=42, **best_params)

# Train the model on the entire training set
best_rf_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_rf_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7560646900269542


In [None]:
# hyperparameter tuning for SVC classifier 

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],              # Regularization parameter
    'kernel': ['linear', 'rbf'],    # Kernel type
    'gamma': ['scale', 'auto']      # Kernel coefficient for 'rbf' kernel
}

# Create an SVC classifier
svc_clf = SVC()

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=svc_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize an SVC classifier with the best parameters
best_svc_clf = SVC(**best_params)

# Train the model on the entire training set
best_svc_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_svc_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# hyperparameter tuning for KNN classifier 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],      # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weighting scheme for predictions
    'p': [1, 2]                        # Power parameter for the Minkowski distance metric
}

# Create a KNN classifier
knn_clf = KNeighborsClassifier()

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize a KNN classifier with the best parameters
best_knn_clf = KNeighborsClassifier(**best_params)

# Train the model on the entire training set
best_knn_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_knn_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

