In [3]:
# import the necessary packages
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

In [4]:
# Split your training and test data

# Load the data from the file
# File is in the ../Data directory
# Fine name is "cleaned_students.csv"

cleanedStudentsGradesDf = pd.read_csv("../data/cleaned_students.csv")
cleanedStudentsGradesDf.head()

# Divide the X and y coordinates into two dataframes for training. gradingScale can be the labels dataframe
# and the rest can be the features dataframe
gradingScaleLabelsDf = cleanedStudentsGradesDf['grading_scale']
gradingScaleLabelsDf.head()

featuresDf = cleanedStudentsGradesDf.drop(['grading_scale', 'Unnamed: 0'], axis=1)
featuresDf.head()

# Check the shape of the features and labels dataframes
print("Features shape:", featuresDf.shape)
print("Labels shape:", gradingScaleLabelsDf.shape)

# Ensure that the features and labels dataframes have the same number of rows
if featuresDf.shape[0] != gradingScaleLabelsDf.shape[0]:
    raise ValueError("Number of rows in features and labels dataframes are not equal.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(featuresDf, gradingScaleLabelsDf, test_size=0.3)

Features shape: (32560, 52)
Labels shape: (32560,)


In [17]:
X_train = np.ascontiguousarray(X_train)
y_train = np.ascontiguousarray(y_train)
X_test = np.ascontiguousarray(X_test)
y_test = np.ascontiguousarray(y_test)

In [22]:
# Build the classifiers list
C = 10
# Create different classifiers.
classifiers = {
    #'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    #'KNN classifier': KNeighborsClassifier(C),
    #'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [23]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for RFST: 28.4% 
              precision    recall  f1-score   support

           A       0.20      0.18      0.19       859
          A+       0.29      0.27      0.28       903
          A-       0.17      0.11      0.14       918
           B       0.20      0.17      0.19       892
          B+       0.26      0.25      0.25       862
          B-       0.30      0.31      0.30       869
           C       0.29      0.32      0.30       876
          C+       0.30      0.34      0.32       897
          C-       0.39      0.48      0.43       929
           D       0.28      0.27      0.28       872
           F       0.33      0.42      0.37       891

    accuracy                           0.28      9768
   macro avg       0.27      0.28      0.28      9768
weighted avg       0.27      0.28      0.28      9768

Accuracy (train) for ADA: 16.0% 
              precision    recall  f1-score   support

           A       0.21      0.21      0.21       859
          A

### Accuracy report:

| Solver | Accuracy [%] |
|-----------------|-----------------|
| Linear SVC | 17.5 |
| KNN | 24 |
| SVC | 24 |
| RFST | 28 |
| ADA | 16 |
