In [9]:
# import the necessary packages
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

In [10]:
# Split your training and test data

# Load the data from the file
# File is in the ../Data directory
# File name is "data_academic_ready.csv"

cleanedStudentsGradesDf = pd.read_csv("../data/data_academic_ready.csv")
cleanedStudentsGradesDf.head()

# Divide the X and y coordinates into two dataframes for training. gradingScale can be the labels dataframe
# and the rest can be the features dataframe
gradingScaleLabelsDf = cleanedStudentsGradesDf['GRADE']
gradingScaleLabelsDf.head()

featuresDf = cleanedStudentsGradesDf.drop(['GRADE', 'Unnamed: 0'], axis=1)
featuresDf.head()

# Check the shape of the features and labels dataframes
print("Features shape:", featuresDf.shape)
print("Labels shape:", gradingScaleLabelsDf.shape)

# Ensure that the features and labels dataframes have the same number of rows
if featuresDf.shape[0] != gradingScaleLabelsDf.shape[0]:
    raise ValueError("Number of rows in features and labels dataframes are not equal.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(featuresDf, gradingScaleLabelsDf, test_size=0.3)

Features shape: (39572, 85)
Labels shape: (39572,)


In [11]:
X_train = np.ascontiguousarray(X_train)
y_train = np.ascontiguousarray(y_train)
X_test = np.ascontiguousarray(X_test)
y_test = np.ascontiguousarray(y_test)

In [12]:
# Build the classifiers list
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [13]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 41.1% 
              precision    recall  f1-score   support

           A       0.57      0.69      0.62       894
          A+       0.89      0.91      0.90       910
          A-       0.43      0.40      0.41       937
           B       0.32      0.30      0.31       908
          B+       0.30      0.35      0.32       875
          B-       0.26      0.25      0.26       920
           C       0.23      0.20      0.21       928
          C+       0.25      0.24      0.25       903
          C-       0.28      0.21      0.24       940
           D       0.31      0.19      0.23       922
          D+       0.31      0.25      0.27       926
          D-       0.34      0.46      0.39       907
           F       0.62      0.93      0.74       902

    accuracy                           0.41     11872
   macro avg       0.39      0.41      0.40     11872
weighted avg       0.39      0.41      0.40     11872

Accuracy (train) for KNN classifier: 59

### Classifiers Accuracy Comparision


| Classifier | Accuracy [%] |
|-----------------|-----------------|
| RFST | 75.3 |
| KNN | 59.5 |
| SVC | 59.5 |
| Linear SVC | 41.1 |
| ADA | 27.5 |

In [14]:
# hyperparameter tuning for a Random Forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Create a Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize a Random Forest classifier with the best parameters
best_rf_clf = RandomForestClassifier(random_state=42, **best_params)

# Train the model on the entire training set
best_rf_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_rf_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7560646900269542


* Note hyperparamter tunning for Random forest improved the accuracy insignifiantly. 75.6% after tunning comparing with 75.3% initial run.
* Processing time: 12 minutes, 17 seconds on 14 Core CPUs, 16 GB memory;

In [None]:
# hyperparameter tuning for SVC classifier 

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],              # Regularization parameter
    'kernel': ['linear', 'rbf'],    # Kernel type
    'gamma': ['scale', 'auto']      # Kernel coefficient for 'rbf' kernel
}

# Create an SVC classifier
svc_clf = SVC()

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=svc_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize an SVC classifier with the best parameters
best_svc_clf = SVC(**best_params)

# Train the model on the entire training set
best_svc_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_svc_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# hyperparameter tuning for KNN classifier 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],      # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weighting scheme for predictions
    'p': [1, 2]                        # Power parameter for the Minkowski distance metric
}

# Create a KNN classifier
knn_clf = KNeighborsClassifier()

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize a KNN classifier with the best parameters
best_knn_clf = KNeighborsClassifier(**best_params)

# Train the model on the entire training set
best_knn_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_knn_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



In [17]:
# Re-testing Random Forest classifier

RFST = RandomForestClassifier(n_estimators=100)
RFST.fit(X_train, np.ravel(y_train))
y_pred = RFST.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (train) for RFST: %0.1f%% " % (accuracy * 100))
print(classification_report(y_test,y_pred))

Accuracy (train) for RFST: 75.1% 
              precision    recall  f1-score   support

           A       0.70      0.78      0.74       894
          A+       0.91      0.86      0.89       910
          A-       0.73      0.64      0.68       937
           B       0.71      0.70      0.70       908
          B+       0.71      0.74      0.73       875
          B-       0.74      0.77      0.75       920
           C       0.72      0.70      0.71       928
          C+       0.74      0.74      0.74       903
          C-       0.74      0.80      0.77       940
           D       0.74      0.74      0.74       922
          D+       0.75      0.81      0.78       926
          D-       0.75      0.81      0.78       907
           F       0.85      0.67      0.75       902

    accuracy                           0.75     11872
   macro avg       0.75      0.75      0.75     11872
weighted avg       0.75      0.75      0.75     11872



In [27]:
# Convert the numpy array to a pandas DataFrame
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.DataFrame(y_test)

# print(X_test_df.info())
# print(X_test_df.head())
print(X_test_df.columns)

# Test the RFST model with a student data at location #50 in the test set
# print(f'Student Data: {X_test_df.iloc[50][X_test_df.iloc[50]!=0].keys()}')
print(f'Student Data: {X_test_df.iloc[50][X_test_df.iloc[50]!=0].index.tolist()}')
print(f'Grade: {y_test_df.iloc[50]}')

RangeIndex(start=0, stop=85, step=1)
Student Data: [0, 6, 9, 12, 14, 65, 75, 78]
Grade: 0    A
Name: 50, dtype: object


In [22]:
test= X_test_df.iloc[50].values.reshape(-1, 1).T
proba = RFST.predict_proba(test)
classes = RFST.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()

Unnamed: 0,0
A,0.83
A+,0.07
A-,0.07
B+,0.02
D-,0.01


In [32]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X_train, X_test, y_train, y_test are your training and testing data

# Train the Random Forest classifier
RFST = RandomForestClassifier(n_estimators=100)
RFST.fit(X_train, np.ravel(y_train))

# Select 10 random indices
random_indices = np.random.choice(X_test.shape[0], size=10, replace=False)

# Extract the corresponding records from the test dataset
if isinstance(X_test, pd.DataFrame):
    X_sample = X_test.iloc[random_indices]
else:
    X_sample = pd.DataFrame(X_test).iloc[random_indices]

y_sample_actual = y_test[random_indices]

# Predict the labels for the selected records
y_sample_pred = RFST.predict(X_sample)

# Create a DataFrame to display the results
results_df = pd.DataFrame(index=range(10))
results_df['Column Names'] = ['; '.join(map(str, X_sample.iloc[i][X_sample.iloc[i] != 0].index.tolist())) for i in range(10)]
results_df['Actual Value'] = y_sample_actual
results_df['Predicted Value'] = y_sample_pred

# Display the results
print("Randomly Selected Test Records and Predictions:")
print(results_df)


Randomly Selected Test Records and Predictions:
                           Column Names Actual Value Predicted Value
0                 1; 18; 63; 68; 73; 79           B+              B+
1                     2; 11; 60; 72; 76           C-              C-
2       0; 6; 9; 10; 12; 24; 62; 71; 76            D               D
3     2; 11; 58; 60; 65; 70; 74; 80; 81           A+              A+
4  1; 7; 14; 43; 60; 65; 70; 74; 76; 81            A               A
5                 2; 11; 60; 67; 71; 76            D              D+
6         3; 11; 13; 24; 61; 66; 73; 77            D               D
7      1; 7; 11; 51; 64; 66; 71; 78; 84            B               B
8          1; 7; 10; 14; 24; 64; 72; 76           C+              C+
9         4; 11; 50; 60; 61; 69; 74; 79           A-              A-


In [33]:
print(featuresDf.columns)

Index(['STRATUM_Stratum 1', 'STRATUM_Stratum 2', 'STRATUM_Stratum 3',
       'STRATUM_Stratum 4', 'STRATUM_Stratum 5', 'STRATUM_Stratum 6',
       'SISBEN_Level 1', 'SISBEN_Level 2', 'SISBEN_Level 3', 'INTERNET_No',
       'MIC_OVEN_No', 'CAR_Yes', 'MOBILE_No', 'SCHOOL_TYPE_TECHNICAL',
       'SCHOOL_TYPE_TECHNICAL/ACADEMIC',
       'UNIVERSITY_CORPORACION UNIVERSIDAD DE LA COSTA, CUC-BARRANQUILLA',
       'UNIVERSITY_CORPORACION UNIVERSITARIA DE INVESTIGACION Y DESARROLLO -"UDI"-BUCARAMANGA',
       'UNIVERSITY_CORPORACION UNIVERSITARIA DEL HUILA-CORHUILA-NEIVA',
       'UNIVERSITY_CORPORACION UNIVERSITARIA MINUTO DE DIOS -UNIMINUTO-BOGOTÁ D.C.',
       'UNIVERSITY_ESCUELA COLOMBIANA DE INGENIERIA"JULIO GARAVITO"-BOGOTÁ D.C.',
       'UNIVERSITY_FUNDACION UNIVERSIDAD DE AMERICA-BOGOTÁ D.C.',
       'UNIVERSITY_FUNDACION UNIVERSIDAD DE BOGOTA"JORGE TADEO LOZANO"-BOGOTÁ D.C.',
       'UNIVERSITY_FUNDACION UNIVERSITARIA LOS LIBERTADORES-BOGOTÁ D.C.',
       'UNIVERSITY_FUNDACION UNIVERSI

In [37]:
# Same procedure as above but with the original featuresD DataFrame and exporting the results to a csv file

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Assuming featuresD is the original DataFrame containing the feature names

# Train the Random Forest classifier
RFST = RandomForestClassifier(n_estimators=100)
RFST.fit(X_train, np.ravel(y_train))

# Select 10 random indices
random_indices = np.random.choice(X_test.shape[0], size=10, replace=False)

# Extract the corresponding records from the test dataset
if isinstance(X_test, pd.DataFrame):
    X_sample = X_test.iloc[random_indices]
else:
    X_sample = pd.DataFrame(X_test).iloc[random_indices]

y_sample_actual = y_test[random_indices]

# Predict the labels for the selected records
y_sample_pred = RFST.predict(X_sample)

# Get the column names from the original featuresD dataset
column_names = featuresDf.columns

# Create a DataFrame to display the results
results_df = pd.DataFrame(index=range(10))
results_df['Column Names'] = ['; '.join(column_names[X_sample.iloc[i][X_sample.iloc[i] != 0].index]) for i in range(10)]
results_df['Actual Value'] = y_sample_actual
results_df['Predicted Value'] = y_sample_pred

# Display the results
print("Randomly Selected Test Records and Predictions:")
print(results_df.head(10))

# export results to a csv file
results_df.to_csv("../Data/10_random_results_compared.csv", index=False)


Randomly Selected Test Records and Predictions:
                                        Column Names Actual Value  \
0  STRATUM_Stratum 3; CAR_Yes; UNIVERSITY_ESCUELA...           A-   
1  CAR_Yes; UNIVERSITY_PONTIFICIA UNIVERSIDAD JAV...           A-   
2  STRATUM_Stratum 2; SISBEN_Level 2; SCHOOL_TYPE...           D+   
3  STRATUM_Stratum 3; CAR_Yes; UNIVERSITY_Other; ...            C   
4  STRATUM_Stratum 3; MIC_OVEN_No; SCHOOL_TYPE_TE...            A   
5  UNIVERSITY_Other; QR_PRO_C; CR_PRO_F; CC_PRO_F...           D+   
6  STRATUM_Stratum 3; UNIVERSITY_Other; SCHOOL_NA...           A+   
7  STRATUM_Stratum 4; CAR_Yes; UNIVERSITY_UNIVERS...           A+   
8  STRATUM_Stratum 5; CAR_Yes; SCHOOL_NAT_PRIVATE...            A   
9  STRATUM_Stratum 1; SISBEN_Level 1; INTERNET_No...            C   

  Predicted Value  
0               B  
1              A-  
2              D+  
3              B-  
4               A  
5              D+  
6              A+  
7              A+  
8           