In [1]:
# import the necessary packages
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

print("Loading libraries    ...")

Loading libraries    ...


In [2]:
# Split your training and test data

# Load the data from the file
# File is in the ../Data directory
# File name is "data_academic_ready.csv"

cleanedStudentsGradesDf = pd.read_csv("../data/data_academic_readyABCDF.csv")
cleanedStudentsGradesDf.head()

# Divide the X and y coordinates into two dataframes for training. gradingScale can be the labels dataframe
# and the rest can be the features dataframe
gradingScaleLabelsDf = cleanedStudentsGradesDf['GRADE']
gradingScaleLabelsDf.head()

featuresDf = cleanedStudentsGradesDf.drop(['GRADE', 'Unnamed: 0'], axis=1)
featuresDf.head()

# Check the shape of the features and labels dataframes
print("Features shape:", featuresDf.shape)
print("Labels shape:", gradingScaleLabelsDf.shape)

# Ensure that the features and labels dataframes have the same number of rows
if featuresDf.shape[0] != gradingScaleLabelsDf.shape[0]:
    raise ValueError("Number of rows in features and labels dataframes are not equal.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(featuresDf, gradingScaleLabelsDf, test_size=0.3)

Features shape: (20655, 86)
Labels shape: (20655,)


In [3]:
X_train = np.ascontiguousarray(X_train)
y_train = np.ascontiguousarray(y_train)
X_test = np.ascontiguousarray(X_test)
y_test = np.ascontiguousarray(y_test)

In [4]:
# Build the classifiers list
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [5]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 74.9% 
              precision    recall  f1-score   support

           A       0.90      0.89      0.90      1260
           B       0.71      0.73      0.72      1262
           C       0.59      0.62      0.61      1223
           D       0.72      0.61      0.66      1258
           F       0.83      0.90      0.86      1194

    accuracy                           0.75      6197
   macro avg       0.75      0.75      0.75      6197
weighted avg       0.75      0.75      0.75      6197

Accuracy (train) for KNN classifier: 71.5% 
              precision    recall  f1-score   support

           A       0.89      0.82      0.85      1260
           B       0.70      0.63      0.66      1262
           C       0.60      0.70      0.65      1223
           D       0.60      0.75      0.67      1258
           F       0.89      0.68      0.77      1194

    accuracy                           0.71      6197
   macro avg       0.74      0.71      0.72    

### Classifiers Accuracy Comparision


| Classifier | Accuracy [%] |
|-----------------|-----------------|
| RFST | 80.4 |
| SVC | 77.0 |
| Linear SVC | 74.9 |
| KNN | 71.5 |
| ADA | 60.2 |

In [6]:
# hyperparameter tuning for a Random Forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Create a Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize a Random Forest classifier with the best parameters
best_rf_clf = RandomForestClassifier(random_state=42, **best_params)

# Train the model on the entire training set
best_rf_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_rf_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8024850734226239


* Note hyperparamter tunning for Random forest improved the accuracy insignifiantly. 80.2% after tunning comparing with 80.4% initial run.
* Processing time: 6 minutes, 21 seconds on 14 Core CPUs, 16 GB memory;

In [7]:
# hyperparameter tuning for SVC classifier 

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],              # Regularization parameter
    'kernel': ['linear', 'rbf'],    # Kernel type
    'gamma': ['scale', 'auto']      # Kernel coefficient for 'rbf' kernel
}

# Create an SVC classifier
svc_clf = SVC()

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=svc_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize an SVC classifier with the best parameters
best_svc_clf = SVC(**best_params)

# Train the model on the entire training set
best_svc_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_svc_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.786509601420042


In [8]:
# hyperparameter tuning for KNN classifier 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],      # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weighting scheme for predictions
    'p': [1, 2]                        # Power parameter for the Minkowski distance metric
}

# Create a KNN classifier
knn_clf = KNeighborsClassifier()

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Initialize a KNN classifier with the best parameters
best_knn_clf = KNeighborsClassifier(**best_params)

# Train the model on the entire training set
best_knn_clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = best_knn_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.7621429724060029


In [9]:
# Re-testing Random Forest classifier

RFST = RandomForestClassifier(n_estimators=100)
RFST.fit(X_train, np.ravel(y_train))
y_pred = RFST.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (train) for RFST: %0.1f%% " % (accuracy * 100))
print(classification_report(y_test,y_pred))

Accuracy (train) for RFST: 79.6% 
              precision    recall  f1-score   support

           A       0.92      0.88      0.90      1260
           B       0.76      0.79      0.77      1262
           C       0.70      0.74      0.72      1223
           D       0.74      0.77      0.75      1258
           F       0.89      0.80      0.84      1194

    accuracy                           0.80      6197
   macro avg       0.80      0.80      0.80      6197
weighted avg       0.80      0.80      0.80      6197



In [10]:
# Convert the numpy array to a pandas DataFrame
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.DataFrame(y_test)

# print(X_test_df.info())
# print(X_test_df.head())
print(X_test_df.columns)

# Test the RFST model with a student data at location #50 in the test set
# print(f'Student Data: {X_test_df.iloc[50][X_test_df.iloc[50]!=0].keys()}')
print(f'Student Data: {X_test_df.iloc[50][X_test_df.iloc[50]!=0].index.tolist()}')
print(f'Grade: {y_test_df.iloc[50]}')

RangeIndex(start=0, stop=86, step=1)
Student Data: [4, 11, 60, 64, 69, 81, 85]
Grade: 0    A
Name: 50, dtype: object


In [11]:
test= X_test_df.iloc[50].values.reshape(-1, 1).T
proba = RFST.predict_proba(test)
classes = RFST.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()

Unnamed: 0,0
A,0.83
C,0.11
B,0.06
D,0.0
F,0.0


In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X_train, X_test, y_train, y_test are your training and testing data

# Train the Random Forest classifier
RFST = RandomForestClassifier(n_estimators=100)
RFST.fit(X_train, np.ravel(y_train))

# Select 10 random indices
random_indices = np.random.choice(X_test.shape[0], size=10, replace=False)

# Extract the corresponding records from the test dataset
if isinstance(X_test, pd.DataFrame):
    X_sample = X_test.iloc[random_indices]
else:
    X_sample = pd.DataFrame(X_test).iloc[random_indices]

y_sample_actual = y_test[random_indices]

# Predict the labels for the selected records
y_sample_pred = RFST.predict(X_sample)

# Create a DataFrame to display the results
results_df = pd.DataFrame(index=range(10))
results_df['Column Names'] = ['; '.join(map(str, X_sample.iloc[i][X_sample.iloc[i] != 0].index.tolist())) for i in range(10)]
results_df['Actual Value'] = y_sample_actual
results_df['Predicted Value'] = y_sample_pred

# Display the results
print("Randomly Selected Test Records and Predictions:")
print(results_df)


Randomly Selected Test Records and Predictions:
                               Column Names Actual Value Predicted Value
0         2; 11; 37; 60; 66; 71; 76; 81; 85            A               A
1                 5; 11; 37; 60; 67; 73; 81            C               B
2             2; 11; 25; 60; 65; 67; 72; 78            D               D
3                    11; 13; 61; 68; 74; 77            C               C
4                            34; 61; 68; 79            C               C
5                    11; 61; 66; 67; 72; 80            D               C
6  1; 9; 11; 12; 14; 48; 61; 66; 69; 74; 79            B               B
7                     3; 11; 60; 66; 72; 79            D               D
8      1; 8; 11; 14; 32; 61; 66; 70; 75; 79            B               B
9                 5; 11; 60; 66; 67; 72; 81            D               D


In [None]:
print(featuresDf.columns)

In [13]:
# Same procedure as above but with the original featuresD DataFrame and exporting the results to a csv file

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Assuming featuresD is the original DataFrame containing the feature names

# Train the Random Forest classifier
RFST = RandomForestClassifier(n_estimators=100)
RFST.fit(X_train, np.ravel(y_train))

# Select 10 random indices
random_indices = np.random.choice(X_test.shape[0], size=10, replace=False)

# Extract the corresponding records from the test dataset
if isinstance(X_test, pd.DataFrame):
    X_sample = X_test.iloc[random_indices]
else:
    X_sample = pd.DataFrame(X_test).iloc[random_indices]

y_sample_actual = y_test[random_indices]

# Predict the labels for the selected records
y_sample_pred = RFST.predict(X_sample)

# Get the column names from the original featuresD dataset
column_names = featuresDf.columns

# Create a DataFrame to display the results
results_df = pd.DataFrame(index=range(10))
results_df['Column Names'] = ['; '.join(column_names[X_sample.iloc[i][X_sample.iloc[i] != 0].index]) for i in range(10)]
results_df['Actual Value'] = y_sample_actual
results_df['Predicted Value'] = y_sample_pred

# Display the results
print("Randomly Selected Test Records and Predictions:")
print(results_df.head(10))

# export results to a csv file
results_df.to_csv("../Data/10_random_results_compared.csv", index=False)


Randomly Selected Test Records and Predictions:
                                        Column Names Actual Value  \
0  SCHOOL_TYPE_TECHNICAL; UNIVERSITY_UNIVERSIDAD ...            D   
1  STRATUM_Stratum 2; SISBEN_Level 2; CAR_Yes; SC...            D   
2  STRATUM_Stratum 4; CAR_Yes; SCHOOL_NAT_PRIVATE...            A   
3  STRATUM_Stratum 1; SISBEN_Level 1; MIC_OVEN_No...            F   
4  STRATUM_Stratum 3; CAR_Yes; SCHOOL_NAT_PRIVATE...            B   
5  STRATUM_Stratum 2; SISBEN_Level 3; MOBILE_No; ...            D   
6  STRATUM_Stratum 1; SISBEN_Level 1; MOBILE_No; ...            C   
7  SCHOOL_NAT_PRIVATE; QR_PRO_C; CR_PRO_D; CC_PRO...            B   
8  STRATUM_Stratum 4; CAR_Yes; SCHOOL_NAT_PUBLIC;...            B   
9  STRATUM_Stratum 3; CAR_Yes; UNIVERSITY_UNIVERS...            B   

  Predicted Value  
0               D  
1               D  
2               A  
3               F  
4               B  
5               C  
6               C  
7               B  
8           