In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV,RandomizedSearchCV
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.inspection import permutation_importance
import shap


Random Forest Model Data Preparation



In [2]:
# Load the data set from the CSV source file
data_set = pd.read_csv('DataSets/RawDataWithStudentsRemoved.csv')

print(data_set.head())

# Review the data set structure.
# A bare `data_set.dtypes` expression is only displayed when it is the last
# line of a cell, so it has to be printed explicitly to show up here.
print(data_set.dtypes)
print(data_set.shape)

# Remove rows with a null term GPA: TermGPA is the value the model predicts,
# so rows without it cannot be used. The second shape shows how many rows remain.
data_set = data_set.dropna(subset = ['TermGPA'])
print(data_set.shape)

   FakeIdentifier  Age  Gender PrimaryMilitaryAffiliation AcademicCareer  \
0               1   18  Female    No Military Affiliation  Undergraduate   
1               2   72    Male    No Military Affiliation            Law   
2               3   26    Male    No Military Affiliation       Graduate   
3               4   47  Female                    Veteran       Graduate   
4               5   45    Male    No Military Affiliation  Undergraduate   

                          College  NumberofClassesEnrolled  TermGPA  \
0              College of Science                        4     3.75   
1   James E Rogers College of Law                        3     4.00   
2          College of Engineering                        2      NaN   
3       Coll of Ag Life & Env Sci                        2     4.00   
4  College of Information Science                        2     2.00   

   CumulativeGPA  UnitsPassedincludedinGPA  UnitsPassednotincludedinGPA  \
0          3.636                         

In [None]:
# Sanity checks on two columns: the dtype pandas assigned to AcademicYear
# and the median of TermGPA, printed one per line.
print(
    data_set['AcademicYear'].dtype,
    data_set['TermGPA'].median(),
    sep="\n",
)


We split the data set into a training set and a test set (80/20), using the same random seed (698) throughout for reproducibility.
Before training the model, we decide which features to include and which to drop. We must also encode the remaining categorical features.

In [9]:
# Target (y) column and the share of rows held out for testing
target_column = 'TermGPA'
test_percent = 0.20

# Feature matrix: drop the identifier, the features excluded from this model,
# and the target itself.
x_features = data_set.drop(columns=['FakeIdentifier',
                                    'College',
                                    'PrimaryMilitaryAffiliation',
                                    'CumulativeGPA',
                                    'AcademicYear',
                                    'UnitsPassedincludedinGPA',
                                    target_column])

# Categorical features to one-hot encode.
# (PrimaryMilitaryAffiliation and College are dropped above, so they are not listed.)
columns_to_encode = ['Gender',
                     'AcademicCareer',
                     'UAFullTimePartTime',
                     'AcademicLevelEndofTerm',
                     'FirstGenerationFlag']

y_target = data_set[target_column]

# Split the data into an 80/20 train/test split, using the same random seed for consistency
x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size = test_percent, random_state = 698)

# Freeze the category levels observed in the training split.
# Encoding each split independently with drop_first=True is unsafe: if the test
# split lacks the first level seen in training, a different level is dropped as
# the baseline, and re-indexing to the training columns then silently encodes
# those rows as the wrong category.
train_levels = {col: pd.Categorical(x_train[col]).categories for col in columns_to_encode}


def encode_with_train_levels(frame):
    """One-hot encode `columns_to_encode` using the levels observed in `x_train`.

    Every split gets the same dummy columns and the same dropped baseline level
    (the first training level of each feature, to avoid multicollinearity).
    Values that were not seen in training become all-zero rows for that feature.
    The input frame is not modified.
    """
    typed = frame.assign(**{
        col: pd.Categorical(frame[col], categories=levels)
        for col, levels in train_levels.items()
    })
    return pd.get_dummies(typed,
                          columns=columns_to_encode,
                          drop_first=True,
                          dtype=int)


x_train_encoded = encode_with_train_levels(x_train)
x_test_encoded = encode_with_train_levels(x_test)

# Both splits now share the same columns by construction; the reindex is kept
# as a safety net so the test matrix always matches the training layout.
x_train_features_encoded_cols = x_train_encoded.columns
x_test_encoded = x_test_encoded.reindex(columns=x_train_features_encoded_cols, fill_value=0)

# Alternative not used here: sklearn's OneHotEncoder(handle_unknown='ignore'),
# fitted on x_train[columns_to_encode] and applied to both splits.



In [4]:
# Show the un-encoded feature frame, then the column layout of each encoded
# split so the two can be compared by eye.
for item in (x_features, x_train_encoded.columns, x_test_encoded.columns):
    print(item)

       Age  Gender AcademicCareer  NumberofClassesEnrolled  \
0       18  Female  Undergraduate                        4   
1       72    Male            Law                        3   
3       47  Female       Graduate                        2   
4       45    Male  Undergraduate                        2   
5       22  Female       Graduate                        3   
...    ...     ...            ...                      ...   
63392   28  Female  Undergraduate                        3   
63393   47  Female       Graduate                        3   
63395   43    Male       Graduate                        2   
63396   46  Female            Law                        3   
63397   46    Male  Undergraduate                        1   

       UnitsPassedincludedinGPA  UnitsPassednotincludedinGPA  \
0                             4                            1   
1                             3                            0   
3                             3                            0   

In [10]:
# Random forest regressor configuration

# Function used to score test-set predictions, and its label for printed output
metric = mean_squared_error
metric_name = "Mean Squared Error"

# Hyperparameters
rfr_estimators = 100
rfr_criterion = 'squared_error'  # options: "squared_error", "absolute_error", "friedman_mse", "poisson"
rfr_max_features = None  # None, sqrt or log2
rfr_min_sample_split = 4
rfr_depth = 15

# Collect the settings in one place, then build the (still unfitted) model
rfr_params = dict(
    n_estimators=rfr_estimators,
    criterion=rfr_criterion,
    max_features=rfr_max_features,
    min_samples_split=rfr_min_sample_split,
    max_depth=rfr_depth,
    random_state=698,
)
model = RandomForestRegressor(**rfr_params)



# Evaluation

We review error metrics from a k-fold cross-validation, as well as the performance of the model on our held-out test set.

We also produce both the Feature Importances and the Permutation Importances for the model.

In [11]:
# K-fold cross-validation on the training split: 5 shuffled folds, seeded for repeatability
kfold = KFold(n_splits=5, shuffle=True, random_state=698)

# One score per fold. The scorer is 'neg_mean_squared_error', so each value is
# the NEGATED mean squared error: values closer to zero are better.
cv_scores = cross_val_score(
    estimator=model,
    X=x_train_encoded,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=kfold,
)

# Per-fold scores followed by their average
cv_mean_score = cv_scores.mean()
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_mean_score)

Cross-validation scores: [-1.20649613 -1.22360274 -1.25968851 -1.27845234 -1.27908178]
Mean cross-validation score: -1.2494643028908798


In [12]:
# Fit on the encoded training split, then predict the held-out test split
y_pred = model.fit(x_train_encoded, y_train).predict(x_test_encoded)

# Compare the predictions with the actual test targets
score = metric(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"{metric_name} on test set: {score:.4f}")
print(f"r_squared on test set: {r_squared:.4f}")

Mean Squared Error on test set: 1.2432
r_squared on test set: 0.0903


In [13]:
# Built-in importances of the fitted forest, in the column order of x_train_encoded
feature_importances = model.feature_importances_
print(feature_importances)

# Pair every importance with its column name, then rank from most to least important
feature_names = x_train_encoded.columns
importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

# Lift the pandas row limit so the full ranking is printed.
# Note: this changes the display option for the rest of the session.
pd.set_option('display.max_rows', None)
print(feature_importance_df)

[0.2782539  0.11809432 0.01962154 0.0463024  0.00248165 0.00128341
 0.30342902 0.02235039 0.0029224  0.03660027 0.00098871 0.10615354
 0.01660767 0.0449108 ]
                             Feature  Importance
6       AcademicCareer_Undergraduate    0.303429
0                                Age    0.278254
1            NumberofClassesEnrolled    0.118094
11     AcademicLevelEndofTerm_Senior    0.106154
3                        Gender_Male    0.046302
13             FirstGenerationFlag_Y    0.044911
9      AcademicLevelEndofTerm_Junior    0.036600
7               UAFullTimePartTime_P    0.022350
2        UnitsPassednotincludedinGPA    0.019622
12  AcademicLevelEndofTerm_Sophomore    0.016608
8    AcademicLevelEndofTerm_Graduate    0.002922
4                     Gender_Unknown    0.002482
5                 AcademicCareer_Law    0.001283
10    AcademicLevelEndofTerm_Masters    0.000989


In [None]:
# Permutation importance, measured on the held-out test split with the same
# negated-MSE scorer used for cross-validation (10 shuffles per feature).
result = permutation_importance(
    model,
    x_test_encoded,
    y_test,
    scoring='neg_mean_squared_error',
    n_repeats=10,
    random_state=698,
    n_jobs=-1,
)

# Order the features by their mean importance
mean_importances = result.importances_mean
sorted_idx = mean_importances.argsort()
ordered_values = mean_importances[sorted_idx]
ordered_names = x_test_encoded.columns[sorted_idx]

# Label each value with its feature name and list the largest first
permutation_importances = pd.Series(ordered_values, index=ordered_names).sort_values(ascending=False)
print("\nPermutation Importance:")
print(permutation_importances)

In [None]:
# SHAP values for the fitted model, computed for every row of the encoded test split
explainer = shap.TreeExplainer(model=model)
shap_values = explainer.shap_values(X=x_test_encoded)

In [None]:
# Draw the SHAP summary plot for the test split (top 25 features at most).
# show=False stops SHAP from displaying the figure itself; without it the title
# below would be applied to a new, empty figure instead of the SHAP plot.
shap.summary_plot(shap_values, x_test_encoded, max_display = 25, show = False)
plt.title("SHAP Summary Plot")
plt.show()

In [None]:
# Pairwise correlations among the encoded training features
x_feature_correlation = x_train_encoded.corr()

# The same correlation matrix with the target appended as an extra column
train_with_target = pd.concat([x_train_encoded, y_train], axis=1)
target_corr = train_with_target.corr()

# Uncomment to inspect the full matrix:
#print(target_corr)
# Recorded finding: -0.15 correlation between first gen and term GPA



In [None]:
# Scale TermGPA for training (unfinished experiment; the code below is disabled)

#scaler = StandardScaler()
#y_train_reshape = y_train.values.reshape(-1, 1)

#y_train_zscore = scaler.fit_transform(y_train_reshape)

#y_test_zscore = 