In [38]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV,RandomizedSearchCV
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.inspection import permutation_importance

Random Forest Model Data Preparation



In [None]:
# Load the data set from the CSV source file.
data_set = pd.read_csv('DataSets/RawDataWithStudentsRemoved.csv')

print(data_set.head())

# Review the data set structure. A bare expression in the middle of a cell is
# evaluated and discarded, so these have to be printed to appear in the output.
print(data_set.dtypes)
print(data_set.shape)

# Remove rows whose TermGPA is null from the data set.
data_set = data_set.dropna(subset=['TermGPA'])

In [14]:
# Check which dtype pandas assigned to the AcademicYear column.
print(data_set.dtypes['AcademicYear'])

int64


We split the data set into a training set and a test set (80/20), using a fixed random seed (698) so that the split is reproducible.

In [30]:
# Target (y) column and the fraction of rows held out for testing.
target_column = 'TermGPA'
test_percent = 0.20

# Predictors: every column except the row identifier and the target.
x_features = data_set.drop(['FakeIdentifier', target_column], axis=1)
y_target = data_set[target_column]

# Categorical features that get one-hot encoded.
columns_to_encode = ['Gender', 'PrimaryMilitaryAffiliation', 'AcademicCareer', 'College',
                     'UAFullTimePartTime', 'AcademicLevelEndofTerm', 'FirstGenerationFlag']

# Split the data 80/20 into train and test sets with a fixed random seed for consistency.
x_train, x_test, y_train, y_test = train_test_split(
    x_features, y_target, test_size=test_percent, random_state=698)


def _pin_to_train_levels(frame, levels_by_column):
    """Return a copy of ``frame`` whose listed columns are categoricals with fixed levels.

    Values that are not among the given levels become missing, which
    ``pd.get_dummies`` encodes as all zeros for that feature.
    """
    pinned = {col: pd.Categorical(frame[col], categories=levels)
              for col, levels in levels_by_column.items()}
    return frame.assign(**pinned)


# Learn the category levels from the training split only. Encoding train and test
# independently with drop_first=True is unsafe: if the test split happens to lack the
# level that was dropped for train, a *different* level is dropped for test and the
# later reindex silently turns those rows into the baseline category.
train_levels = {col: pd.Categorical(x_train[col]).categories for col in columns_to_encode}

# Encode categorical features, dropping one level per feature to avoid multicollinearity.
# Both splits share the same levels, so the same baseline is dropped in each.
x_train_encoded = pd.get_dummies(_pin_to_train_levels(x_train, train_levels),
                                 columns=columns_to_encode,
                                 drop_first=True,
                                 dtype=int)
x_test_encoded = pd.get_dummies(_pin_to_train_levels(x_test, train_levels),
                                columns=columns_to_encode,
                                drop_first=True,
                                dtype=int)

# Safety net: force the test matrix to carry exactly the training columns, in the same order.
x_train_features_encoded_cols = x_train_encoded.columns
x_test_encoded = x_test_encoded.reindex(columns=x_train_features_encoded_cols, fill_value=0)

# Disabled alternative: scikit-learn's OneHotEncoder, tried as a way to help with
# features that go missing after encoding. The encoder is fitted on the training
# split only and then applied to both splits.

#encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit the encoder on training data
#encoder.fit(x_train[columns_to_encode])

# Transform both training and test data
#onehot_encoded_train = encoder.transform(x_train[columns_to_encode])
#onehot_encoded_test = encoder.transform(x_test[columns_to_encode])

# Convert to DataFrame for better readability (optional)
#encoded_train_df = pd.DataFrame(onehot_encoded_train, columns=encoder.get_feature_names_out(columns_to_encode), index=x_train.index)
#encoded_test_df = pd.DataFrame(onehot_encoded_test, columns=encoder.get_feature_names_out(columns_to_encode), index=x_test.index)



In [31]:
# Show the raw predictors, then the encoded column sets of both splits so they
# can be compared by eye.
for shown in (x_features, x_train_encoded.columns, x_test_encoded.columns):
    print(shown)

       Age  Gender PrimaryMilitaryAffiliation AcademicCareer  \
0       18  Female    No Military Affiliation  Undergraduate   
1       72    Male    No Military Affiliation            Law   
3       47  Female                    Veteran       Graduate   
4       45    Male    No Military Affiliation  Undergraduate   
5       22  Female    No Military Affiliation       Graduate   
...    ...     ...                        ...            ...   
63392   28  Female              Guard Reserve  Undergraduate   
63393   47  Female    No Military Affiliation       Graduate   
63395   43    Male    No Military Affiliation       Graduate   
63396   46  Female    No Military Affiliation            Law   
63397   46    Male                    Veteran  Undergraduate   

                              College  NumberofClassesEnrolled  CumulativeGPA  \
0                  College of Science                        4          3.636   
1       James E Rogers College of Law                        3       

In [19]:
# Random forest regressor: evaluation metric, hyperparameters and construction.

# Metric applied to the test-set predictions, plus the label used when printing it.
metric_name = "Mean Squared Error"
metric = mean_squared_error

# Hyperparameters.
rfr_estimators = 100
rfr_criterion = 'squared_error'  # one of: "squared_error", "absolute_error", "friedman_mse", "poisson"
rfr_depth = 15
rfr_max_features = None  # None, sqrt or log2
# NOTE(review): defined but currently NOT passed to the model below (the
# min_samples_split argument is disabled), so this value has no effect.
rfr_min_sample_split = 4

model = RandomForestRegressor(
    random_state=698,
    max_depth=rfr_depth,
    # min_samples_split=rfr_min_sample_split,
    max_features=rfr_max_features,
    criterion=rfr_criterion,
    n_estimators=rfr_estimators,
)



In [21]:
# 5-fold cross-validation on the training split (shuffled, fixed seed).
kfold = KFold(shuffle=True, random_state=698, n_splits=5)

# One score per fold. The scorer is *negated* mean squared error, so these are
# not accuracies: every value is negative.
cv_scores = cross_val_score(
    model,
    x_train_encoded,
    y_train,
    scoring='neg_mean_squared_error',
    cv=kfold,
)

# Per-fold scores, followed by their average across the five folds.
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

Cross-validation scores: [-0.23304058 -0.23291789 -0.23475453 -0.23216677 -0.23084918]
Mean cross-validation score: -0.23274579078541305


In [32]:
# Train on the encoded training split, then predict the held-out test rows
# (fit returns the fitted estimator, so the two calls can be chained).
y_pred = model.fit(x_train_encoded, y_train).predict(x_test_encoded)

# Score the predictions against the true test targets with the configured metric.
score = metric(y_test, y_pred)
print(f"{metric_name} on test set: {score:.4f}")

Mean Squared Error on test set: 0.2341


In [None]:
# Feature importances reported by the fitted model.
feature_importances = model.feature_importances_
print(feature_importances)

# Put the importances in a data frame next to their encoded feature names.
feature_names = x_train_encoded.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Show every row, but only for this print. Using pd.set_option here would change
# the display setting for the rest of the session, so any later cell that prints
# a large frame would dump all of its rows.
with pd.option_context('display.max_rows', None):
    print(feature_importance_df)

In [43]:
# Permutation importance of each encoded feature, measured on the test split
# with negated MSE as the score (10 shuffles per feature, fixed seed).
result = permutation_importance(model, x_test_encoded, y_test, scoring='neg_mean_squared_error', n_repeats=10, random_state=698, n_jobs=-1)
sorted_idx = result.importances_mean.argsort()

# Mean importance per feature, listed from most to least important.
permutation_importances = pd.Series(result.importances_mean[sorted_idx], index=x_test_encoded.columns[sorted_idx]).sort_values(ascending=False)
print("\nPermutation Importance:")

# Request the untruncated listing here rather than relying on a display option
# that some other cell may or may not have set before this one runs.
with pd.option_context('display.max_rows', None):
    print(permutation_importances)


Permutation Importance:
CumulativeGPA                                              1.438900e+00
UnitsPassedincludedinGPA                                   4.231591e-01
NumberofClassesEnrolled                                    1.919041e-02
Age                                                        6.532053e-03
AcademicLevelEndofTerm_Senior                              4.415266e-03
College_Eller College of Management                        3.354824e-03
AcademicYear                                               2.309479e-03
College_Undergraduate Education                            2.253426e-03
College_Coll of Ag Life & Env Sci                          1.086685e-03
College_College of Humanities                              7.304979e-04
UAFullTimePartTime_P                                       5.314346e-04
College_College of Nursing                                 4.820399e-04
College_College of Engineering                             4.585590e-04
College_College of Science             

In [None]:
# scale the termGPA for training

#scaler = StandardScaler()
#y_train_reshape = y_train.values.reshape(-1, 1)

#y_train_zscore = scaler.fit_transform(y_train_reshape)

#y_test_zscore = 