In [1]:
import pandas as pd
import numpy as np
from path import Path
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Loading data
# The CSV path is relative, so it is resolved against the notebook's working directory.
file_path = Path("Sample_student_success_data_for_testing.csv")
df_success = pd.read_csv(file_path)
# Preview the first rows to confirm the file parsed as expected.
df_success.head()


Unnamed: 0,DBN,School Name,Year,Total Enrollment,Grade 3K,Grade PK (Half Day & Full Day),Grade K,Grade 1,Grade 2,Grade 3,...,Regents w/o Advanced #,Regents w/o Advanced % of cohort,Regents w/o Advanced % of grads,Local #,Local % of cohort,Local % of grads,Still Enrolled #,Still Enrolled % of cohort,Dropped Out #,Dropped Out % of cohort
0,01M015,P.S. 015 Roberto Clemente,2021-22,179,0,15,30,26,24,22,...,23.0,63.9,92.0,2.0,5.6,8.0,3.0,8.3,7.0,19.4
1,01M019,P.S. 019 Asher Levy,2021-22,176,9,7,22,20,22,23,...,23.0,63.9,95.8,1.0,2.8,4.2,4.0,11.1,7.0,19.4
2,01M020,P.S. 020 Anna Silver,2021-22,351,0,42,52,50,42,48,...,44.0,56.4,62.9,6.0,7.7,8.6,1.0,1.3,7.0,9.0
3,01M034,P.S. 034 Franklin D. Roosevelt,2021-22,258,10,9,15,21,25,27,...,44.0,56.4,62.9,6.0,7.7,8.6,1.0,1.3,7.0,9.0
4,01M063,The STAR Academy - P.S.63,2021-22,186,15,10,23,34,27,30,...,103.0,98.1,100.0,0.0,0.0,0.0,0.0,0.0,2.0,1.9


In [3]:
# Discard every row that has at least one missing value.
# Rebinding (rather than mutating in place) keeps the cell's data lineage explicit.
df_success = df_success.dropna()

In [4]:
# Build the feature matrix: every column except the targets and the
# identifier / descriptive columns listed below.
columns_to_drop = [
    # target columns
    "Dropped Out #",
    "Dropped Out % of cohort",
    # unique identifiers and descriptive fields
    "DBN",
    "School Name",
    "Year",
    "Demographic Category",
    "Demographic Variable",
    "Cohort",
]

# drop() returns a new frame, so df_success itself is left untouched.
# NOTE(review): the remaining outcome columns (e.g. "Total Grads % of cohort",
# "Total Regents % of cohort") dominate the feature importances reported later
# in this notebook -- confirm they are not leaking the dropout target.
X = df_success.drop(columns=columns_to_drop)
X.head()


Unnamed: 0,Total Enrollment,Grade 3K,Grade PK (Half Day & Full Day),Grade K,Grade 1,Grade 2,Grade 3,Grade 4,Grade 5,Grade 6,...,Advanced Regents % of cohort,Advanced Regents % of grads,Regents w/o Advanced #,Regents w/o Advanced % of cohort,Regents w/o Advanced % of grads,Local #,Local % of cohort,Local % of grads,Still Enrolled #,Still Enrolled % of cohort
0,179,0,15,30,26,24,22,33,29,0,...,0.0,0.0,23.0,63.9,92.0,2.0,5.6,8.0,3.0,8.3
1,176,9,7,22,20,22,23,34,39,0,...,0.0,0.0,23.0,63.9,95.8,1.0,2.8,4.2,4.0,11.1
2,351,0,42,52,50,42,48,51,66,0,...,25.6,28.6,44.0,56.4,62.9,6.0,7.7,8.6,1.0,1.3
3,258,10,9,15,21,25,27,17,21,28,...,25.6,28.6,44.0,56.4,62.9,6.0,7.7,8.6,1.0,1.3
4,186,15,10,23,34,27,30,26,21,0,...,0.0,0.0,103.0,98.1,100.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Checking datatypes to make sure there are no strings
# (verbose=True prints the full per-column summary rather than a shortened one).
X.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97 entries, 0 to 99
Data columns (total 57 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Total Enrollment                  97 non-null     int64  
 1   Grade 3K                          97 non-null     int64  
 2   Grade PK (Half Day & Full Day)    97 non-null     int64  
 3   Grade K                           97 non-null     int64  
 4   Grade 1                           97 non-null     int64  
 5   Grade 2                           97 non-null     int64  
 6   Grade 3                           97 non-null     int64  
 7   Grade 4                           97 non-null     int64  
 8   Grade 5                           97 non-null     int64  
 9   Grade 6                           97 non-null     int64  
 10  Grade 7                           97 non-null     int64  
 11  Grade 8                           97 non-null     int64  
 12  Grade 9   

In [6]:
# Define the target set: the dropout rate, as a percentage of the cohort.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns
# the same 1-D NumPy array without the deprecation warning.
y = df_success["Dropped Out % of cohort"].to_numpy()
# Peek at the first few target values.
y[:5]


array([19.4, 19.4,  9. ,  9. ,  1.9])

In [7]:
# Hold out a test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Standardize the features. The scaler is fitted on the training rows only,
# so no statistics from the test rows influence the transformation.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)


In [8]:
# Sanity check: report the dimensions of each train/test piece.
for label, split in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(label, split.shape)

Training Features Shape: (72, 57)
Training Labels Shape: (72,)
Testing Features Shape: (25, 57)
Testing Labels Shape: (25,)


In [9]:
# Create a random forest regressor (not a classifier, since our target is a continuous variable).
# RandomForestRegressor is imported with the other dependencies at the top of the notebook.
# The forest uses 1000 decision trees; random_state is fixed so repeated runs
# build the same forest.
rf_model = RandomForestRegressor(n_estimators=1000, random_state=42)
# Train the model on the standardized training data (fit returns the fitted estimator).
rf_model = rf_model.fit(X_train_scaled, y_train)

In [10]:
# Making predictions using the testing data.
# The test features are passed in their standardized form, matching what the model was trained on.
predictions = rf_model.predict(X_test_scaled)


In [11]:
# Evaluate the regressor on the test split (continuous target).

# Element-wise absolute difference between predicted and actual values.
errors = np.abs(predictions - y_test)
# Report the mean absolute error (MAE), rounded to two decimals.
print('Mean Absolute Error:', round(errors.mean(), 2))


Mean Absolute Error: 2.86


In [12]:
# Actual target values of the test split, shown for comparison with the errors.
y_test

array([ 3.4,  0.7,  3. ,  0.7, 20. , 13. ,  3. ,  6.8, 14.5,  0.7,  4.5,
        4.2, 12.8,  1.3,  1.9,  0. ,  0. ,  0. , 19.4,  2.9, 29.9,  9.8,
        0.7,  9.4,  6.8])

In [13]:
# Absolute error of each test prediction, in the same order as y_test.
errors

array([0.1479, 0.5301, 0.0423, 0.5157, 3.1423, 1.3781, 1.2061, 6.7756,
       9.4537, 0.5441, 1.0768, 2.0762, 3.3451, 3.717 , 0.6775, 2.8487,
       0.877 , 2.8935, 2.3668, 2.594 , 7.6112, 3.1985, 0.5744, 4.6831,
       9.3212])

In [14]:
# Regression error metrics on the test split.
# `metrics` (sklearn.metrics) is imported with the other dependencies at the
# top of the notebook. The MSE is computed once and reused for the RMSE
# instead of being recomputed.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 2.863875999999994
Mean Squared Error: 15.488187175599828
Root Mean Squared Error: 3.9355034208598814


In [15]:
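# Other ways of quantifying accuracy

# NOTE(review): this block is disabled. If it is re-enabled, be aware that
# y_test contains zeros, so `errors / y_test` divides by zero and the
# resulting MAPE / accuracy is not a finite number.
# # Calculate mean absolute percentage error (MAPE)
# mape = 100 * (errors / y_test)
# # Calculate and display accuracy
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 2), '%.')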

In [16]:
# Calculate feature importance in the Random Forest model.
# The fitted model exposes one importance value per feature column.
importances = rf_model.feature_importances_
importances


array([0.00207739, 0.00721975, 0.0012014 , 0.00036227, 0.00040136,
       0.00061277, 0.00045578, 0.00142122, 0.00061723, 0.00041737,
       0.00035207, 0.000476  , 0.00148871, 0.00126708, 0.00077753,
       0.0007066 , 0.00216071, 0.00445592, 0.00351763, 0.00340027,
       0.00161282, 0.00275468, 0.0108139 , 0.00667317, 0.00164676,
       0.00158442, 0.00216551, 0.00126583, 0.00263896, 0.01078437,
       0.00355331, 0.00194463, 0.00223395, 0.0057159 , 0.00327107,
       0.00639856, 0.00183042, 0.00208105, 0.00132291, 0.00085357,
       0.010962  , 0.0205768 , 0.32418344, 0.01617886, 0.34078667,
       0.06282061, 0.00588959, 0.00344024, 0.00235338, 0.00918282,
       0.0066581 , 0.00692084, 0.00590375, 0.01707626, 0.04818923,
       0.00691412, 0.00742844])

In [17]:
# Sort the features by their importance.
# Each importance is paired with its column name from X; reverse=True lists
# the largest importance first.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.34078666924988377, 'Total Regents % of cohort'),
 (0.3241834422549704, 'Total Grads % of cohort'),
 (0.06282060819781544, 'Total Regents % of grads'),
 (0.04818922809938335, 'Local % of grads'),
 (0.02057680171901729, 'Total Grads #'),
 (0.01707626364104943, 'Local % of cohort'),
 (0.016178859232487353, 'Total Regents #'),
 (0.010961996435051729, 'Total Cohort #'),
 (0.010813902348150698, '# Black'),
 (0.010784367844287613, '% Native American'),
 (0.00918282033575579, 'Regents w/o Advanced #'),
 (0.007428440991218546, 'Still Enrolled % of cohort'),
 (0.0072197508056244045, 'Grade 3K'),
 (0.006920840588252827, 'Regents w/o Advanced % of grads'),
 (0.006914123245208449, 'Still Enrolled #'),
 (0.006673170302590673, '% Black'),
 (0.006658100123562389, 'Regents w/o Advanced % of cohort'),
 (0.006398555872185621, '% Students with Disabilities'),
 (0.005903753930859812, 'Local #'),
 (0.00588958520908128, 'Advanced Regents #'),
 (0.00571590326664257, '% Missing Race/Ethnicity Data'),
 (0.0