In [1]:
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import r2_score


In [2]:
# Loading data
file_path = Path("Resources/2022-College-Guide-Main-Rankings-clear.csv")
df_rankings = pd.read_csv(file_path)
df_rankings.head()


Unnamed: 0,UnitID,Rank,Name,8-year graduation rate,Graduation rate rank,"Predicted graduation rate based on % of Pell recipients, incoming SATs, etc.",Graduation rate performance rank,Pell/non-Pell graduation gap,Pell graduation gap rank,Number of Pell graduates,...,% of federal work-study funds spent on service,% of federal work-study funds spent on service rank,Earns Carnegie Community Engagement Classification?,Voting engagement points,% of grads with service-oriented majors,Service-oriented majors rank,Social mobility rank,Research rank,Service rank,Unnamed: 40
0,243744,1,Stanford University (CA),0.963302,6,0.949654,187,-0.021703,46,260.6667,...,0.290903,10,NO,3,0.068361,401,3,2,187,
1,215062,2,University of Pennsylvania (PA),0.964294,5,0.93862,149,-0.041917,104,376.6667,...,0.160573,71,YES,6,0.246743,209,2,9,40,
2,166683,3,MA Institute of Technology (MA),0.950971,10,0.990213,334,-0.032985,72,182.6667,...,0.065477,332,NO,2,0.0,436,5,1,324,
3,186131,4,Princeton University (NJ),0.976861,2,0.947812,137,-0.05274,132,153.3333,...,0.0645,339,NO,6,0.0,436,7,6,224,
4,198419,5,Duke University (NC),0.954474,9,0.967952,267,-0.014278,36,231.6667,...,0.089523,224,YES,5,0.170164,305,12,15,20,


In [3]:
df_rankings.columns

Index(['UnitID', 'Rank', 'Name', '8-year graduation rate',
       'Graduation rate rank',
       'Predicted graduation rate based on % of Pell recipients, incoming SATs, etc.',
       'Graduation rate performance rank', 'Pell/non-Pell graduation gap',
       'Pell graduation gap rank', 'Number of Pell graduates',
       'Actual vs. predicted Pell enrollment', 'Pell performance rank',
       'Median earnings 10 years after entering college',
       'Predicted median earnings 10 years after entering college',
       'Earnings performance rank',
       'Net price of attendance for families below $75,000 income',
       'Net price rank', '% of loan principal remaining 5 years later',
       'Repayment rank', 'Predicted principal remaining',
       'Repayment rate performance rank', 'Research expenditures, in millions',
       'Research expenditures rank', 'Bachelor's to PhD rank',
       'Science & engineering PhDs awarded', 'Science & engineering PhDs rank',
       'Faculty receiving sign

In [4]:
df_rankings.dtypes

UnitID                                                                            int64
Rank                                                                              int64
Name                                                                             object
8-year graduation rate                                                          float64
Graduation rate rank                                                              int64
Predicted graduation rate based on % of Pell recipients, incoming SATs, etc.    float64
Graduation rate performance rank                                                  int64
Pell/non-Pell graduation gap                                                    float64
Pell graduation gap rank                                                          int64
Number of Pell graduates                                                        float64
Actual vs. predicted Pell enrollment                                            float64
Pell performance rank           

In [5]:
df_rankings.count()

UnitID                                                                          442
Rank                                                                            442
Name                                                                            442
8-year graduation rate                                                          442
Graduation rate rank                                                            442
Predicted graduation rate based on % of Pell recipients, incoming SATs, etc.    442
Graduation rate performance rank                                                442
Pell/non-Pell graduation gap                                                    442
Pell graduation gap rank                                                        442
Number of Pell graduates                                                        442
Actual vs. predicted Pell enrollment                                            442
Pell performance rank                                                       

In [6]:
# Drop last column
df_rankings.drop('Unnamed: 40', axis=1, inplace=True)

In [7]:
# Encoding Carnegie column (0=NO, 1=YES)

le = LabelEncoder()
df2_rankings = df_rankings.copy()
df2_rankings['Earns Carnegie Community Engagement Classification?'] = le.fit_transform(df2_rankings['Earns Carnegie Community Engagement Classification?'])


In [8]:
df2_rankings.head()

Unnamed: 0,UnitID,Rank,Name,8-year graduation rate,Graduation rate rank,"Predicted graduation rate based on % of Pell recipients, incoming SATs, etc.",Graduation rate performance rank,Pell/non-Pell graduation gap,Pell graduation gap rank,Number of Pell graduates,...,ROTC rank,% of federal work-study funds spent on service,% of federal work-study funds spent on service rank,Earns Carnegie Community Engagement Classification?,Voting engagement points,% of grads with service-oriented majors,Service-oriented majors rank,Social mobility rank,Research rank,Service rank
0,243744,1,Stanford University (CA),0.963302,6,0.949654,187,-0.021703,46,260.6667,...,308,0.290903,10,0,3,0.068361,401,3,2,187
1,215062,2,University of Pennsylvania (PA),0.964294,5,0.93862,149,-0.041917,104,376.6667,...,250,0.160573,71,1,6,0.246743,209,2,9,40
2,166683,3,MA Institute of Technology (MA),0.950971,10,0.990213,334,-0.032985,72,182.6667,...,12,0.065477,332,0,2,0.0,436,5,1,324
3,186131,4,Princeton University (NJ),0.976861,2,0.947812,137,-0.05274,132,153.3333,...,34,0.0645,339,0,6,0.0,436,7,6,224
4,198419,5,Duke University (NC),0.954474,9,0.967952,267,-0.014278,36,231.6667,...,45,0.089523,224,1,5,0.170164,305,12,15,20


In [9]:
# Define the features set.
X = df2_rankings.copy()

# Dropping target columns and unique identifier columns, and keeping ONLY TOP 15 features from original analysis
X = X.drop(columns=['UnitID', 'Rank', 'Name', 
    'ROTC rank', 'Repayment rate performance rank',
    '% of federal work-study funds spent on service rank', '% of federal work-study funds spent on service',
    'Graduation rate performance rank', 'Earnings performance rank',
    'Median earnings 10 years after entering college', 'Net price rank',
    'Net price of attendance for families below $75,000 income',
    'Actual vs. predicted Pell enrollment', '% of grads with service-oriented majors',
    'Service-oriented majors rank',
    'Predicted graduation rate based on % of Pell recipients, incoming SATs, etc.',
    'Predicted median earnings 10 years after entering college',
    'Pell performance rank',
    'Pell/non-Pell graduation gap', 'Repayment rank',
    '% of loan principal remaining 5 years later',
    'Predicted principal remaining',
    'Pell graduation gap rank',
    'Earns Carnegie Community Engagement Classification?'])
X.head()



Unnamed: 0,8-year graduation rate,Graduation rate rank,Number of Pell graduates,"Research expenditures, in millions",Research expenditures rank,Bachelor's to PhD rank,Science & engineering PhDs awarded,Science & engineering PhDs rank,Faculty receiving significant awards,Faculty in National Academies,Faculty accolades rank,AmeriCorps/Peace Corps rank,Voting engagement points,Social mobility rank,Research rank,Service rank
0,0.963302,6,260.6667,1188.554,10,7,638.3333,5,0.044643,0.292627,1,145,3,3,2,187
1,0.964294,5,376.6667,1509.193,3,27,328.3333,33,0.019748,0.076843,8,95,6,2,9,40
2,0.950971,10,182.6667,987.2567,18,2,599.0,7,0.028059,0.267684,3,399,2,5,1,324
3,0.976861,2,153.3333,385.278,61,3,278.0,45,0.026616,0.118821,5,148,6,7,6,224
4,0.954474,9,231.6667,1196.922,9,13,320.6667,35,0.014617,0.048649,12,39,5,12,15,20


In [10]:
# Checking datatypes to make sure there are no strings
X.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   8-year graduation rate                442 non-null    float64
 1   Graduation rate rank                  442 non-null    int64  
 2   Number of Pell graduates              442 non-null    float64
 3   Research expenditures, in millions    442 non-null    float64
 4   Research expenditures rank            442 non-null    int64  
 5   Bachelor's to PhD rank                442 non-null    int64  
 6   Science & engineering PhDs awarded    442 non-null    float64
 7   Science & engineering PhDs rank       442 non-null    int64  
 8   Faculty receiving significant awards  442 non-null    float64
 9   Faculty in National Academies         442 non-null    float64
 10  Faculty accolades rank                442 non-null    int64  
 11  AmeriCorps/Peace Co

In [11]:
# Define the target set.
y = df2_rankings["Rank"].ravel()
y[:5]


array([1, 2, 3, 4, 5], dtype=int64)

In [12]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating a StandardScaler instance.
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [13]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (331, 16)
Training Labels Shape: (331,)
Testing Features Shape: (111, 16)
Testing Labels Shape: (111,)


## Random Forest Regressor

In [14]:
# Instantiate model with 1000 decision trees
rf_model = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf_model=rf_model.fit(X_train_scaled, y_train)

In [15]:
# Making predictions using the testing data.
rf_predictions = rf_model.predict(X_test_scaled)


In [16]:
# Evaluate the performance with r2 score
r2_rf= r2_score(y_test,rf_predictions)
print('The r2 score for this Random Forest Regressor model is:', r2_rf)

The r2 score for this Random Forest Regressor model is: 0.9694513612720087


In [17]:
# Getting accuracy data with Regressor/continuous variable model

# Calculate the absolute errors
errors = abs(rf_predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')


Mean Absolute Error: 13.94
Accuracy: 85.02 %.


In [18]:
y_test

array([340,  46, 201, 374, 111, 355, 295, 169, 303,  84,  70,  68, 140,
       160, 222, 434, 215, 133,   7,  31, 276, 336, 263, 314,  99, 370,
       227, 273, 206, 126, 327, 250, 422, 284, 142, 192, 136, 436, 313,
       428, 168, 171, 337, 230, 204, 367, 157,  25, 292,  64, 262, 196,
         8, 254, 299,  75, 368, 361, 187,  22, 259,  93, 256,  16, 363,
       184, 249,  33,  20, 366,   3,  95, 210,  52, 182, 281, 128,  92,
       382, 349,  49, 411, 158, 224, 383, 421, 387, 351,  51, 195, 296,
        17, 274, 202,  88, 241,  21,  91, 424,  76, 137, 258,  42, 243,
       392, 309,  72, 401, 350, 233, 221], dtype=int64)

In [19]:
errors

array([34.488,  9.579,  8.854,  2.733,  3.698,  7.251, 11.844, 13.821,
       11.638,  1.421, 19.583, 20.306, 10.942, 19.397, 22.064,  2.971,
       72.043,  4.643,  2.975,  1.334, 23.457,  9.547, 66.876,  7.204,
        1.532,  5.526, 13.984,  3.905, 75.368, 18.197, 12.656,  0.611,
        0.221, 24.53 ,  2.44 , 17.372, 16.827,  5.537,  8.407,  0.311,
        0.444,  2.213,  1.228, 15.65 ,  5.485, 14.307,  0.488,  1.365,
        4.556,  7.94 , 37.16 , 13.009, 12.759, 16.196,  6.783, 17.294,
       19.344, 79.285, 19.723, 25.356, 12.48 , 30.477, 62.517, 10.595,
        3.845,  7.84 ,  5.603,  1.06 ,  5.535, 10.69 ,  8.397, 13.273,
        4.338,  0.907,  7.827,  3.017, 66.167,  6.834, 11.831, 10.496,
       13.761,  4.124, 41.164, 68.527, 15.692,  3.175,  8.13 , 10.85 ,
       26.523,  4.801,  0.31 ,  9.202,  0.625,  9.268, 17.626, 10.782,
        4.671,  4.786,  1.83 ,  2.576, 18.697, 18.821, 20.122, 11.801,
       10.085, 10.731,  1.222,  4.286,  1.51 ,  9.487,  2.067])

In [20]:
from sklearn import metrics
# https://machinelearningmastery.com/regression-metrics-for-machine-learning

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_predictions)))

Mean Absolute Error: 13.942873873873875
Mean Squared Error: 475.1348866756756
Root Mean Squared Error: 21.79758901061481


In [21]:
# Other ways of quantifying accuracy


# # Find the original feature indices 
# original_feature_indices = [feature_list.index(feature) for feature in feature_list if feature not in ['ws_1', 'prcp_1', 'snwd_1']]
# # Create a test set of the original features
# original_test_features = test_features[:, original_feature_indices]
# # Make predictions on test data using the model trained on original data
# predictions = rf.predict(original_test_features)
# # Performance metrics
# errors = abs(predictions - test_labels)
# print('Metrics for Random Forest Trained on Original Data')
# print('Average absolute error:', round(np.mean(errors), 2), 'degrees.')


In [22]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances


array([0.00322694, 0.00334247, 0.00237475, 0.00229678, 0.00229364,
       0.00542326, 0.00157546, 0.00164385, 0.00238257, 0.00184359,
       0.0043526 , 0.00349768, 0.00194437, 0.88673357, 0.01970017,
       0.05736832])

In [23]:
# #plotting feature importances

# # Get numerical feature importances
# importances = list(rf_exp.feature_importances_)
# # List of tuples with variable and importance
# feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# # Sort the feature importances by most important first
# feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# # Print out the feature and importances 
# [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

# # list of x locations for plotting
# x_values = list(range(len(importances)))
# # Make a bar chart
# plt.bar(x_values, importances, orientation = 'vertical', color = 'r', edgecolor = 'k', linewidth = 1.2)
# # Tick labels for x axis
# plt.xticks(x_values, feature_list, rotation='vertical')
# # Axis labels and title
# plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');

In [24]:
# Sort the features by their importance.
sorted_list=sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
sorted_df=pd.DataFrame(sorted_list, columns=['Importance', 'Feature'])
sorted_df

Unnamed: 0,Importance,Feature
0,0.886734,Social mobility rank
1,0.057368,Service rank
2,0.0197,Research rank
3,0.005423,Bachelor's to PhD rank
4,0.004353,Faculty accolades rank
5,0.003498,AmeriCorps/Peace Corps rank
6,0.003342,Graduation rate rank
7,0.003227,8-year graduation rate
8,0.002383,Faculty receiving significant awards
9,0.002375,Number of Pell graduates


In [25]:
sorted_df.to_csv('Feature_Importances_with_BOTTOM_15_FEATURES_removed.csv')  

## Support Vector Regression

In [26]:
# Create a SVM model

# Instantiate a SVM model
from sklearn.svm import SVR

## the kernel is the most important part - try different kernels

# Fit the data

regressor = SVR(kernel = 'poly')
svm_model = regressor.fit(X_train, y_train)
svm_model


SVR(kernel='poly')

In [27]:
# Predicting the Test set results
svm_predictions = svm_model.predict(X_test)

In [28]:
from sklearn.metrics import r2_score
r2_svm=r2_score(y_test, svm_predictions)
print('The r2 score for this SVM model is:', r2_svm)


The r2 score for this SVM model is: 0.2939259606183313


In [29]:
#visualize SVR results: https://www.analyticsvidhya.com/blog/2020/03/support-vector-regression-tutorial-for-machine-learning/


## Decision Tree Regression

In [30]:
# Training the Decision Tree Regression model on the Training set
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state = 0)
dt_model = dt_model.fit(X_train, y_train)
dt_model

DecisionTreeRegressor(random_state=0)

In [31]:
# Predicting the Test set results
dt_predictions = dt_model.predict(X_test)

In [32]:
# Evaluate the performance.
from sklearn.metrics import r2_score
r2_dt= r2_score(y_test,dt_predictions)
print('The r2 score for this Decision Tree model is:', r2_dt)

The r2 score for this Decision Tree model is: 0.9408842406087676


## Multiple Linear Regression

In [33]:
# Training the multiple regression model on the Training set
from sklearn.linear_model import LinearRegression
mlr_model = LinearRegression()
mlr_model = mlr_model.fit(X_train, y_train)
mlr_model

LinearRegression()

In [34]:
# Predicting the Test set results
mlr_predictions = mlr_model.predict(X_test)

In [35]:
# Evaluate the performance.
from sklearn.metrics import r2_score
r2_mlr= r2_score(y_test,mlr_predictions)
print('The r2 score for this Multiple Linear Regression model is:', r2_mlr)

The r2 score for this Multiple Linear Regression model is: 0.9805108122668332
