In [None]:
# Import dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score

In [None]:
# Mount Google Drive (Colab only) and point `path` at the project's data folder.
# The path is anchored at the absolute mount point so that loading still works
# when the notebook's working directory is not /content.
from google.colab import drive
drive.mount("/content/drive")
path = "/content/drive/MyDrive/HUDK4050 Midterm Project/notebooks/data/"

# Import the cleaned dataset
dropout_df = pd.read_csv(path + "cleaned_data.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Display the loaded training DataFrame as the cell's output
dropout_df

Unnamed: 0,StudentID,Dropout,CohortTerm_x,RegistrationDate,Gender,BirthMonth,HSDip,HSDipYr,HSGPAUnwtd,HSGPAWtd,...,Father's Highest Grade Level_Middle School,Father's Highest Grade Level_Unknown,Mother's Highest Grade Level_College,Mother's Highest Grade Level_High School,Mother's Highest Grade Level_Middle School,Mother's Highest Grade Level_Unknown,Housing_Off Campus,Housing_On Campus Housing,Housing_With Parent,Race
0,285848,0,1,20110808,2,9,1,-1,-1.0,-1,...,0,0,1,0,0,0,1,0,0,6
1,302176,0,1,20110804,1,4,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
2,301803,0,1,20110809,2,4,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
3,302756,0,1,20110823,2,1,-1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
4,301067,0,1,20110420,1,4,1,-1,-1.0,-1,...,1,0,0,1,0,0,1,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12256,317229,0,1,20160912,2,3,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,1
12257,325536,0,1,20160427,2,4,1,-1,-1.0,-1,...,0,0,0,0,1,0,0,0,1,6
12258,342013,0,1,20160726,2,4,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
12259,359065,0,1,20160825,1,3,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6


In [None]:
# Set the features and target variable for the dataset
# NOTE(review): every column except "Dropout" is used as a feature, which
# includes the StudentID identifier — confirm that is intended.
X = dropout_df.drop("Dropout", axis = 1)
y = dropout_df["Dropout"]

# Split the data into training and test datasets.
# random_state pins the shuffle so the scores below are reproducible across
# runs; stratify keeps the dropout rate the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

# Training a Simple Random Forest Model

In [None]:
# Creating a Random Forest object with default hyperparameters.
# random_state fixes the forest's internal randomness so that re-running the
# notebook trains the same model.
forest = RandomForestClassifier(random_state = 42)

# Train Random Forest model on the training split
forest.fit(X_train, y_train)

In [None]:
# Predict dropout labels for the held-out test split
y_pred = forest.predict(X_test)

# Score those predictions with F-beta (beta = 2)
fbeta_scored = fbeta_score(y_true = y_test, y_pred = y_pred, beta = 2)

print(f"Random Forest F-beta score (beta = 2): {fbeta_scored:.5f}")

Random Forest F-beta score (beta = 2): 0.77839


# Predicting on Kaggletest Data with the Random Forest Model

In [None]:
# Import kaggle test data from the same data folder as the training set
kaggletest_df = pd.read_csv(path + "cleaned_kaggletest.csv")

# Display the loaded DataFrame as the cell's output
kaggletest_df

Unnamed: 0,StudentID,CohortTerm_x,RegistrationDate,Gender,BirthMonth,HSDip,HSDipYr,HSGPAUnwtd,HSGPAWtd,FirstGen,...,Father's Highest Grade Level_Middle School,Father's Highest Grade Level_Unknown,Mother's Highest Grade Level_College,Mother's Highest Grade Level_High School,Mother's Highest Grade Level_Middle School,Mother's Highest Grade Level_Unknown,Housing_Off Campus,Housing_On Campus Housing,Housing_With Parent,Race
0,344883,1,20150611,2,11,1,2015,2.62,-1,-1,...,0,0,0,0,0,1,0,1,0,4
1,298782,1,20110516,1,11,1,2011,3.60,-1,-1,...,0,0,0,1,0,0,0,0,1,3
2,311983,1,20120522,1,10,1,2012,3.14,-1,-1,...,0,1,0,0,0,1,0,0,1,1
3,346945,1,20150528,2,3,1,2015,3.96,-1,-1,...,0,0,0,0,0,1,0,0,1,4
4,313391,1,20120611,1,11,1,-1,-1.00,-1,-1,...,0,0,0,1,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,310102,1,20120703,1,7,1,-1,-1.00,-1,-1,...,0,0,0,1,0,0,1,0,0,1
996,329407,1,20140616,2,10,1,-1,-1.00,-1,-1,...,0,0,1,0,0,0,1,0,0,1
997,323071,1,20130813,1,8,1,-1,-1.00,-1,-1,...,1,0,0,0,1,0,1,0,0,1
998,356228,1,20160629,2,8,1,-1,-1.00,-1,-1,...,0,0,1,0,0,0,1,0,0,1


In [None]:
# Make predictions on the kaggletest DataFrame using the Random Forest model.
# Selecting X_train's columns feeds the model exactly the features it was
# trained on (same names, same order) and keeps this cell re-runnable: the
# "Dropout" column added below is never passed back into predict().
y_pred = forest.predict(kaggletest_df[X_train.columns])

# Create a new column in kaggletest DataFrame for predictions
kaggletest_df["Dropout"] = y_pred

# Create Submission DataFrame and then Export to .csv

In [None]:
# Slice kaggletest DataFrame down to the StudentID and Dropout label columns
submission_columns = ["StudentID", "Dropout"]
submission_df = kaggletest_df[submission_columns]

submission_df

Unnamed: 0,StudentID,Dropout
0,344883,0
1,298782,1
2,311983,1
3,346945,0
4,313391,0
...,...,...
995,310102,0
996,329407,0
997,323071,1
998,356228,0


In [None]:
# Write the submission DataFrame to the submissions folder (row index omitted)
submission_file = path + "submissions/random_forest_submission_3.csv"
submission_df.to_csv(submission_file, index = False)

# Perfecting a Random Forest Model on F-Beta (beta = 2)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [None]:
# Re-instantiate the Random Forest model.
# random_state is fixed so every candidate in the search is trained
# reproducibly and the selected hyperparameters do not change between runs.
forest = RandomForestClassifier(random_state = 42)

# Defining a F-beta score as a base metric for hyperparameter tuning
def custom_fbeta_scorer(y_true, y_pred):
  """Return the F-beta score (beta = 2) of `y_pred` against `y_true`.

  Thin wrapper around `fbeta_score` that pins beta to 2 so it can be handed
  to `make_scorer` as a two-argument metric.
  """
  return fbeta_score(y_true, y_pred, beta = 2)

f2_scorer = make_scorer(custom_fbeta_scorer)

# Setting up GridSearchCV to find best hyperparameters for Random Forest
# (3 x 3 x 3 x 3 = 81 combinations, each evaluated with 5-fold cross-validation)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# n_jobs = -1 runs the candidate fits in parallel on all available cores
grid_search = GridSearchCV(estimator = forest, param_grid = param_grid,
                           scoring = f2_scorer, cv = 5, n_jobs = -1)

In [None]:
# Run the grid search on the training split
grid_search.fit(X_train, y_train)

# Pull out the winning hyperparameters together with their F-beta score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

best_params, best_score

({'max_depth': 30,
  'min_samples_leaf': 4,
  'min_samples_split': 10,
  'n_estimators': 300},
 0.7759919956365471)

In [None]:
# Train Random Forest with Optimal Hyperparameters
# Build the forest from the hyperparameters the grid search selected
# (`best_params`) instead of a hand-copied set of values, so this cell cannot
# drift from what the search actually found; random_state makes the fit
# repeatable.
forest = RandomForestClassifier(**best_params, random_state = 42)

# Train Random Forest model on the training split
forest.fit(X_train, y_train)

In [None]:
# Predict dropout labels for the held-out test split with the tuned forest
y_pred = forest.predict(X_test)

# Score those predictions with F-beta (beta = 2)
fbeta_scored = fbeta_score(y_true = y_test, y_pred = y_pred, beta = 2)

print(f"Random Forest F-beta score (beta = 2): {fbeta_scored:.5f}")

Random Forest F-beta score (beta = 2): 0.78825


# Make Predictions Using Kaggletest DataFrame

In [None]:
# Import kaggle test data (reloaded so the frame starts without a "Dropout" column)
kaggletest_df = pd.read_csv(path + "cleaned_kaggletest.csv")

# Make predictions on the kaggletest DataFrame using the tuned Random Forest model.
# Selecting X_train's columns feeds the model exactly the features it was
# trained on, with the same names and in the same order.
y_pred = forest.predict(kaggletest_df[X_train.columns])

# Create a new column in kaggletest DataFrame for predictions
kaggletest_df["Dropout"] = y_pred

# Create Submission DataFrame and then Export to .csv

In [None]:
# Slice kaggletest DataFrame down to the StudentID and Dropout label columns
submission_columns = ["StudentID", "Dropout"]
submission_df = kaggletest_df[submission_columns]

# Write the tuned-model submission to the submissions folder (row index omitted)
submission_file = path + "submissions/random_forest_submission_4.csv"
submission_df.to_csv(submission_file, index = False)