# Pre-Model Training Foundations and Imports

In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive (Colab only) and derive the data folder path from the mount point
from google.colab import drive

# Single place that records where Drive is mounted
mount_point = "/content/drive"
drive.mount(mount_point)

# Absolute path, so loading files does not depend on the current working directory.
# The trailing "/" is kept so that `path + "<file name>"` keeps working.
path = mount_point + "/MyDrive/HUDK4050 Midterm Project/notebooks/data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the student dropout labels into a DataFrame
dropout_labels_file = os.path.join(path, "dropoutTrainLabels.csv")
dropout_df = pd.read_csv(dropout_labels_file)
dropout_df

Unnamed: 0,StudentID,Dropout
0,285848,0
1,302176,0
2,301803,0
3,302756,0
4,301067,0
...,...,...
12256,317229,0
12257,325536,0
12258,342013,0
12259,359065,0


# Preprocessing and Feature Engineering Student Data

We first clean the financial aid dataset and merge it with the other student data to build the final dataset that will eventually be used to train models.

## Merging all Student Characteristic Data

In [None]:
# Define a new function to load all CSV files from within a folder
def load_csv_files_from_folder(path):
  """Read every ``.csv`` file directly inside ``path`` into a single DataFrame.

  Files are read in sorted filename order so the row order of the result is
  reproducible (``os.listdir`` returns entries in arbitrary order).

  Parameters
  ----------
  path : str
    Folder to scan. Only entries whose name ends with ".csv" are read;
    sub-folders are not searched.

  Returns
  -------
  pandas.DataFrame
    All files concatenated row-wise with a fresh 0..n-1 index.

  Raises
  ------
  ValueError
    If the folder contains no ".csv" files.
  """
  # Sort so the concatenation order does not depend on the file system
  csv_files = sorted(file for file in os.listdir(path) if file.endswith(".csv"))
  if not csv_files:
    raise ValueError(f"No .csv files found in folder: {path!r}")
  # Create a list of DataFrames from all files with .csv extension
  df_list = [pd.read_csv(os.path.join(path, file)) for file in csv_files]
  # Return concatenated list from the list of DataFrames
  return pd.concat(df_list, ignore_index = True)

In [None]:
# Student static data: every CSV in the folder, stacked into one DataFrame
static_df = load_csv_files_from_folder(os.path.join(path, "Student Static Data"))

# Student progress data: every CSV in the folder, stacked into one DataFrame
progress_df = load_csv_files_from_folder(os.path.join(path, "Student Progress Data"))

# Financial aid data, with its ID column renamed to the "StudentID" key used by the other frames
financial_df = pd.read_excel(os.path.join(path, "fin_aid_fasfa_data.xlsx")).rename(
    columns = {"ID with leading" : "StudentID"})

In [None]:
# Left-join the static, financial aid, and progress data onto the dropout labels by StudentID
merged_df = (
    dropout_df
    .merge(static_df, how = "left", on = "StudentID")
    .merge(financial_df, how = "left", on = "StudentID")
    .merge(progress_df, how = "left", on = "StudentID")
)

# Show the merged result
merged_df

Unnamed: 0,StudentID,Dropout,Cohort_x,CohortTerm_x,Campus,Address1,Address2,City,State,Zip,...,Major1,Major2,Complete1,Complete2,CompleteCIP1,CompleteCIP2,TransferIntent,DegreeTypeSought,TermGPA,CumGPA
0,285848,0,2011-12,1,,328 Adams St Apt 1,,Hoboken,NJ,7030.0,...,51.1699,-1.0,8,0,51.3899,-2,-1,6,3.90,3.66
1,285848,0,2011-12,1,,328 Adams St Apt 1,,Hoboken,NJ,7030.0,...,51.1699,-1.0,0,0,-2.0000,-2,-1,6,3.80,3.54
2,285848,0,2011-12,1,,328 Adams St Apt 1,,Hoboken,NJ,7030.0,...,51.3899,-1.0,0,0,-2.0000,-2,-1,6,3.25,3.25
3,302176,0,2011-12,1,,142 Cherry St,,Jersey City,NJ,7305.0,...,51.3801,-1.0,8,0,51.3801,-2,-1,6,3.40,3.68
4,302176,0,2011-12,1,,142 Cherry St,,Jersey City,NJ,7305.0,...,51.3801,-1.0,0,0,-2.0000,-2,-1,6,4.00,3.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52963,325536,0,2016-17,1,,8311 2nd Ave,,North Bergen,NJ,7047.0,...,26.0101,-1.0,0,0,-2.0000,-2,-1,6,3.77,3.87
52964,325536,0,2016-17,1,,8311 2nd Ave,,North Bergen,NJ,7047.0,...,26.0101,-1.0,0,0,-2.0000,-2,-1,6,4.00,4.00
52965,342013,0,2016-17,1,,27 Nichols St 1,,Newark,NJ,7002.0,...,45.1101,-1.0,0,0,-2.0000,-2,-1,6,3.00,3.00
52966,359065,0,2016-17,1,,1804 B St,,Belmar,NJ,7719.0,...,51.0000,-1.0,0,0,-2.0000,-2,-1,6,2.30,2.30


## Cleaning the Merged Student Characteristic Dataset

In [None]:
# Keep a single row per student: the row for the last term the student attended.
# (Previously the last-term selection was computed but never used, and the first merged
# row of each student was kept instead.)
# NOTE(review): assumes "AcademicYear" and "Term" sort chronologically — confirm against
# the progress-data codebook. Ties keep the row that appears last in merged_df.
last_term_attended = (
    merged_df
    .sort_values(["AcademicYear", "Term"])
    .drop_duplicates("StudentID", keep = "last")
)

# Restore merged_df's row order so StudentIDs stay in the same order as in the original dataset
cleaned_df = last_term_attended.sort_index()

cleaned_df

Unnamed: 0,StudentID,Dropout,Cohort_x,CohortTerm_x,Campus,Address1,Address2,City,State,Zip,...,Major1,Major2,Complete1,Complete2,CompleteCIP1,CompleteCIP2,TransferIntent,DegreeTypeSought,TermGPA,CumGPA
0,285848,0,2011-12,1,,328 Adams St Apt 1,,Hoboken,NJ,7030.0,...,51.1699,-1.0000,8,0,51.3899,-2,-1,6,3.90,3.66
3,302176,0,2011-12,1,,142 Cherry St,,Jersey City,NJ,7305.0,...,51.3801,-1.0000,8,0,51.3801,-2,-1,6,3.40,3.68
9,301803,0,2011-12,1,,12 Rainbow Street,,Presque Isle,ME,4769.0,...,51.1699,-1.0000,8,0,51.3899,-2,-1,6,4.00,3.97
12,302756,0,2011-12,1,,345 4th St Apt 2,,Jersey City,NJ,7302.0,...,45.0601,-1.0000,7,0,45.0601,-2,-1,6,4.00,3.82
16,301067,0,2011-12,1,,240 3rd St,,Jersey City,NJ,7302.0,...,23.0101,13.1001,0,0,-2.0000,-2,-1,6,4.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52962,317229,0,2016-17,1,,104 W 43rd St,,Bayonne,NJ,7002.0,...,26.0101,-1.0000,0,0,-2.0000,-2,-1,6,0.00,0.00
52963,325536,0,2016-17,1,,8311 2nd Ave,,North Bergen,NJ,7047.0,...,26.0101,-1.0000,0,0,-2.0000,-2,-1,6,3.77,3.87
52965,342013,0,2016-17,1,,27 Nichols St 1,,Newark,NJ,7002.0,...,45.1101,-1.0000,0,0,-2.0000,-2,-1,6,3.00,3.00
52966,359065,0,2016-17,1,,1804 B St,,Belmar,NJ,7719.0,...,51.0000,-1.0000,0,0,-2.0000,-2,-1,6,2.30,2.30


In [None]:
# Count missing values per column and express them as a percentage of all rows
missing = cleaned_df.isnull().sum()
missing_percent = missing.div(len(cleaned_df)).mul(100)

# Tabulate counts and percentages, keeping only columns that have gaps, worst first
missing_df = pd.DataFrame({"Missing Values" : missing, "Percentage" : missing_percent})
has_missing = missing_df["Missing Values"] > 0
missing_df_sorted = missing_df.loc[has_missing].sort_values(
    by = "Percentage", ascending = False)

missing_df_sorted.head(5)

Unnamed: 0,Missing Values,Percentage
Campus,12261,100.0
2012 Work/Study,12165,99.21703
2012 Scholarship,12101,98.695049
2013 Work/Study,12097,98.662426
2014 Work/Study,12035,98.156757


## Filling in Missing Values within Merged DataFrame

In [None]:
# Columns missing in more than `threshold` percent of rows are dropped
threshold = 60

# Select the column names whose missing percentage exceeds the threshold, then remove them
too_sparse = missing_df_sorted["Percentage"] > threshold
columns_drop = missing_df_sorted.loc[too_sparse].index
cleaned_df = cleaned_df.drop(columns_drop, axis = 1)

In [None]:
# Recount missing values per column in the remaining dataset
remaining_missing = cleaned_df.isnull().sum()
remaining_missing_df = remaining_missing.to_frame("Remaining Missing Values")
still_missing = remaining_missing_df["Remaining Missing Values"] > 0
remaining_missing_df = remaining_missing_df.loc[still_missing]

# Show the first ten columns that still have gaps
remaining_missing_df.head(10)

Unnamed: 0,Remaining Missing Values
Address1,103
City,104
State,103
Zip,121
BirthYear,1
Marital Status,1841
Adjusted Gross Income,1841
Parent Adjusted Gross Income,1841
Father's Highest Grade Level,1968
Mother's Highest Grade Level,2161


In [None]:
# Import additional dependencies
from sklearn.impute import SimpleImputer

# Categorical features: missing entries are filled with the most frequent value
categorical_columns = ["Marital Status", "Father's Highest Grade Level", "Mother's Highest Grade Level"]
categorical_imputer = SimpleImputer(strategy = "most_frequent")

# Numerical features: missing entries are filled with the median
numerical_columns = ["Adjusted Gross Income", "Parent Adjusted Gross Income"]
numerical_imputer = SimpleImputer(strategy = "median")

In [None]:
# Fill gaps in the categorical columns with each column's most frequent value
categorical_filled = categorical_imputer.fit_transform(cleaned_df[categorical_columns])
cleaned_df[categorical_columns] = categorical_filled

# Fill gaps in the numerical columns with each column's median
numerical_filled = numerical_imputer.fit_transform(cleaned_df[numerical_columns])
cleaned_df[numerical_columns] = numerical_filled

# Geographical columns and BirthYear are removed from the dataset
columns_remove = ["Address1", "City", "State", "Zip", "BirthYear"]
cleaned_df = cleaned_df.drop(columns_remove, axis = 1)

In [None]:
# Impute the "Housing" column with its most frequent value.
# A dedicated imputer is used so that `categorical_imputer` is not refitted on a different
# column, and np.ravel flattens the (n_rows, 1) result of fit_transform into the 1-D shape
# expected when assigning to a single column.
housing_imputer = SimpleImputer(strategy = "most_frequent")
cleaned_df["Housing"] = np.ravel(housing_imputer.fit_transform(cleaned_df[["Housing"]]))

## Dropping Redundant Categorical Variables and Encoding Others

In [None]:
# Cohort / academic-year columns considered potentially redundant are removed
columns_drop = ["Cohort_x", "Cohort_y", "AcademicYear", "cohort"]
cleaned_df = cleaned_df.drop(columns_drop, axis = 1)

In [None]:
# One-hot encode the remaining categorical variables with pandas' dummy encoding
categorical_cols = [
    "Marital Status",
    "Father's Highest Grade Level",
    "Mother's Highest Grade Level",
    "Housing",
]
cleaned_df = pd.get_dummies(data = cleaned_df, columns = categorical_cols)

In [None]:
# Checkpoint: print the current dimensions, then list the column names
current_shape = cleaned_df.shape
print(current_shape)
cleaned_df.columns

(12261, 60)


Index(['StudentID', 'Dropout', 'CohortTerm_x', 'RegistrationDate', 'Gender',
       'BirthMonth', 'Hispanic', 'AmericanIndian', 'Asian', 'Black',
       'NativeHawaiian', 'White', 'TwoOrMoreRace', 'HSDip', 'HSDipYr',
       'HSGPAUnwtd', 'HSGPAWtd', 'FirstGen', 'DualHSSummerEnroll',
       'EnrollmentStatus', 'NumColCredAttemptTransfer',
       'NumColCredAcceptTransfer', 'CumLoanAtEntry', 'HighDeg',
       'MathPlacement', 'EngPlacement', 'GatewayMathStatus',
       'GatewayEnglishStatus', 'cohort term', 'Adjusted Gross Income',
       'Parent Adjusted Gross Income', 'CohortTerm_y', 'Term',
       'CompleteDevMath', 'CompleteDevEnglish', 'Major1', 'Major2',
       'Complete1', 'Complete2', 'CompleteCIP1', 'CompleteCIP2',
       'TransferIntent', 'DegreeTypeSought', 'TermGPA', 'CumGPA',
       'Marital Status_Divorced', 'Marital Status_Married',
       'Marital Status_Separated', 'Marital Status_Single',
       'Father's Highest Grade Level_College',
       'Father's Highest Grade Leve

In [None]:
# Collapse the binary race indicator columns into a single integer-coded "Race" feature
# (this is label encoding of the existing indicator columns, not one-hot encoding).
races = ["Hispanic", "AmericanIndian", "Asian", "Black", "NativeHawaiian", "White", "TwoOrMoreRace"]
race_codes = {
    "Hispanic" : 1, "AmericanIndian" : 2, "Asian" : 3, "Black" : 4,
    "NativeHawaiian" : 5, "White" : 6, "TwoOrMoreRace" : 7
}

race_flags = cleaned_df[races]
# NOTE(review): assumes a positive value means the flag is set and that 0 / negative codes
# mean "no" or "unknown" — confirm against the data dictionary.
has_race_flag = race_flags.gt(0).any(axis = 1)

# idxmax returns the first column holding the row maximum, so a row with no flag set would
# otherwise be labelled with the first column ("Hispanic"); such rows get 0 (unknown) instead.
cleaned_df["Race"] = (
    race_flags.idxmax(axis = 1)
    .map(race_codes)
    .where(has_race_flag, 0)
    .astype(int)
)

# Drop the original binary race columns
cleaned_df = cleaned_df.drop(columns = races)

In [None]:
# Drop additional columns such as birth month, cohort term, etc.
# NOTE(review): the original comment also mentioned the registration date, but
# "RegistrationDate" is not in this list — confirm whether it should be dropped too.
columns_remove = [
    "CohortTerm_x", "BirthMonth", "FirstGen", "HSGPAUnwtd", "HSGPAWtd", "DualHSSummerEnroll",
    "CumLoanAtEntry", "cohort term", "CohortTerm_y", "Term", "Complete2", "TransferIntent", "DegreeTypeSought"
]
# The list used to be defined without ever being applied (and named "FirstGen" twice);
# actually remove the columns from the dataset.
cleaned_df = cleaned_df.drop(columns = columns_remove)

In [None]:
# Additional cleaning for features whose gaps are stored as sentinel codes rather than NaN
columns_impute = ['NumColCredAttemptTransfer', 'NumColCredAcceptTransfer', 'HighDeg']

# Median of each column, computed over the values that are neither -1 nor -2
median_values = {
  col: cleaned_df.loc[~cleaned_df[col].isin([-1, -2]), col].median()
  for col in columns_impute
}

# Only the -1 entries are overwritten with the column median; -2 entries are left as they are
for col, median_val in median_values.items():
  is_minus_one = cleaned_df[col] == -1
  cleaned_df.loc[is_minus_one, col] = median_val


# Display calculated median values
median_values

{'NumColCredAttemptTransfer': 70.0,
 'NumColCredAcceptTransfer': 64.0,
 'HighDeg': 0.0}

In [None]:
# Check the dimensions (rows, columns) of the cleaned dataset before it is exported
cleaned_df.shape

(12261, 54)

# Exporting Cleaned Dataset to .csv

In [None]:
# Write the cleaned DataFrame (without its index) to cleaned_data.csv in the data folder
cleaned_output_file = os.path.join(path, "cleaned_data.csv")
cleaned_df.to_csv(cleaned_output_file, index = False)