In [2]:
# Import dependencies
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score

In [3]:
# Mount Google Drive so the project's data folder is reachable from Colab
from google.colab import drive

# Single source of truth for the mount point: the data path below is built from
# it, so it no longer depends on the notebook's working directory.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Absolute path to the data folder.
# The trailing slash is required: other cells build file paths as path + "<file name>".
path = DRIVE_MOUNT_POINT + "/MyDrive/HUDK4050 Midterm Project/notebooks/data/"

# Import the cleaned dataset
dropout_df = pd.read_csv(path + "cleaned_data.csv")

Mounted at /content/drive


In [4]:
# Preview the loaded training data (the notebook renders a truncated table)
dropout_df

Unnamed: 0,StudentID,Dropout,CohortTerm_x,RegistrationDate,Gender,BirthMonth,HSDip,HSDipYr,HSGPAUnwtd,HSGPAWtd,...,Father's Highest Grade Level_Middle School,Father's Highest Grade Level_Unknown,Mother's Highest Grade Level_College,Mother's Highest Grade Level_High School,Mother's Highest Grade Level_Middle School,Mother's Highest Grade Level_Unknown,Housing_Off Campus,Housing_On Campus Housing,Housing_With Parent,Race
0,285848,0,1,20110808,2,9,1,-1,-1.0,-1,...,0,0,1,0,0,0,1,0,0,6
1,302176,0,1,20110804,1,4,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
2,301803,0,1,20110809,2,4,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
3,302756,0,1,20110823,2,1,-1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
4,301067,0,1,20110420,1,4,1,-1,-1.0,-1,...,1,0,0,1,0,0,1,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12256,317229,0,1,20160912,2,3,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,1
12257,325536,0,1,20160427,2,4,1,-1,-1.0,-1,...,0,0,0,0,1,0,0,0,1,6
12258,342013,0,1,20160726,2,4,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6
12259,359065,0,1,20160825,1,3,1,-1,-1.0,-1,...,0,0,0,1,0,0,1,0,0,6


In [5]:
# Set the features and target variable for the dataset.
# NOTE(review): every column except "Dropout" is kept as a feature, which includes
# the StudentID identifier -- confirm that this is intended.
X = dropout_df.drop("Dropout", axis = 1)
y = dropout_df["Dropout"]

# Split the data into training (80%) and test (20%) datasets.
# random_state pins the shuffle so the scores reported below can be reproduced on a
# fresh kernel; stratify = y keeps the dropout rate the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

# Training XGBoost Model

In [6]:
# Baseline model: an XGBoost classifier left at its library defaults
xgb_classifier = xgb.XGBClassifier()

# Fit the baseline on the training split (features first, labels second)
xgb_classifier.fit(X = X_train, y = y_train)

In [7]:
# Predict labels for the held-out test split with the baseline model
y_pred = xgb_classifier.predict(X_test)

# Score those predictions with the F-beta metric, using beta = 2
fbeta_scored = fbeta_score(y_true = y_test, y_pred = y_pred, beta = 2)

In [8]:
# Report the baseline F-beta score, rounded to five decimal places
score_message = f"XGBoost Classifier F-beta Score (beta = 2): {fbeta_scored:.5f}"
print(score_message)

XGBoost Classifier F-beta Score (beta = 2): 0.76492


# Applying the XGBoost Model to the Kaggle Test Data

In [9]:
# Read the cleaned Kaggle test set from the shared data folder
kaggletest_csv = path + "cleaned_kaggletest.csv"
kaggletest_df = pd.read_csv(kaggletest_csv)

# Preview the loaded frame
kaggletest_df

Unnamed: 0,StudentID,CohortTerm_x,RegistrationDate,Gender,BirthMonth,HSDip,HSDipYr,HSGPAUnwtd,HSGPAWtd,FirstGen,...,Father's Highest Grade Level_Middle School,Father's Highest Grade Level_Unknown,Mother's Highest Grade Level_College,Mother's Highest Grade Level_High School,Mother's Highest Grade Level_Middle School,Mother's Highest Grade Level_Unknown,Housing_Off Campus,Housing_On Campus Housing,Housing_With Parent,Race
0,344883,1,20150611,2,11,1,2015,2.62,-1,-1,...,0,0,0,0,0,1,0,1,0,4
1,298782,1,20110516,1,11,1,2011,3.60,-1,-1,...,0,0,0,1,0,0,0,0,1,3
2,311983,1,20120522,1,10,1,2012,3.14,-1,-1,...,0,1,0,0,0,1,0,0,1,1
3,346945,1,20150528,2,3,1,2015,3.96,-1,-1,...,0,0,0,0,0,1,0,0,1,4
4,313391,1,20120611,1,11,1,-1,-1.00,-1,-1,...,0,0,0,1,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,310102,1,20120703,1,7,1,-1,-1.00,-1,-1,...,0,0,0,1,0,0,1,0,0,1
996,329407,1,20140616,2,10,1,-1,-1.00,-1,-1,...,0,0,1,0,0,0,1,0,0,1
997,323071,1,20130813,1,8,1,-1,-1.00,-1,-1,...,1,0,0,0,1,0,1,0,0,1
998,356228,1,20160629,2,8,1,-1,-1.00,-1,-1,...,0,0,1,0,0,0,1,0,0,1


In [10]:
# Make predictions on the kaggletest DataFrame using the baseline XGBoost model.
# Only the columns the model was trained on are passed in (in training order), so
# this cell can be re-run after "Dropout" has been added to kaggletest_df below
# without handing the model an extra column.
kaggletest_features = kaggletest_df[X_train.columns]
y_pred = xgb_classifier.predict(kaggletest_features)

# Create a new column in kaggletest DataFrame for predictions
kaggletest_df["Dropout"] = y_pred

# Create Submission DataFrame and then Export to .csv

In [11]:
# Keep only the two columns the submission file needs: the student id and its predicted label
submission_columns = ["StudentID", "Dropout"]
submission_df = kaggletest_df[submission_columns]

# Preview the submission frame
submission_df

Unnamed: 0,StudentID,Dropout
0,344883,0
1,298782,1
2,311983,0
3,346945,0
4,313391,0
...,...,...
995,310102,0
996,329407,0
997,323071,1
998,356228,0


In [12]:
# Write the baseline XGBoost predictions to the submissions folder, without the index column
submission_df.to_csv(path_or_buf = path + "submissions/xgb_submission_3.csv", index = False)

# Optimize our XGB Classifier Using Hyperparameter Tuning

In [13]:
# Import additional dependencies
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

In [14]:
# Standardize the features: learn the scaling statistics from the training split only,
# then apply that same fitted transform to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Re-instantiate the XGBoost classifier that the hyperparameter search will tune.
# The name must be xgb_classifier: that is the estimator handed to RandomizedSearchCV
# in the next cell. Assigning to a differently spelled name would leave the search
# using the earlier default-configured classifier instead of this one.
xgb_classifier = XGBClassifier(use_label_encoder = False, eval_metric = "logloss")

# Defining F-beta score with beta = 2 as scoring metrics for hyperparameter tuning
f2_scorer = make_scorer(fbeta_score, beta = 2)

In [15]:
# Setting up RandomizedSearchCV for hyperparameter tuning.
# Each key is an XGBClassifier keyword argument; each list holds the candidate
# values the search may sample for it.
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9]
}

# n_iter = 50 candidate combinations are sampled and scored with the F2 scorer.
# random_state pins which combinations get sampled, so the reported best parameters
# can be reproduced when the notebook is re-run on a fresh kernel.
random_search = RandomizedSearchCV(xgb_classifier, param_distributions = param_dist,
                                   n_iter = 50, scoring = f2_scorer, verbose = 3,
                                   random_state = 42)

In [16]:
# Run the randomized search on the scaled training features
random_search.fit(X_train_scaled, y_train)

# Pull out the winning hyperparameter combination and the score it achieved in the search
best_params, best_score = random_search.best_params_, random_search.best_score_

# Show both as the cell output
best_params, best_score

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.7;, score=0.797 total time=   5.6s
[CV 2/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.7;, score=0.781 total time=   7.7s
[CV 3/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.7;, score=0.773 total time=   4.3s
[CV 4/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.7;, score=0.785 total time=   4.8s
[CV 5/5] END colsample_bytree=0.9, learning_rate=0.2, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.7;, score=0.784 total time=   3.1s
[CV 1/5] END colsample_bytree=0.6, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=400, subsample=0.7;, score=0.816 total time=   0.8s
[CV 2/5] END colsample_byt

({'subsample': 0.7,
  'n_estimators': 400,
  'min_child_weight': 2,
  'max_depth': 3,
  'learning_rate': 0.2,
  'colsample_bytree': 0.6},
 0.7869556390357294)

## Training Optimal XGB Classifier

In [25]:
# Hyperparameters for the tuned model, written out explicitly.
# These values match the best combination shown in the search output above.
tuned_params = {
    "subsample": 0.7,
    "n_estimators": 400,
    "min_child_weight": 2,
    "max_depth": 3,
    "learning_rate": 0.2,
    "colsample_bytree": 0.6,
}

# Build the tuned classifier from those settings plus the fixed options
xgb_model = XGBClassifier(
    **tuned_params,
    use_label_encoder = False,
    eval_metric = "logloss"
)

# Train the tuned classifier on the training split
xgb_model.fit(X = X_train, y = y_train)

In [26]:
# Predict labels for the held-out test split with the tuned model
y_pred = xgb_model.predict(X_test)

# F-beta score (beta = 2) of the tuned model on the test split, shown as the cell output
f2_score = fbeta_score(y_true = y_test, y_pred = y_pred, beta = 2)
f2_score

0.7766785939802148

# Applying the Tuned XGBoost Model to the Kaggle Test Data

In [27]:
# Re-read the cleaned Kaggle test set so this section starts from the file contents,
# without the "Dropout" column the earlier prediction cell added
kaggletest_csv = path + "cleaned_kaggletest.csv"
kaggletest_df = pd.read_csv(kaggletest_csv)

# Preview the loaded frame
kaggletest_df

Unnamed: 0,StudentID,CohortTerm_x,RegistrationDate,Gender,BirthMonth,HSDip,HSDipYr,HSGPAUnwtd,HSGPAWtd,FirstGen,...,Father's Highest Grade Level_Middle School,Father's Highest Grade Level_Unknown,Mother's Highest Grade Level_College,Mother's Highest Grade Level_High School,Mother's Highest Grade Level_Middle School,Mother's Highest Grade Level_Unknown,Housing_Off Campus,Housing_On Campus Housing,Housing_With Parent,Race
0,344883,1,20150611,2,11,1,2015,2.62,-1,-1,...,0,0,0,0,0,1,0,1,0,4
1,298782,1,20110516,1,11,1,2011,3.60,-1,-1,...,0,0,0,1,0,0,0,0,1,3
2,311983,1,20120522,1,10,1,2012,3.14,-1,-1,...,0,1,0,0,0,1,0,0,1,1
3,346945,1,20150528,2,3,1,2015,3.96,-1,-1,...,0,0,0,0,0,1,0,0,1,4
4,313391,1,20120611,1,11,1,-1,-1.00,-1,-1,...,0,0,0,1,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,310102,1,20120703,1,7,1,-1,-1.00,-1,-1,...,0,0,0,1,0,0,1,0,0,1
996,329407,1,20140616,2,10,1,-1,-1.00,-1,-1,...,0,0,1,0,0,0,1,0,0,1
997,323071,1,20130813,1,8,1,-1,-1.00,-1,-1,...,1,0,0,0,1,0,1,0,0,1
998,356228,1,20160629,2,8,1,-1,-1.00,-1,-1,...,0,0,1,0,0,0,1,0,0,1


In [28]:
# Make predictions on the kaggletest DataFrame using the tuned XGBoost model.
# Only the columns the model was trained on are passed in (in training order), so
# this cell can be re-run after "Dropout" has been added to kaggletest_df below
# without handing the model an extra column.
kaggletest_features = kaggletest_df[X_train.columns]
y_pred = xgb_model.predict(kaggletest_features)

# Create a new column in kaggletest DataFrame for predictions
kaggletest_df["Dropout"] = y_pred

# Create Submission DataFrame and then Export to .csv

In [29]:
# Keep only the two columns the submission file needs: the student id and its predicted label
submission_columns = ["StudentID", "Dropout"]
submission_df = kaggletest_df[submission_columns]

Unnamed: 0,StudentID,Dropout
0,344883,0
1,298782,1
2,311983,0
3,346945,0
4,313391,0
...,...,...
995,310102,0
996,329407,0
997,323071,1
998,356228,0


In [31]:
# Write the tuned XGBoost predictions to the submissions folder as the final submission, without the index column
submission_df.to_csv(path_or_buf = path + "submissions/xgb_submission_final.csv", index = False)