# Importing Dependencies

In [None]:
# Import dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, fbeta_score
from sklearn.preprocessing import LabelEncoder

# Further Importation and Pre-processing of Data

In [None]:
# Mount Google Drive and build the data directory from the mount point.
# The original used a relative path ("drive/MyDrive/..."), which only resolves
# while the working directory happens to be /content. Deriving the data path
# from the same absolute mount point keeps the two from drifting apart.
from google.colab import drive

MOUNT_POINT = "/content/drive"
drive.mount(MOUNT_POINT)
path = MOUNT_POINT + "/MyDrive/HUDK4050 Midterm Project/notebooks/data/"

# Load the cleaned training dataset.
# NOTE(review): the CSV carries an "Unnamed: 0" column, which looks like a
# saved DataFrame index -- confirm and consider dropping it at the cleaning step.
dropout_df = pd.read_csv(path + "cleaned_data.csv")

dropout_df

Mounted at /content/drive


Unnamed: 0,StudentID,Race,EnrollmentStatus,NumColCredAttemptTransfer,HighDeg,MathPlacement,EngPlacement,GatewayMathStatus,Marital Status,Adjusted Gross Income,Parent Adjusted Gross Income,Father's Highest Grade Level,Mother's Highest Grade Level,Housing,Major1,TermGPA,CumGPA,Dropout
0,20932,4,2,81.0,0,0.0,0.0,0,2,52555.000000,0.000000,4,4,4,0.0000,0.00,0.00,1
1,21868,6,2,71.0,0,0.0,0.0,0,1,30600.000000,0.000000,2,2,2,23.0101,4.00,3.82,0
2,21943,1,2,81.0,0,0.0,0.0,0,1,27879.000000,0.000000,4,2,2,26.0101,0.00,0.00,1
3,22163,4,2,91.0,0,0.0,0.0,0,1,26794.000000,0.000000,4,3,3,52.0201,4.00,3.30,0
4,22672,4,2,0.0,0,0.0,0.0,0,1,13262.855278,28317.869674,2,2,2,52.0801,1.85,3.21,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12256,359313,6,1,0.0,0,0.5,0.5,0,1,13262.855278,28317.869674,2,2,2,0.0000,4.00,4.00,0
12257,359320,1,1,0.0,0,0.5,0.5,0,1,13262.855278,28317.869674,2,2,2,42.0101,4.00,4.00,0
12258,359327,6,2,28.0,0,0.0,0.0,0,1,0.000000,25679.000000,4,4,4,52.0801,0.81,0.81,0
12259,359554,6,2,60.0,0,0.0,0.0,0,2,35308.000000,0.000000,2,2,2,45.1001,4.00,4.00,0


In [None]:
# Separate the predictors from the target label ("Dropout").
# NOTE(review): X still contains "Unnamed: 0" and "StudentID". These look like
# row/student identifiers rather than predictive features -- confirm. If they
# are removed here, the Kaggle prediction cell must select the same columns.
X = dropout_df.drop("Dropout", axis = 1)
y = dropout_df["Dropout"]

# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the split (and therefore every metric reported
# below) reproducible across "Restart & Run All".
# NOTE(review): consider stratify = y so both splits keep the same dropout rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

# Training a Base Decision Tree Classifier

In [None]:
# Create the baseline Decision Tree Classifier with default hyperparameters.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree can differ between runs even on
# identical training data.
tree = DecisionTreeClassifier(random_state = 42)

# Fit the classifier on the training split
tree.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split
y_pred = tree.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Base Decision Tree Classifier Accuracy: {accuracy:.3f}")

# The report was previously computed but never shown; print it so the
# per-class metrics are visible next to the accuracy.
print(report)

Base Decision Tree Classifier Accuracy: 0.736


In [None]:
# F-beta with beta = 2 for the base tree, scored on the held-out test split
f_beta = fbeta_score(y_true = y_test, y_pred = y_pred, beta = 2)

# Report the score to five decimal places
print(f"Base Decision Tree F-beta Score (beta = 2): {f_beta:.5f}")

Base Decision Tree F-beta Score (beta = 2): 0.65939


# Fitting Decision Tree onto Kaggletest Data

In [None]:
# Read the cleaned Kaggle test set from the shared data directory
kaggletest_file = path + "cleaned_kaggletest.csv"
kaggletest_df = pd.read_csv(kaggletest_file)

# Show the loaded frame
kaggletest_df

Unnamed: 0,StudentID,Race,EnrollmentStatus,NumColCredAttemptTransfer,HighDeg,MathPlacement,EngPlacement,GatewayMathStatus,Marital Status,Adjusted Gross Income,Parent Adjusted Gross Income,Father's Highest Grade Level,Mother's Highest Grade Level,Housing,Major1,TermGPA,CumGPA
0,344883,4,1,0.0,0,1.0,1.0,0,1,0.000000,15018.000000,3,4,4,52.0201,2.38,1.86
1,298782,3,1,0.0,0,1.0,0.0,0,1,0.000000,43501.000000,3,2,2,0.0000,3.43,3.35
2,311983,1,1,0.0,0,0.0,1.0,1,1,0.000000,36132.000000,4,4,4,11.0101,0.00,2.38
3,346945,4,1,0.0,0,1.0,0.0,0,1,0.000000,137819.000000,2,4,4,26.0101,1.59,2.58
4,313391,6,1,0.0,0,1.0,0.0,0,1,10378.561837,28299.717314,2,2,2,23.0101,3.70,3.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,310102,1,2,123.0,2,0.0,0.0,1,1,18363.000000,0.000000,2,2,2,45.1001,2.00,2.57
996,329407,1,2,41.0,0,0.0,0.0,0,1,14489.000000,0.000000,3,3,3,43.0199,3.50,2.77
997,323071,1,2,47.0,0,0.0,0.0,0,1,5270.000000,0.000000,1,1,1,54.0101,0.00,1.86
998,356228,1,2,72.0,0,0.0,0.0,0,1,15955.000000,0.000000,3,3,3,51.3801,2.54,2.57


In [None]:
# Select exactly the columns the tree was trained on, in training order.
# Predicting on the whole frame made this cell non-idempotent: once the
# "Dropout" column is added below, a re-run would pass an extra column to
# predict() and fail on the feature mismatch.
kaggle_features = kaggletest_df[list(X_train.columns)]

# Predict dropout labels for the Kaggle test rows.
# Stored under its own name so the test-split predictions in y_pred (used by
# the accuracy / F-beta cells) are not overwritten.
kaggle_pred = tree.predict(kaggle_features)

# Attach the predictions to the kaggletest DataFrame as the "Dropout" column
kaggletest_df["Dropout"] = kaggle_pred

# Create Submission DataFrame and then Export to .csv

In [None]:
# Keep only the two columns the submission needs: the student identifier
# and the predicted Dropout label
submission_columns = ["StudentID", "Dropout"]
submission_df = kaggletest_df.loc[:, submission_columns]

# Show the submission frame
submission_df

Unnamed: 0,StudentID,Dropout
0,344883,0
1,298782,1
2,311983,1
3,346945,0
4,313391,0
...,...,...
995,310102,0
996,329407,0
997,323071,1
998,356228,0


In [None]:
# Write the first Decision Tree submission into the submissions folder,
# leaving the DataFrame index out of the file
submission_file = path + "submissions/decision_tree_submission_1.csv"
submission_df.to_csv(submission_file, index = False)