In [None]:
# Mount Google Drive inside the Colab runtime so the CSV files read below
# are reachable through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install kmodes

Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans

In [None]:
# Folder on the mounted Drive that holds the OULAD CSV files.
path = '/content/drive/MyDrive/IIT Chicago /Coursework /OULAD_analysis/'
# path = '/content/drive/MyDrive/OULAD_analysis/'


def _read_oulad_table(file_name):
    """Read one CSV file located under ``path`` into a DataFrame."""
    return pd.read_csv(path + file_name)


# Raw tables, loaded in the same order as before.
assessments = _read_oulad_table("assessments.csv")
courses = _read_oulad_table("courses.csv")
student_assessment = _read_oulad_table("studentAssessment.csv")
student_info = _read_oulad_table("studentInfo.csv")
student_registration = _read_oulad_table("studentRegistration.csv")
student_vle = _read_oulad_table("studentVle.csv")
vle = _read_oulad_table("vle.csv")

# Hand-written lookup assigning every module code to a subject domain.
_module_to_domain = {
    "AAA": "Social Sciences",
    "BBB": "Social Sciences",
    "CCC": "STEM",
    "DDD": "STEM",
    "EEE": "STEM",
    "FFF": "STEM",
    "GGG": "Social Sciences",
}
domains = pd.DataFrame(
    {
        "code_module": list(_module_to_domain.keys()),
        "domain": list(_module_to_domain.values()),
    },
)

In [None]:
# Handling weights for modules GGG and FFF:
# every assessment of these modules that carries weight 0 receives the
# replacement weight listed here (GGG first, then FFF).
zero_weight_replacements = {'GGG': 11.11, 'FFF': 14.29}

for module_code, replacement_weight in zero_weight_replacements.items():
    is_zero_weight_row = (assessments['code_module'] == module_code) & (assessments['weight'] == 0)
    assessments.loc[is_zero_weight_row, 'weight'] = replacement_weight

# Early Dropout Prediction

In [None]:
def OrdMapping(df):
    """Ordinally encode the ``imd_band`` and ``highest_education`` columns.

    The input frame is left untouched: the encoding is applied to a copy, so
    the function can safely be re-run on the same frame (mapping an already
    encoded frame in place would turn every value into NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``imd_band`` and ``highest_education``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with both columns replaced by their integer codes.
        Values that are not keys of the mappings below (including missing
        values) become NaN.
    """
    # '10-20' is listed without a percent sign - presumably that is how the raw
    # data spells this band (verify against studentInfo.csv). The '10-20%'
    # spelling is accepted as well so either form receives the same code.
    imd_band_mapping = {'0-10%': 0, '10-20': 1, '10-20%': 1, '20-30%': 2, '30-40%': 3, '40-50%': 4,
                        '50-60%': 5, '60-70%': 6, '70-80%': 7, '80-90%': 8, '90-100%': 9}
    # NOTE(review): code 1 is skipped here; kept as-is so existing encoded values do not change.
    highest_education_mapping = {'No Formal quals': 0, 'Lower Than A Level': 2, 'A Level or Equivalent': 3,
                                 'HE Qualification': 4, 'Post Graduate Qualification': 5}

    encoded = df.copy()
    encoded['imd_band'] = encoded['imd_band'].map(imd_band_mapping)
    encoded['highest_education'] = encoded['highest_education'].map(highest_education_mapping)

    return encoded

In [None]:
# demographic cluster
# Group students by their demographic attributes with k-prototypes (mixed
# categorical / numerical clustering) and store the cluster id on student_info
# as 'demographic_cluster'.

categorical_columns = ['gender', 'highest_education', 'imd_band', 'age_band', 'disability', 'region']
numerical_columns = ['num_of_prev_attempts', 'studied_credits']

# Work on a copy restricted to the clustering inputs (key columns excluded).
cluster_stdInfo = student_info[categorical_columns + numerical_columns].copy()

# Integer-encode every categorical column; pandas assigns code -1 to missing values.
for col in categorical_columns:
    cluster_stdInfo[col] = cluster_stdInfo[col].astype('category').cat.codes

# Convert the DataFrame to a NumPy array
data_matrix = cluster_stdInfo.to_numpy()

# Positions of the categorical columns inside data_matrix, as required by fit_predict.
categorical_positions = [cluster_stdInfo.columns.get_loc(c) for c in categorical_columns]

# random_state fixes the seed: the recorded runs of this cell report a different
# cost per run, so without it the cluster assignment changes between executions.
# NOTE(review): the numerical columns are passed unscaled - confirm that is intended.
kproto = KPrototypes(n_clusters=5, init='Cao', n_init=5, verbose=1, random_state=42)

clusters = kproto.fit_predict(data_matrix, categorical=categorical_positions)

student_info['demographic_cluster'] = clusters

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 472, ncost: 12103055.784098083
Run: 1, iteration: 2/100, moves: 6, ncost: 12103052.978690052
Run: 1, iteration: 3/100, moves: 0, ncost: 12103052.978690052
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 981, ncost: 4449176.798128769
Run: 2, iteration: 2/100, moves: 20, ncost: 4448766.431946038
Run: 2, iteration: 3/100, moves: 0, ncost: 4448766.431946038
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 2372, ncost: 12103843.2150009
Run: 3, iteration: 2/100, moves: 44, ncost: 12103835.805208374
Run: 3, iteration: 3/100, moves: 0, ncost: 12103835.805208374
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/100, moves: 10238, nc

In [None]:
## early preds
# For every cut-off day in `cuts`, rebuild the per-student features from the
# records dated up to that day, then cross-validate four tree-based classifiers
# with and without SMOTE oversampling. Each (cut, model, sampling) combination
# appends one row to `results`:
# [cut, model name, "No"/"Yes", accuracy, precision, recall, f1, roc_auc].
cuts = [0, 30, 60, 90, 120, 150, 180, 210, 240, 270]
results = []

# One seed for every stochastic component so that a re-run reproduces the table.
RANDOM_STATE = 42
student_keys = ['code_module', 'code_presentation', 'id_student']

# Scoring
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score),
           'roc_auc' : 'roc_auc'}
# Order in which the mean metrics are written into each result row.
metric_names = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']

# Encoding
ordinal_features = ['imd_band','highest_education']
nominal_features = ['code_module','code_presentation','gender','region','age_band','disability']

# Model inputs and target
features = ['avgClicksPerDay', 'final_weighted_score', 'perc_ontime_sub', 'demographic_cluster',
            'behavioural_cluster', 'assesments_cluster']
target = ['isDropout']

# prepare models (cross_validate clones each estimator per fold, so the same
# instances can be reused for every cut and for both sampling variants)
# NOTE(review): verbose_eval is kept from the original call; confirm that
# LGBMClassifier actually needs/accepts it.
models = [
    ('DT', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('XGBoost', XGBClassifier(random_state=RANDOM_STATE)),
    ('LightGBM', LGBMClassifier(force_row_wise=True, verbose=-1, verbose_eval=False,
                                random_state=RANDOM_STATE)),
]

# merging student_registration, courses and domains, then student_info.
# None of these tables depends on the cut, so the merge is done once.
std_reg_courses = student_registration.merge(courses, on=["code_module","code_presentation"])
std_reg_courses_domain = (std_reg_courses
                          .merge(domains, on="code_module")
                          .drop(['date_registration','date_unregistration'], axis=1))
std_info_reg_courses_domain = student_info.merge(std_reg_courses_domain, on=student_keys)

for cut in cuts:

    print("Point in time:", cut)

    ## create final_weighted_score, perc_ontime_sub features
    # Only assessments submitted on or before the cut-off day are used.
    student_assessment_cut = student_assessment[student_assessment.date_submitted <= cut]

    assessmentsCombinedDf = student_assessment_cut.merge(assessments, on=['id_assessment'])
    assessmentsCombinedDf['weighted_score'] = assessmentsCombinedDf['score'] * assessmentsCombinedDf['weight'] / 100
    # 1 when the submission day is not later than the assessment's `date`, else 0.
    assessmentsCombinedDf['OntimeSubmission'] = np.where(
        assessmentsCombinedDf["date_submitted"] <= assessmentsCombinedDf["date"], 1, 0)

    # Per-student totals, broadcast back onto every assessment row.
    per_student = assessmentsCombinedDf.groupby(student_keys)
    assessmentsCombinedDf = assessmentsCombinedDf.assign(
        sum_weighted_score=per_student['weighted_score'].transform('sum'),
        sum_weight=per_student['weight'].transform('sum'),
        ontime_sum=per_student['OntimeSubmission'].transform('sum'),
        ontime_count=per_student['OntimeSubmission'].transform('count'),
    )

    # Collapse to one row per student; the totals are identical on every row of a group.
    assessmentsCombinedDf = assessmentsCombinedDf.groupby(student_keys, as_index=False).last()
    assessmentsCombinedDf['final_weighted_score'] = assessmentsCombinedDf['sum_weighted_score'] / assessmentsCombinedDf['sum_weight']
    assessmentsCombinedDf["perc_ontime_sub"] = assessmentsCombinedDf['ontime_sum'] / assessmentsCombinedDf['ontime_count']

    assessmentsCombinedDf = assessmentsCombinedDf.drop(
        ["id_assessment", "score", "date", "weight", "weighted_score", "sum_weighted_score",
         "sum_weight", "OntimeSubmission", "ontime_sum", "ontime_count", "date_submitted",
         "assessment_type"], axis=1)

    # Create avgClicksPerDay feature
    # Fix: the window now follows the cut-off. It was previously hard-coded to
    # `date <= 0`, so the click feature was the same at every point in time.
    student_vle_cut = student_vle[student_vle.date <= cut]

    vleCombinedDf = student_vle_cut.merge(vle, on=['code_module','code_presentation', 'id_site']).drop(['week_from', 'week_to'], axis=1)
    # Mean of sum_click over all of the student's interaction rows inside the window.
    vleCombinedDf['avgClicksPerDay'] = (vleCombinedDf.groupby(student_keys)['sum_click']
                                        .transform('mean'))
    vleCombinedDf = vleCombinedDf.groupby(student_keys, as_index=False).first()
    vleCombinedDf = vleCombinedDf.drop(["id_site", "date", "activity_type", "sum_click"], axis=1)

    # merging std_info_reg_courses_domain, vleCombinedDf, assessmentsCombinedDf
    # Default (inner) merges: students without any VLE row or any submission up
    # to the cut are not part of the modelling frame.
    std_info_reg_courses_domain_vle = std_info_reg_courses_domain.merge(vleCombinedDf, on=student_keys)
    finalCombinedDf = std_info_reg_courses_domain_vle.merge(assessmentsCombinedDf, on=student_keys)

    # Missing value handling
    finalCombinedDf['final_weighted_score'] = finalCombinedDf['final_weighted_score'].fillna(0)
    finalCombinedDf['imd_band'] = finalCombinedDf['imd_band'].fillna('50-60%')

    # behavioural cluster
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=5, random_state=RANDOM_STATE, n_init=10)
    finalCombinedDf['behavioural_cluster'] = kmeans.fit_predict(finalCombinedDf[['avgClicksPerDay']])

    #assesments cluster
    finalCombinedDf['assesments_cluster'] = kmeans.fit_predict(finalCombinedDf[['perc_ontime_sub',
                                                                            'final_weighted_score']])

    # ordinal encoding
    ordMapped_df = OrdMapping(finalCombinedDf)

    # One hot encoding
    onehotEncoded_df = pd.get_dummies(ordMapped_df, columns=nominal_features, drop_first=True, dtype=float)

    # Target variable creation
    onehotEncoded_df['isDropout'] = np.where(onehotEncoded_df["final_result"] == "Withdrawn", 1, 0)

    # final df
    final_df = onehotEncoded_df[features + target]

    # load dataset
    X_ = final_df[features].values
    y = final_df.isDropout.values

    # evaluate each model in turn: first on the data as-is ("No"), then with
    # SMOTE applied inside the pipeline ("Yes"), i.e. to the training folds only.
    # NOTE(review): KFold is used without shuffling or stratification, as in the
    # original; consider stratified, shuffled folds if the row order is not random.
    for sampling_label in ("No", "Yes"):
        for name, model in models:
            if sampling_label == "Yes":
                estimator = make_pipeline(SMOTE(random_state=RANDOM_STATE), model)
            else:
                estimator = model

            kfold = model_selection.KFold(n_splits=5)
            cv_results = model_selection.cross_validate(estimator, X_, y, cv=kfold, scoring=scoring)

            mean_metrics = [np.mean(cv_results["test_" + metric]) for metric in metric_names]
            results.append([cut, name, sampling_label] + mean_metrics)

Point in time: 0




Point in time: 30




Point in time: 60




Point in time: 90




Point in time: 120




Point in time: 150




Point in time: 180




Point in time: 210




Point in time: 240




Point in time: 270




In [None]:
# Show the collected cross-validation metrics as a table, one row per
# (cut, model, sampling) combination.
results_table_columns = ["Cut", "Model", "Sampling",
                         "mean_accuracy", "mean_precision", "mean_recall",
                         "mean_f1_score", "mean_roc_auc"]
results_table = pd.DataFrame(data=results, columns=results_table_columns)
results_table

[[0,
  'DT',
  'No',
  0.6285897435897436,
  0.3123076923076923,
  0.34849983981862537,
  0.32394048878547005,
  0.5365268352350382],
 [0,
  'RF',
  'No',
  0.6898534798534798,
  0.32191766566766566,
  0.17867184997166025,
  0.22192223967003416,
  0.563423094520919],
 [0,
  'XGBoost',
  'No',
  0.6573992673992674,
  0.2946041934103551,
  0.21803395845141577,
  0.24500724269616417,
  0.546907509251134],
 [0,
  'LightGBM',
  'No',
  0.6860805860805861,
  0.2781512605042017,
  0.16737187543639157,
  0.20427936050006013,
  0.5438605981253185],
 [0,
  'DT',
  'Yes',
  0.6036446886446887,
  0.29205396515578685,
  0.39030615178622774,
  0.33069584666289326,
  0.5304081735808245],
 [0,
  'RF',
  'Yes',
  0.6418681318681319,
  0.2965671971706454,
  0.3178000525723486,
  0.30601735628051413,
  0.5661991044547348],
 [0,
  'XGBoost',
  'Yes',
  0.6189377289377289,
  0.29238152198678513,
  0.36599472633628233,
  0.3243864591055102,
  0.5673221487581485],
 [0,
  'LightGBM',
  'Yes',
  0.636172161172

In [None]:
# Write the same metrics table to an Excel workbook in the working directory.
export_columns = ["Cut", "Model", "Sampling",
                  "mean_accuracy", "mean_precision", "mean_recall",
                  "mean_f1_score", "mean_roc_auc"]
export_table = pd.DataFrame(data=results, columns=export_columns)
export_table.to_excel("results.xlsx")