In [1]:
# Data import and manipulation
import pandas as pd
import numpy as np
import uuid
import os 
from collections import defaultdict
from datetime import date
import datetime
from pandasql import sqldf
import matplotlib.pyplot as plt
import seaborn as sns
import statistics


# Feature engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Model Building
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV


# Model Evaluation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score
from sklearn import metrics 
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.utils import resample
from scipy.special import softmax
import plot_utils as easyplt 

## Import Files

In [3]:
# Helper that turns a target-course letter grade into a pass/fail label
def pf_flag(grade):
    """Return the string '-1' for a passing grade and the string '1' otherwise.

    Passing grades are A, A-, B+, B, B-, C+, C and P. Every other value,
    including missing grades, is labelled '1'.
    """
    passing_grades = ('A', 'A-', 'B', 'B+', 'B-', 'C', 'C+', 'P')
    return '-1' if grade in passing_grades else '1'


# Tags each split with a numeric pass/fail flag and its origin, then stacks the splits
def union_data(train, valid, test):
    """Label the three splits and return them concatenated row-wise.

    Each input frame is modified in place: it gains ``pf_flag`` (1 where
    ``Pass_Fail`` equals 'pass', otherwise -1) and ``source`` ('train',
    'validate' or 'test'). The result is ``pd.concat`` of the three frames in
    that order; the original row indexes are kept.
    """
    def _flag(row):
        return 1 if row['Pass_Fail'] == 'pass' else -1

    tagged = []
    for frame, origin in ((train, 'train'), (valid, 'validate'), (test, 'test')):
        frame['pf_flag'] = frame.apply(_flag, axis=1)
        frame['source'] = origin
        tagged.append(frame)

    return pd.concat(tagged)

# Identify the courses with highest enrollment (>= 20% of students enrolled)
def id_common_crs(train_df):
    """Return the courses taken by at least 20% of the distinct students.

    Students are counted by distinct ``uuid`` per ``unique_course``. The
    returned list is ordered from the most to the least enrolled course.
    """
    total_students = train_df.uuid.nunique()

    # One row per course with its distinct-student count, largest first
    per_course = (
        train_df.groupby(['unique_course'])['uuid']
        .nunique()
        .rename('nunique')
        .reset_index()
        .sort_values(by=['nunique'], ascending=False)
    )

    # Share of all students (in percent) enrolled in each course
    share = (per_course['nunique'] / total_students) * 100

    return per_course.loc[share >= 20, 'unique_course'].tolist()

# Function imports file and creates df; drops unnecessary columns and converts to correct data types
# Function also returns list of common courses taken from the training sheet
def import_file(file_path):
    """Load the 'train', 'validate' and 'test' sheets of one course workbook.

    For every sheet the function:
      * casts ``enrl_term_id`` to int (each sheet from its OWN column),
      * adds ``pf_flag`` (-1 where ``Pass_Fail`` == 'pass', else 1),
      * adds ``target_crs_pass_fail`` by applying ``pf_flag()`` to
        ``target_crs_grade``,
      * adds the four pass/fail ratios for UCC and gateway courses,
      * drops ``Pass_Fail`` and ``target_crs_grade``.

    Parameters
    ----------
    file_path : str
        Workbook path, resolved after changing into ``../data``.

    Returns
    -------
    tuple
        ``(train_df, validate_df, test_df, common_courses)`` where
        ``common_courses`` comes from ``id_common_crs`` on the training sheet.
    """
    # NOTE: this changes the process-wide working directory and it stays
    # changed after the call; file_path is interpreted relative to it.
    os.chdir('../data')

    # Context manager guarantees the workbook handle is closed after reading
    with pd.ExcelFile(file_path) as wkbk:
        t1 = pd.read_excel(wkbk, sheet_name='train')
        v = pd.read_excel(wkbk, sheet_name='validate')
        t2 = pd.read_excel(wkbk, sheet_name='test')

    for split in (t1, v, t2):
        # Each split converts its own term ids. Previously validate/test were
        # overwritten with the TRAIN term ids (index-aligned), corrupting them.
        split['enrl_term_id'] = split['enrl_term_id'].astype(int)

        # Pass/fail flag for the rows' unique_course
        split['pf_flag'] = split.apply(lambda x: -1 if (x['Pass_Fail'] == 'pass') else 1, axis=1)

        # Pass/fail label for the target course
        split['target_crs_pass_fail'] = split['target_crs_grade'].apply(pf_flag)

        # Ratios of passed / failed UCC and gateway courses (NaN or inf when
        # the "taken" count is zero, exactly as plain division yields)
        split['perc_ucc_passed'] = split['UCCs_passed'] / split['UCCs_taken']
        split['perc_gateway_passed'] = split['gateways_passed'] / split['gateways_taken']
        split['perc_ucc_failed'] = split['UCCs_failed'] / split['UCCs_taken']
        split['perc_gateway_failed'] = split['gateways_failed'] / split['gateways_taken']

        # Raw grade columns are no longer needed once the flags exist
        split.drop(['Pass_Fail', 'target_crs_grade'], axis=1, inplace=True)

    # Common courses are derived from training data only
    crs_ = id_common_crs(t1)

    return t1, v, t2, crs_

## Manipulate Data and prep for models

In [4]:
# Split column names into categorical (object) and numerical groups
def identify_cols(df):
    """Return ``(object_columns, numeric_columns)`` for ``df``.

    Object-dtype columns are listed unless they are identifiers or targets
    ('uuid', 'unique_course', 'target_crs_pass_fail', 'target_crs').
    Every other non-boolean column is listed as numeric unless it is
    'pf_flag' or 'target_crs_pass_fail'. Column order follows ``df``.
    """
    skip_when_object = ('uuid', 'unique_course', 'target_crs_pass_fail', 'target_crs')
    skip_when_numeric = ('pf_flag', 'target_crs_pass_fail')

    obj_list, num_list = [], []

    for name, kind in df.dtypes.to_dict().items():
        if kind == 'O':
            # Object columns can only ever land in the object list
            if name not in skip_when_object:
                obj_list.append(name)
            continue

        if kind != 'bool' and name not in skip_when_numeric:
            num_list.append(name)

    return obj_list, num_list


# One hot encode train, validation and test data
def one_hot_encode(train_data, valid_data, test_data, obj_list):
    """One-hot encode the ``obj_list`` columns of the three splits.

    The encoder is fitted on the training split only; categories that appear
    only in validation/test are encoded as all zeros
    (``handle_unknown='ignore'``).

    Parameters
    ----------
    train_data, valid_data, test_data : pandas.DataFrame
        Splits to encode; they are not modified.
    obj_list : list of str
        Categorical columns to replace with indicator columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, valid_df, test_df)``: the non-encoded columns followed by
        the indicator columns, each keeping its input's row index.
    """
    # Nothing to encode: the encoder cannot be fitted on zero columns
    if len(obj_list) == 0:
        return train_data.copy(), valid_data.copy(), test_data.copy()

    # `sparse` was renamed `sparse_output` in scikit-learn 1.2 and later
    # removed; fall back to the old keyword on older installations.
    try:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

    ohe.fit(train_data[obj_list])
    encoded_names = ohe.get_feature_names_out()

    def _encode(frame):
        # Reuse the source index so the column-wise concat lines rows up even
        # when the frame does not carry a default 0..n-1 index.
        encoded = pd.DataFrame(ohe.transform(frame[obj_list]),
                               columns=encoded_names, index=frame.index)
        remaining = frame.drop(obj_list, axis=1)
        return pd.concat([remaining, encoded], axis=1)

    return _encode(train_data), _encode(valid_data), _encode(test_data)


# Min-max scale the numerical columns of all three splits
def scale_data(train_data, valid_data, test_data, num_cols_list):
    """Scale ``num_cols_list`` to [0, 1] using statistics of the training split.

    Side effect: the three input frames are sorted in place by
    ``enrl_term_id`` (latest term first). The returned frames hold the
    non-numerical columns followed by the scaled numerical columns, have a
    fresh 0..n-1 index, and no longer contain ``enrl_term_id``.
    """
    # Sort every split by enrollment term; latest to oldest
    for frame in (train_data, valid_data, test_data):
        frame.sort_values(by=['enrl_term_id'], inplace=True, ascending=False)

    scaler = MinMaxScaler(feature_range=(0, 1))

    def _rescale(frame, transform):
        numeric_part = frame[num_cols_list].copy()

        # Everything that is not scaled, re-indexed to line up with the scaled part
        untouched = frame.drop(num_cols_list, axis=1, inplace=False).copy()
        untouched.reset_index(inplace=True, drop=True)

        scaled_part = pd.DataFrame(transform(numeric_part), columns=num_cols_list)

        combined = pd.concat([untouched, scaled_part], axis=1)
        combined.reset_index(inplace=True, drop=True)
        return combined

    # The scaler learns min/max from train only; validation and test reuse it
    train_all = _rescale(train_data, scaler.fit_transform)
    valid_all = _rescale(valid_data, scaler.transform)
    test_all = _rescale(test_data, scaler.transform)

    # The term id was only needed for ordering
    for combined in (train_all, valid_all, test_all):
        combined.drop('enrl_term_id', axis=1, inplace=True)

    return train_all, valid_all, test_all


# Pivot unique_course into one column per common course
def pivot_courses(df, crs_list):
    """Attach one pass/fail column per common course to every row of ``df``.

    Rows whose ``unique_course`` is in ``crs_list`` are pivoted to one row per
    ``uuid`` with a column per course holding the mean ``pf_flag``. That table
    is left-joined back onto ``df`` (minus ``unique_course``) by ``uuid``.
    All missing values in the result are replaced by 0, and ``target_crs`` and
    ``pf_flag`` are dropped.
    """
    is_common = df['unique_course'].isin(crs_list)
    common_rows = df.loc[is_common].copy()

    # Everything except the course column itself
    without_course = df.drop(columns='unique_course')

    # uuid x course grid of pf_flag; courses a student never took become 0
    course_grid = (
        common_rows
        .pivot_table(index='uuid', values='pf_flag', columns='unique_course')
        .reset_index()
        .rename_axis(None, axis=1)
        .fillna(0)
    )

    merged = without_course.merge(course_grid, how='left', on=['uuid'],
                                  suffixes=('_og', '_stu'))

    return merged.fillna(0).drop(['target_crs', 'pf_flag'], axis=1)


# Check to ensure that test data looks like train data after course pivot
def check_data_shape(train_df, test_df):
    """Give ``test_df`` the same columns, in the same order, as ``train_df``.

    Columns present in ``train_df`` but missing from ``test_df`` are added to
    ``test_df`` in place, filled with 0. The returned frame then lists the
    training columns in training order, followed by any columns that exist
    only in ``test_df``.

    The ordering matters because later steps select features by position and
    convert frames to bare arrays; simply appending the missing columns at the
    end would leave the splits with differently ordered features.
    """
    # Add columns to test data if they existed in train data to maintain shape
    for col in train_df.columns:
        if col not in test_df.columns:
            test_df[col] = 0

    train_cols = list(train_df.columns)
    known = set(train_cols)
    extra_cols = [col for col in test_df.columns if col not in known]

    return test_df[train_cols + extra_cols]


# Ensure that target variable is the last variable
def make_target_last(df):
    """Return ``df`` with ``target_crs_pass_fail`` last and one row per uuid.

    The input frame is not modified. Duplicate ``uuid`` rows are removed,
    keeping the first occurrence; the surviving rows keep their index labels.
    """
    last = df['target_crs_pass_fail'].copy()
    other = df.drop('target_crs_pass_fail', axis=1, inplace=False)

    # `axis` must be passed by keyword: pandas 2.x rejects it positionally
    new_df = pd.concat([other, last], axis=1)

    return new_df.drop_duplicates(subset="uuid", keep='first')


# Splits a prepared frame into the X (attribute) and y (target) arrays
def create_arrays(df):
    """Return ``(X, y)`` arrays for ``df``.

    ``y`` holds the ``target_crs_pass_fail`` values and ``X`` every other
    column. Side effect: when ``df`` has a ``uuid`` column it is dropped from
    ``df`` in place before the split, so it is never part of ``X``.
    """
    if 'uuid' in df.columns:
        df.drop('uuid', axis=1, inplace=True)

    target_name = 'target_crs_pass_fail'

    y_ = df[target_name].copy().values
    X_ = df.drop(target_name, axis=1).copy().values

    return X_, y_

## Pre Model Building

In [5]:
# Create dummy classifier
def run_stratefied_dummy(X, y, random_state=None):
    """Score a stratified ``DummyClassifier`` baseline on every course.

    For each name in ``file_names`` a dummy classifier is fitted and evaluated
    on the same ``X[name]`` / ``y[name]`` arrays, and the ROC AUC is printed.

    Parameters
    ----------
    X, y : dict
        Feature and label arrays keyed by course name. Labels are expected to
        be the strings '-1' / '1' (``pos_label='1'`` is used for precision and
        recall) and both classes must occur, since per-class F1 is indexed.
    random_state : int or None, optional
        Seed for the dummy classifier. When None, the notebook-level ``rng``
        is used if it exists; otherwise the seed falls back to 1.

    Returns
    -------
    dict
        Mean over courses of 'f1_positive', 'f1_negative', 'f1_macro',
        'precision' and 'recall'.
    """
    if random_state is None:
        # `rng` is not defined by any cell shown in this notebook; fall back to
        # a fixed seed so the baseline runs (and is reproducible) on a fresh kernel.
        try:
            random_state = rng
        except NameError:
            random_state = 1

    f1_macro = []
    f1_pos = []
    f1_neg = []
    precision = []
    recall = []

    for file in file_names:

        strat = DummyClassifier(strategy="stratified", random_state=random_state)
        strat.fit(X[file], y[file])
        strat_pred = strat.predict(X[file])

        # Per-class F1 in sorted label order: index 0 -> '-1', index 1 -> '1'
        f1_labels = f1_score(y[file], strat_pred, zero_division=0, average=None)

        f1_macro.append(f1_score(y[file], strat_pred, average='macro'))
        f1_pos.append(f1_labels[1])
        f1_neg.append(f1_labels[0])
        precision.append(precision_score(y[file], strat_pred, pos_label='1', zero_division=1))
        recall.append(recall_score(y[file], strat_pred, pos_label='1', zero_division=1))

        auc = metrics.roc_auc_score(y[file].astype(int), strat_pred.astype(int))
        print(file, " ", auc)

    averages = {'f1_positive': statistics.mean(f1_pos), 'f1_negative': statistics.mean(f1_neg),
                'f1_macro': statistics.mean(f1_macro),
                'precision': statistics.mean(precision),
                'recall': statistics.mean(recall)}

    return averages

# Import, Preprocess and Prepare Data for model

In [6]:
# High enrollment courses list.
# Each entry is the stem of a workbook that all_files() loads as 'files/<name>.xlsx',
# and it is the key used by every per-course dictionary in this notebook.
file_names = [ 'PSY3211', 'PSY3024', 'PSY3215', 'PSY4931', 'MAR3023', 'ENC1102', 'EAB3002', 
              'QMB3200','MAN3025', 'FIN3403', 'DEP3305', 'GEB3003', 'QMB4680',  
              'CCJ4014', 'EXP3523', 'BUL4310', 'CCJ3628', 'MAN4720', 'SOP3004', 'CLP4146', 
              'BSC2023', 'CJL4064', 'CLP4374']


# Load every course workbook into per-course dictionaries
def all_files():
    """Import the workbook of every course in ``file_names``.

    Returns four dicts keyed by course name: the train, validation and test
    frames, and the list of common courses, as produced by ``import_file``
    for 'files/<course>.xlsx'.
    """
    train_dict, valid_dict, test_dict, courses_list = {}, {}, {}, {}

    for course in file_names:
        workbook_path = 'files/{0}.xlsx'.format(course)
        (train_dict[course],
         valid_dict[course],
         test_dict[course],
         courses_list[course]) = import_file(workbook_path)

    return train_dict, valid_dict, test_dict, courses_list

In [7]:
# Run the full preprocessing chain for every course
def pre_process_all(train_, valid_, test_, courses):
    """Preprocess the train/validation/test frames of every course.

    For each name in ``file_names`` the steps are: detect categorical and
    numerical columns on the training frame, one-hot encode, min-max scale,
    pivot the common courses, add to validation/test any column they lack
    relative to train, and move the target column to the end.

    Returns three dicts (train, validation, test) keyed by course name.
    """
    train_pro, valid_pro, test_pro = {}, {}, {}

    for course in file_names:
        # Column roles are decided from the training frame only
        object_cols, numeric_cols = identify_cols(train_[course])

        encoded = one_hot_encode(train_[course], valid_[course], test_[course], object_cols)
        scaled = scale_data(*encoded, numeric_cols)

        train_piv, valid_piv, test_piv = (
            pivot_courses(part, courses[course]) for part in scaled
        )

        # Validation and test must expose every column the training frame has
        train_mid = train_piv.copy()
        valid_mid = check_data_shape(train_piv, valid_piv)
        test_mid = check_data_shape(train_piv, test_piv)

        train_pro[course] = make_target_last(train_mid)
        valid_pro[course] = make_target_last(valid_mid)
        test_pro[course] = make_target_last(test_mid)

    return train_pro, valid_pro, test_pro
    
# Create datasets
def create_datasets(t, v, te, crs_list, first_enrl_col=12):
    """Build three feature-set variants per course from the preprocessed frames.

    Parameters
    ----------
    t, v, te : dict
        Preprocessed train / validation / test frames keyed by course name.
    crs_list : dict
        Common-course names per course (the pivoted course columns).
    first_enrl_col : int, optional
        Position of the first column kept in the "enrollment + courses"
        variant; everything before it is left out. Defaults to 12, the offset
        this notebook has always used.

    Returns
    -------
    tuple of nine dicts
        ``(train_enrl, valid_enrl, test_enrl, train_courses, valid_courses,
        test_courses, train_vars, valid_vars, test_vars)``:
          * enrl    - columns from ``first_enrl_col`` to the end,
          * courses - every common-course column plus the target,
          * vars    - the selected ANOVA features, the course columns and the
                      target.
    """
    target_col = 'target_crs_pass_fail'

    #Enrollment + courses
    train_enrl = {}
    valid_enrl = {}
    test_enrl = {}

    #Courses only
    train_courses = {}
    valid_courses = {}
    test_courses = {}

    #ANOVA FEATURES
    cols = [ 'UCCs_failed', 'total_courses_passed', 'target_avg_grade_all', 'UCCs_passed', 'term_gpa', 'perc_ucc_passed', 
            'creds_attp_term', 'total_courses_failed', 'target_avg_grd_term', 'perc_ucc_failed', 'perc_gateway_passed', 
            'perc_gateway_failed', 'INST_GPA']

    train_vars = {}
    valid_vars = {}
    test_vars = {}

    for file in file_names:

        '''ENROLLMENT VARS ALL'''
        train_enrl[file] = t[file].iloc[:, first_enrl_col:].copy()
        valid_enrl[file] = v[file].iloc[:, first_enrl_col:].copy()
        test_enrl[file] = te[file].iloc[:, first_enrl_col:].copy()

        '''COURSE VARS ONLY'''
        # Select the course columns by name. Slicing the last len(crs_list)
        # columns by position returned the target plus only len(crs_list) - 1
        # courses, because the target itself occupies the last position.
        # Using the training order for all three splits keeps them aligned.
        wanted = set(crs_list[file])
        course_cols = [col for col in t[file].columns if col in wanted]
        crs_and_target = course_cols + [target_col]

        train_crs = t[file][crs_and_target].copy()
        valid_crs = v[file][crs_and_target].copy()
        test_crs = te[file][crs_and_target].copy()

        train_courses[file] = train_crs
        valid_courses[file] = valid_crs
        test_courses[file] = test_crs

        '''VAR SELECTION DATA'''
        # Selected variables followed by the course columns and the target
        train_vars[file] = pd.concat([t[file][cols].copy(), train_crs], axis=1)
        valid_vars[file] = pd.concat([v[file][cols].copy(), valid_crs], axis=1)
        test_vars[file] = pd.concat([te[file][cols].copy(), test_crs], axis=1)

    return train_enrl, valid_enrl, test_enrl, train_courses, valid_courses, test_courses, train_vars, valid_vars, test_vars

def create_arrays_all(t, v, te):
    """Convert every course's train/validation/test frame into X and y arrays.

    Returns six dicts keyed by course name, in the order
    ``X_train, y_train, X_valid, y_valid, X_test, y_test``. The conversion is
    delegated to ``create_arrays``, which drops ``uuid`` from the frames in
    place.
    """
    X_train_arr, y_train_arr = {}, {}
    X_valid_arr, y_valid_arr = {}, {}
    X_test_arr, y_test_arr = {}, {}

    for course in file_names:
        X_train_arr[course], y_train_arr[course] = create_arrays(t[course])
        X_valid_arr[course], y_valid_arr[course] = create_arrays(v[course])
        X_test_arr[course], y_test_arr[course] = create_arrays(te[course])

    return X_train_arr, y_train_arr, X_valid_arr, y_valid_arr, X_test_arr, y_test_arr


def create_train_arrays_all(t):
    """Convert every course's training frame into X and y arrays.

    Returns two dicts keyed by course name: ``X_train`` and ``y_train``, as
    produced by ``create_arrays``.
    """
    X_train_arr, y_train_arr = {}, {}

    for course in file_names:
        X_train_arr[course], y_train_arr[course] = create_arrays(t[course])

    return X_train_arr, y_train_arr

## Create sets to test

In [8]:
train_, valid_, test_, courses = all_files()

In [9]:
# Preprocess files and create dictionaries with data ready for model ingestion
train_p, valid_p, test_p = pre_process_all(train_, valid_, test_, courses)

In [10]:
# Build the per-course dataset variants for comparison:
# enrollment + courses, courses only, and selected variables + courses
(train_enrl_crs, valid_enrl_crs, test_enrl_crs,
 train_crs, valid_crs, test_crs,
 train_sel_var, valid_sel_var, test_sel_var) = create_datasets(train_p, valid_p, test_p, courses)

In [11]:
# Create arrays for courses only
X_train_crs, y_train_crs, X_valid_crs, y_valid_crs, X_test_crs, y_test_crs = create_arrays_all(train_crs, valid_crs, test_crs)

# Create arrays for enrollment + courses
X_train_enrl, y_train_enrl, X_valid_enrl, y_valid_enrl, X_test_enrl, y_test_enrl = create_arrays_all(train_enrl_crs, valid_enrl_crs, test_enrl_crs)

# Create arrays for full data set
#X_train_full, y_train_full, X_valid_full, y_valid_full, X_test_full, y_test_full = create_arrays_all(train_p, valid_p, test_p)

# Create arrays for variables selected
X_train_select, y_train_select, X_valid_select, y_valid_select, X_test_select, y_test_select = create_arrays_all(train_sel_var, valid_sel_var, test_sel_var)


## Checking training/valid/test sizes

In [12]:
t_course = 'CLP4374'

In [13]:
train_p[t_course].groupby('target_crs_pass_fail')['target_crs_pass_fail'].count()

target_crs_pass_fail
-1    1383
1       72
Name: target_crs_pass_fail, dtype: int64

In [14]:
train_p[t_course].shape

(1455, 70)

## Run baselines - on enrollment + courses dataset

### Enrollment + courses

In [None]:
strat_baseline  = run_stratefied_dummy(X_train_enrl, y_train_enrl)

In [None]:
strat_baseline

# Model Hyperparameter Tuning

## Making sure to use predefined validation set

In [29]:
# Hyperparameter grids per model family (for use with imbalanced data)
svc_params = {
              'C': [1, 10, 100], 
              'gamma': [0.001],
              'kernel': ['linear','rbf', 'poly'],
              'class_weight': ['balanced'] }

# The comma between the two entries was missing, which made the whole cell a SyntaxError
tree_params = {'criterion': ["gini"],
               'splitter': ["best", "random"]
              }

ada_params = {'learning_rate': [0.01, 0.05, 0.1],
              'algorithm': ['SAMME', 'SAMME.R'],
              'n_estimators' : [50,100]}


# For use with pipelines: make_pipeline names each step after its lower-cased
# class, and GridSearchCV addresses step parameters as '<step>__<param>'
svc_params_adj = {'svc__' + key: value for key, value in svc_params.items()}
tree_params_adj = {'decisiontreeclassifier__' + key: value for key, value in tree_params.items()}
ada_params_adj = {'adaboostclassifier__' + key: value for key, value in ada_params.items()}

# Function runs grid search on single course after performing SMOTE on the data set to deal with class imbalances
def run_grid_search_SMOTE(model, params, X_train, y_train, X_valid, y_valid, file):
    """Tune ``model`` for one course on the predefined train/validation split.

    The training and validation arrays are stacked and a ``PredefinedSplit``
    marks the validation rows as the single test fold. The model is wrapped in
    an imbalanced-learn pipeline with SMOTE and tuned with ``GridSearchCV``,
    scored by the F1 of class 1. Labels are cast to int before fitting.

    After the search, recursive feature elimination (``RFECV``) is run on the
    final pipeline step when that estimator exposes ``feature_importances_``
    or ``coef_``; otherwise it is skipped.

    Parameters
    ----------
    model : estimator
        Classifier placed after SMOTE in the pipeline.
    params : dict
        Grid for ``GridSearchCV``, with pipeline-prefixed keys.
    X_train, y_train, X_valid, y_valid : array-like
        Arrays for this course.
    file : str
        Course name; used only in the "RFECV skipped" message.

    Returns
    -------
    tuple
        ``(best_params, mean_train, mean_test, best_estimator, rfe_results)``.
        ``mean_train`` / ``mean_test`` average the scores over all grid
        points. ``rfe_results`` is a dict with keys 'rank', 'supp' and 'feat',
        or None when RFECV was skipped.
    """
    '''Cross Validation Predefined Split'''
    X = np.vstack((X_train, X_valid))
    # -1 = always in training, 0 = member of the single validation fold
    test_fold = [-1] * X_train.shape[0] + [0] * X_valid.shape[0]
    y = np.concatenate([y_train, y_valid]).astype(int)
    ps = PredefinedSplit(test_fold)

    '''Create Pipeline'''
    imba_pipeline = make_pipeline(SMOTE(random_state=1),
                                  model)

    '''Create scorer object'''
    def adjusted_f1(y_true, y_pred):
        # Labels are ints here (cast above), so the positive label must be the
        # int 1; the string '1' is not among the labels and makes scoring fail.
        return f1_score(y_true, y_pred, zero_division=0, average='binary', pos_label=1)

    score = make_scorer(adjusted_f1, greater_is_better=True)

    '''Implementing Gridsearch'''
    grid_imba = GridSearchCV(imba_pipeline, param_grid=params, cv=ps, scoring=score, refit=True,
                             return_train_score=True)
    grid_imba.fit(X, y)

    mean_train = statistics.mean(grid_imba.cv_results_['mean_train_score'])
    mean_test = statistics.mean(grid_imba.cv_results_['mean_test_score'])

    '''Feature Selection'''
    # Use the pipeline's last step rather than a hard-coded step name so the
    # function also works for the SVC and decision-tree pipelines.
    final_estimator = grid_imba.best_estimator_.steps[-1][1]
    can_rank = (hasattr(final_estimator, 'feature_importances_')
                or hasattr(final_estimator, 'coef_'))

    rfe_results = None
    if can_rank:
        rfe = RFECV(final_estimator, step=1, verbose=1, cv=ps, min_features_to_select=10)
        rfe_selector = rfe.fit(X, y)
        print(rfe_selector.ranking_)
        print(rfe_selector.support_)
        print(rfe_selector.n_features_)

        rfe_results = {'rank': rfe_selector.ranking_, 'supp': rfe_selector.support_,
                       'feat': rfe_selector.n_features_}
    else:
        print('RFECV skipped for {0}: {1} exposes neither feature_importances_ nor coef_'.format(
            file, type(final_estimator).__name__))

    return grid_imba.best_params_, mean_train, mean_test, grid_imba.best_estimator_, rfe_results


# Runs the SMOTE grid search for every course and collects the outcomes
def gridsearch_all_SMOTE(model_chosen, params, X_train, y_train, X_valid, y_valid):
    """Call ``run_grid_search_SMOTE`` for each course in ``file_names``.

    Returns five values: a list of best parameters, a list of mean train
    scores, a list of mean validation scores (all in ``file_names`` order),
    a dict of best estimators keyed by course, and a dict of RFE results
    keyed by course.
    """
    all_results, train_scores, test_scores = [], [], []
    estimator, rfe = {}, {}

    for course in file_names:
        outcome = run_grid_search_SMOTE(model_chosen, params,
                                        X_train[course], y_train[course],
                                        X_valid[course], y_valid[course], course)
        best_params, train_mean, test_mean, best_model, rfe_summary = outcome

        all_results.append(best_params)
        train_scores.append(train_mean)
        test_scores.append(test_mean)
        estimator[course] = best_model
        rfe[course] = rfe_summary

    return all_results, train_scores, test_scores, estimator, rfe

### ADABOOST WITH CRS FEATURES

In [None]:
# Decision stump used as AdaBoost's weak learner
estimator = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=1, random_state=1)
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results00, train_scores00, test_scores00, models00, rfe00 = gridsearch_all_SMOTE(
    AdaBoostClassifier(base_estimator = estimator, random_state = 1), ada_params_adj,
    X_train_crs, y_train_crs, X_valid_crs, y_valid_crs)

### ADABOOST WITH ENRL FEATURES

In [30]:
# Decision stump used as AdaBoost's weak learner
estimator = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=1, random_state=1)
# gridsearch_all_SMOTE returns five values; the fifth (per-course RFE results) is
# kept as `rfe_ft`, the name the "Collect Significant Features" cells read
results0, train_scores0, test_scores0, models0, rfe_ft = gridsearch_all_SMOTE(
    AdaBoostClassifier(base_estimator = estimator, random_state = 1), ada_params_adj,
    X_train_enrl, y_train_enrl, X_valid_enrl, y_valid_enrl)

### ADABOOST WITH SELECTED ANOVA FEATURES

In [71]:
# Decision stump used as AdaBoost's weak learner
estimator = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=1, random_state=1)
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results11, train_scores11, test_scores11, models11, rfe11 = gridsearch_all_SMOTE(
    AdaBoostClassifier(base_estimator = estimator, random_state = 1), ada_params_adj,
    X_train_select, y_train_select, X_valid_select, y_valid_select)

### SVM WITH ALL ENROLLMENT FEATURES

In [34]:
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results2, train_scores2, test_scores2, models2, rfe2 = gridsearch_all_SMOTE(
    SVC(random_state=1, probability=True), svc_params_adj,
    X_train_enrl, y_train_enrl, X_valid_enrl, y_valid_enrl)

### SVM WITH SELECTED ANOVA FEATURES 

In [None]:
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results31, train_scores31, test_scores31, models31, rfe31 = gridsearch_all_SMOTE(
    SVC(random_state=1, probability=True), svc_params_adj,
    X_train_select, y_train_select, X_valid_select, y_valid_select)

### SVM WITH ALL COURSE FEATURES

In [None]:
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results4, train_scores4, test_scores4, models4, rfe4 = gridsearch_all_SMOTE(
    SVC(random_state=1, probability=True), svc_params_adj,
    X_train_crs, y_train_crs, X_valid_crs, y_valid_crs)

### DECISION TREE WITH SELECTED FEATURES

In [None]:
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results5, train_scores5, test_scores5, models5, rfe5 = gridsearch_all_SMOTE(
    DecisionTreeClassifier(random_state=1), tree_params_adj,
    X_train_select, y_train_select, X_valid_select, y_valid_select)

### DECISION TREE WITH COURSES ONLY

In [None]:
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results6, train_scores6, test_scores6, models6, rfe6 = gridsearch_all_SMOTE(
    DecisionTreeClassifier(random_state=1), tree_params_adj,
    X_train_crs, y_train_crs, X_valid_crs, y_valid_crs)

### DECISION TREE WITH ENROLLMENT ONLY

In [81]:
# gridsearch_all_SMOTE returns five values; the fifth is the per-course RFE result
results7, train_scores7, test_scores7, models7, rfe7 = gridsearch_all_SMOTE(
    DecisionTreeClassifier(random_state=1), tree_params_adj,
    X_train_enrl, y_train_enrl, X_valid_enrl, y_valid_enrl)

# TESTING MODELS

In [77]:
# Use this function to get the test scores after hyper parameter tuning and feature selection
def get_test_scores(model_dict, X_train, y_train, X_test, y_test, return_sig_feats=False):
    """Refit each course's tuned model on train data and score it on test data.

    For every name in ``file_names`` the estimator in ``model_dict`` is fitted
    on the training arrays, used to predict the test arrays, and the ROC AUC
    of the hard predictions is printed.

    Parameters
    ----------
    model_dict : dict
        Tuned estimators keyed by course name.
    X_train, y_train, X_test, y_test : dict
        Arrays keyed by course name. Labels are expected to be the strings
        '-1' / '1' (``pos_label='1'`` is used) and both classes must appear in
        the test labels or predictions, since per-class F1 is indexed.
    return_sig_feats : bool, optional
        When True a fifth value is returned: per-course feature importances.

    Returns
    -------
    tuple
        ``(all_scores, averages, all_preds, all_probs)`` and, if requested,
        ``all_sig_feats``.
          * all_scores    - per-course dict of f1_pos, f1_neg, f1_macro,
                            precision and recall,
          * averages      - mean of each metric over the courses,
          * all_preds     - predicted labels per course,
          * all_probs     - ``predict_proba`` output per course, for models
                            that provide it,
          * all_sig_feats - ``{'scores': feature_importances_ or None}`` taken
                            from the final estimator of each fitted model.
    """
    precision = []
    recall = []
    f1_neg = []
    f1_pos = []
    f1_macro = []

    all_scores = {}
    all_probs = {}
    all_preds = {}
    all_sig_feats = {}

    for file in file_names:

        model_fit = model_dict[file].fit(X_train[file], y_train[file])
        predictions = model_fit.predict(X_test[file])
        all_preds[file] = predictions

        # Class probabilities are only available for some estimators
        if hasattr(model_fit, 'predict_proba'):
            all_probs[file] = model_fit.predict_proba(X_test[file])

        # Per-class F1 in sorted label order: index 0 -> '-1', index 1 -> '1'
        f1_labels = f1_score(y_test[file], predictions, zero_division=0, average=None)
        macro = f1_score(y_test[file], predictions, average='macro')
        # zero_division=0 yields the same value as the default but without a warning
        prec = precision_score(y_test[file], predictions, pos_label='1', zero_division=0)
        rec = recall_score(y_test[file], predictions, pos_label='1', zero_division=0)

        f1_macro.append(macro)
        f1_pos.append(f1_labels[1])
        f1_neg.append(f1_labels[0])
        precision.append(prec)
        recall.append(rec)

        all_scores[file] = {'f1_pos': f1_labels[1],
                            'f1_neg': f1_labels[0],
                            'f1_macro': macro,
                            'precision': prec,
                            'recall': rec}

        ''' Collect Feature Importances (when the model exposes them)'''
        # Pipelines keep the classifier as their last step; bare estimators are used as-is
        final_step = model_fit.steps[-1][1] if hasattr(model_fit, 'steps') else model_fit
        all_sig_feats[file] = {'scores': getattr(final_step, 'feature_importances_', None)}

        '''ROC AUC'''
        auc = metrics.roc_auc_score(y_test[file].astype(int), predictions.astype(int))
        print(file, auc)

    averages = {'f1_positive': statistics.mean(f1_pos), 'f1_negative': statistics.mean(f1_neg),
                'f1_macro': statistics.mean(f1_macro), 'precision': statistics.mean(precision),
                'recall': statistics.mean(recall)}

    if return_sig_feats:
        return all_scores, averages, all_preds, all_probs, all_sig_feats
    return all_scores, averages, all_preds, all_probs

# RUNNING MODELS WITH TEST DATA

In [54]:
# NOTE(review): `models3` is not assigned by any cell in this notebook (the grid-search
# cells assign models00, models0, models11, models2, models31, models4, models5, models6
# and models7), so this cell raises NameError on a fresh kernel. Confirm which model
# dictionary was intended for the courses-only arrays.
score_, avg_, preds_, probs_ = get_test_scores(models3, X_train_crs,y_train_crs, X_test_crs,y_test_crs)

In [55]:
score_df = pd.DataFrame.from_dict(score_, orient='index')

In [None]:
score_df

## COURSES ONLY

### DECISION TREE

In [96]:
# Courses-only decision tree: use the models tuned on the courses-only arrays (models6);
# models5 was tuned on the selected-feature arrays
score_1, avg_1, preds_1, probs_1 = get_test_scores(models6, X_train_crs,y_train_crs, X_test_crs,y_test_crs)

In [97]:
score_1df = pd.DataFrame.from_dict(score_1, orient='index')

In [None]:
score_1df

### ADABOOST

In [None]:
# Courses-only AdaBoost: use the AdaBoost models tuned on the courses-only arrays (models00);
# models5 holds decision trees tuned on the selected-feature arrays
score_2, avg_2, preds_2, probs_2 = get_test_scores(models00, X_train_crs,y_train_crs, X_test_crs,y_test_crs)

In [None]:
score_2df = pd.DataFrame.from_dict(score_2, orient='index')

In [None]:
score_2df

### SVM

In [None]:
score_3, avg_3, preds_3, probs_3 = get_test_scores(models4, X_train_crs,y_train_crs, X_test_crs,y_test_crs)

In [None]:
score_3df = pd.DataFrame.from_dict(score_3, orient='index')

In [None]:
score_3df 

## ENRL VARS

### DECISION TREE

In [None]:
score_4, avg_4, preds_4, probs_4 = get_test_scores(models7, X_train_enrl,y_train_enrl, X_test_enrl,y_test_enrl)

In [83]:
score_4df = pd.DataFrame.from_dict(score_4, orient='index')

In [None]:
score_4df

### ADABOOST

In [None]:
# get_test_scores returns four values; unpacking a fifth raised ValueError
score_5, avg_5, preds_5, probs_5 = get_test_scores(models0, X_train_enrl,y_train_enrl, X_test_enrl,y_test_enrl)

In [None]:
score_5df = pd.DataFrame.from_dict(score_5, orient='index')

In [None]:
score_5df

### SVM

In [None]:
score_6, avg_6, preds_6, probs_6 = get_test_scores(models2, X_train_enrl,y_train_enrl, X_test_enrl,y_test_enrl)

In [92]:
score_6df = pd.DataFrame.from_dict(score_6, orient='index')

In [None]:
score_6df

## SELECTED VARS

### ADABOOST SELECTED _ ANOVA

In [None]:
score_71, avg_71, preds_71, probs_71 = get_test_scores(models11, X_train_select,y_train_select, X_test_select,y_test_select)

In [219]:
score_71df = pd.DataFrame.from_dict(score_71, orient='index')

In [None]:
score_71df

## Collect Significant Features

In [None]:
# List, per course, every enrollment feature that RFE kept, with its rank.
# `rfe_ft` is read here but not assigned in this cell; it must map each course to a
# dict with 'supp' and 'rank' entries, as produced by gridsearch_all_SMOTE.
for file in file_names:
    print(file)
    n_features = X_train_enrl[file].shape[1]
    for i in range(n_features):
        kept = rfe_ft[file]['supp'][i]
        if not kept == True:
            continue
        print('Column: %d, Name: %s, Selected %s, Rank: %.3f' % (i, train_enrl_crs[file].columns[i],
                                                         rfe_ft[file]['supp'][i], rfe_ft[file]['rank'][i]))

In [None]:
# Compact listing of the RFE-selected features as "<course>:<column name>".
# `rfe_ft` is read here but not assigned in this cell; it must map each course to a
# dict with a 'supp' entry, as produced by gridsearch_all_SMOTE.
for file in file_names:
    n_features = X_train_enrl[file].shape[1]
    for i in range(n_features):
        kept = rfe_ft[file]['supp'][i]
        if not kept == True:
            continue
        print(file + ":" + (train_enrl_crs[file].columns[i]))