### Import Modules

In [21]:
# Core
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from datetime import datetime
import re

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier


Time Function

In [66]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

### Import Data

Additional features are clicks on course material data.

In [2]:
df_ml_click = pd.read_csv('data/post_eda_class_click.csv')

Show all dataframe columns for analyis.

Code Source: 

https://stackoverflow.com/questions/47022070/display-all-dataframe-columns-in-a-jupyter-python-notebook/47022213

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
df_ml_click.shape

(37030, 39)

In [5]:
df_ml_click.head(2)

Unnamed: 0,is_banked,code_module,code_presentation,assessment_type,module_presentation_length,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,dataplus,dualpane,externalquiz,folder,forumng,glossary,homepage,htmlactivity,oucollaborate,oucontent,ouelluminate,ouwiki,page,questionnaire,quiz,repeatactivity,resource,sharedsubpage,subpage,url,assess_date,length_no_cred_ratio,date_registration,score,avg_click
0,0,AAA,2014J,TMA,269,M,Scotland,HE Qualification,80-90%,55<=,0,60,N,Pass,21.0,4.0,12.0,3.0,451.0,3.0,497.0,5.0,5.0,1505.0,9.0,43.0,5.0,12.0,185.0,2.5,31.0,1.0,143.0,143.0,2.6,4.483333,-52.0,61.8,133.118254
1,0,AAA,2013J,TMA,268,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,14.0,4.0,12.0,3.0,193.0,3.0,138.0,5.0,5.0,553.0,9.0,43.0,5.0,12.0,185.0,2.5,13.0,1.0,32.0,5.0,1.8,1.116667,-159.0,82.0,44.529365


### Preprocessing

In [6]:
df_ml_click['final_result'].value_counts()

Pass           22497
Fail            9035
Distinction     5498
Name: final_result, dtype: int64

Change the final result of 'distinction' to 'pass'.

Code Source:

https://stackoverflow.com/questions/21608228/conditional-replace-pandas/44311454


In [7]:
df_ml_click['final_result'] = np.where(df_ml_click.final_result == 'Distinction', 'Pass', df_ml_click.final_result)

In [10]:
df_ml_click['final_result'].value_counts()

Pass    27995
Fail     9035
Name: final_result, dtype: int64

Change target variable to number using label encoder.

Code Source:

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [12]:
le = preprocessing.LabelEncoder()
le.fit(df_ml_click['final_result'])

LabelEncoder()

In [14]:
list(le.classes_)

['Fail', 'Pass']

In [15]:
codes = le.transform(df_ml_click['final_result'])
codes

array([1, 1, 0, ..., 1, 0, 1])


Split file and create features and target.

In [17]:
X = pd.get_dummies(df_ml_click.drop('final_result', axis=1), prefix_sep='_', drop_first=True)
y = df_ml_click['final_result']

### Machine Learning Models

Create into train and test files. 

Code Source: 

https://stackoverflow.com/questions/29438265/stratified-train-test-split-in-scikit-learn

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.20,random_state=42)

#### Logistic Regression

Code Source: 

https://chrisalbon.com/machine_learning/model_selection/hyperparameter_tuning_using_grid_search/

Instantiate model and set parameters for grid search.

In [23]:
logistic = LogisticRegression()
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)

Use gridsearch cv to get best hyperparameters and fit model.

In [25]:
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

best_model = clf.fit(X_train, y_train)








View best hyperparameters.

In [26]:
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 1.0


Run model with best parameters on test dataset.

In [27]:
y_pred_lr = best_model.predict(X_test)

Score results.

In [28]:
print ("Cohen Kappa Test: %0.4f" % cohen_kappa_score(y_test,y_pred_lr))

Cohen Kappa Test: 0.4750


Print confusion matrix and classification report.

In [41]:
print(confusion_matrix(y_test, y_pred_lr,labels=["Pass", "Fail"]))
print(classification_report(y_test, y_pred_lr))

[[5270  329]
 [ 941  866]]
              precision    recall  f1-score   support

        Fail       0.72      0.48      0.58      1807
        Pass       0.85      0.94      0.89      5599

   micro avg       0.83      0.83      0.83      7406
   macro avg       0.79      0.71      0.73      7406
weighted avg       0.82      0.83      0.82      7406



Compute predicted probabilities.

Code Source:

https://campus.datacamp.com/courses/supervised-learning-with-scikit-learn/fine-tuning-your-model?ex=8

In [50]:
y_pred_prob = best_model.predict_proba(X_test)[:,1]

In [54]:
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

AUC: 0.855672306097035


#### Random Forest - ensemble

Instantiate model and set parameters for Grid Search.

In [67]:
rf = RandomForestClassifier(random_state=42)

parameters = { 'max_features':np.arange(5,10),'n_estimators':[500],'min_samples_leaf': [10,50,100,200,500],
             'max_depth':[8, 9, 10, 11, 12]}

rf_gs = GridSearchCV(rf,parameters,cv=5)

start_time = timer(None) 
rf_gs.fit(X_train, y_train)
timer(start_time)


 Time taken: 1 hours 24 minutes and 37.95 seconds.


In [69]:
print("Tuned Random Forest Parameters: {}".format(rf_gs.best_params_))
print("Best score is {}".format(rf_gs.best_score_))

Tuned Random Forest Parameters: {'max_depth': 12, 'max_features': 9, 'min_samples_leaf': 10, 'n_estimators': 500}
Best score is 0.867539832568188


Run model with best parameters on test dataset.

In [70]:
y_pred_rf = rf_gs.predict(X_test)

Score results.

In [71]:
print ("Cohen Kappa Test: %0.4f" % cohen_kappa_score(y_test,y_pred_rf))

Cohen Kappa Test: 0.6020


Print confusion matrix and classification report.

In [73]:
print(confusion_matrix(y_test, y_pred_rf,labels=["Pass", "Fail"]))
print(classification_report(y_test, y_pred_rf))

[[5455  144]
 [ 808  999]]
              precision    recall  f1-score   support

        Fail       0.87      0.55      0.68      1807
        Pass       0.87      0.97      0.92      5599

   micro avg       0.87      0.87      0.87      7406
   macro avg       0.87      0.76      0.80      7406
weighted avg       0.87      0.87      0.86      7406



Compute predicted probabilities.

In [74]:
y_pred_prob_rf = rf_gs.predict_proba(X_test)[:,1]

In [75]:
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob_rf)))

AUC: 0.9172894637976403
