## Regression with statsmodels

In [None]:
%%capture

%run './lib/init.ipynb'
from lib import utilities as util
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns
import statsmodels.api as sm
# One call is sufficient: every sns.set() re-applies the complete theme, so a
# preceding sns.set(style="white") would be overwritten by "whitegrid" anyway.
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Load the persisted training and validation frames (training first).
trainDf, validDf = (util.load_df(frameName) for frameName in ('trainDf', 'validDf'))

# Predictor columns used throughout the notebook.
# 'item_attempt_duration_mins' is currently commented out, so it is not
# part of the frames or the model built from this list.
studyVars = [
    'student_duration_mins',
    'timeliness_duration_mins',
    # 'item_attempt_duration_mins',
    'item_type_code_name_swoe',
    'number_of_distinct_instance_items',
]

###### Create regression dataframe
 - completed items
 - study variables
     - target
     - student_duration_mins
     - timeliness_duration_mins
     - item_attempt_duration_mins (currently commented out of `studyVars`)
     - item_type_code_name_swoe
     - number_of_distinct_instance_items

In [None]:
# Keep only the target plus the study variables, then materialise each
# selection as a pandas DataFrame for plotting and modelling.
regressionCols = ['target', *studyVars]
trnDf = trainDf.select(*regressionCols).toPandas()
valDf = validDf.select(*regressionCols).toPandas()

###### Dataframe structure

In [None]:
# Print pandas' concise summary (columns, dtypes, non-null counts) of the
# training regression frame.
trnDf.info()

###### Target variable barplot

In [None]:
# Bar chart of how many training rows fall into each target class.
sns.countplot(data=trnDf, x='target', palette='hls')
plt.show()

###### Target variable percentages

In [None]:
# Count rows per class with boolean masks; int() keeps the counts as plain
# Python ints so round() below still yields an int.
count_no_pass = int((trnDf['target'] == 0).sum())
count_pass = int((trnDf['target'] == 1).sum())
count_total = count_no_pass + count_pass

# Share of each class among the rows whose target is 0 or 1.
pct_of_no_pass = count_no_pass / count_total
pct_of_pass = count_pass / count_total

print("percentage of no pass is", round(pct_of_no_pass * 100), '%')
print("percentage of passes", round(pct_of_pass * 100), '%')

###### Number of missing values

In [None]:
# Per-column count of missing values in the training frame.
trnDf.isna().sum()

###### Number of distinct instance items histogram

In [None]:
# Distribution of the number of distinct instance items (50 bins).
distinctItemsAx = trnDf['number_of_distinct_instance_items'].hist(bins=50)
distinctItemsAx.set_title('Distinct instance items')
distinctItemsAx.set_xlabel('Number')
distinctItemsAx.set_ylabel('Frequency')

###### Target vs number of distinct instance items scatter plot

In [None]:
# Target plotted against the number of distinct instance items.
distinctItemsScatterAx = trnDf.plot.scatter(
    x='number_of_distinct_instance_items',
    y='target',
)
distinctItemsScatterAx.set_ylabel('Pass')
sns.despine()

###### Item type code name swoe histogram

In [None]:
# Distribution of the encoded item type code name (50 bins).
itemTypeAx = trnDf['item_type_code_name_swoe'].hist(bins=50)
itemTypeAx.set_title('Item Type Code Name')
itemTypeAx.set_xlabel('Category')
itemTypeAx.set_ylabel('Frequency')

###### Target vs item type code name scatter plot

In [None]:
# Target plotted against the encoded item type code name.
itemTypeScatterAx = trnDf.plot.scatter(
    x='item_type_code_name_swoe',
    y='target',
)
itemTypeScatterAx.set_ylabel('Pass')
sns.despine()

###### Attempt duration minutes histogram

In [None]:
# 'item_attempt_duration_mins' is commented out of studyVars, so trnDf does
# not contain it and trnDf.item_attempt_duration_mins raises AttributeError.
# Read the column from trainDf instead, and skip the plot when it is not
# available there either.
attemptCol = 'item_attempt_duration_mins'
if attemptCol in trainDf.columns:
    attemptDf = trainDf.select(attemptCol).toPandas()
    attemptDf[attemptCol].hist(bins=100)
    plt.title('Attempt Duration')
    plt.xlabel('Minutes')
    plt.ylabel('Frequency')
else:
    print(attemptCol, 'is not available in trainDf; histogram skipped')

###### Target vs Attempt duration minutes scatter plot

In [None]:
# 'item_attempt_duration_mins' is commented out of studyVars, so trnDf does
# not contain it and the scatter plot fails with a KeyError.
# Read the column (together with the target) from trainDf instead, and skip
# the plot when it is not available there either.
attemptCol = 'item_attempt_duration_mins'
if attemptCol in trainDf.columns:
    attemptDf = trainDf.select('target', attemptCol).toPandas()
    attemptDf.plot.scatter(attemptCol, 'target')
    plt.ylabel('Pass')
    sns.despine()
else:
    print(attemptCol, 'is not available in trainDf; scatter plot skipped')

###### Student duration minutes histogram

In [None]:
# Distribution of student duration in minutes (100 bins).
studentDurationAx = trnDf['student_duration_mins'].hist(bins=100)
studentDurationAx.set_title('Student Duration')
studentDurationAx.set_xlabel('Minutes')
studentDurationAx.set_ylabel('Frequency')

###### Target vs Student duration minutes scatter plot

In [None]:
# Target plotted against student duration in minutes.
studentDurationScatterAx = trnDf.plot.scatter(
    x='student_duration_mins',
    y='target',
)
studentDurationScatterAx.set_ylabel('Pass')
sns.despine()


###### Timeliness duration minutes histogram

In [None]:
# Distribution of timeliness duration in minutes (100 bins).
timelinessAx = trnDf['timeliness_duration_mins'].hist(bins=100)
timelinessAx.set_title('Timeliness Duration')
timelinessAx.set_xlabel('Minutes')
timelinessAx.set_ylabel('Frequency')

###### Target vs Timeliness duration minutes scatter plot

In [None]:
# Target plotted against timeliness duration in minutes.
timelinessScatterAx = trnDf.plot.scatter(
    x='timeliness_duration_mins',
    y='target',
)
timelinessScatterAx.set_ylabel('Pass')
sns.despine()

###### Check the independence between the independent variables

In [None]:
# Pairwise correlation of the study variables present in trnDf, drawn as a
# heatmap.
presentStudyCols = trnDf.columns.intersection(studyVars)
studyCorr = trnDf[presentStudyCols].corr()
sns.heatmap(studyCorr)
plt.show()

The correlations between the independent variables are all less than 0.5.

###### Train

In [None]:
# Response (kept as a one-column frame) and design matrix for training.
y_train = trnDf[['target']]
X_train = trnDf[studyVars]

# NOTE(review): no constant column is added to X_train, so the model is fit
# without an intercept — confirm this is intended.
logitModel = sm.Logit(endog=y_train, exog=X_train)
logreg = logitModel.fit()

logreg.summary()

###### Validate

In [None]:
# Score the held-out validation frame (valDf), not the training frame, so
# that anything computed from y_valid / yhat / prediction is out-of-sample.
y_valid = valDf[['target']]
X_valid = valDf[studyVars]

# Predicted values from the fitted logit model, one per validation row.
yhat = logreg.predict(X_valid)
# Round each prediction to the nearest integer to obtain hard class labels.
prediction = list(map(round, yhat))
#
# # comparing original and predicted values of y
# print('Actual values', list(y_valid.values))
# print('Predictions :', prediction)

###### Test Accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Cross-tabulate actual labels against the rounded predictions.
cm = confusion_matrix(y_true=y_valid, y_pred=prediction)
print("Confusion Matrix : \n", cm)

# Fraction of rows whose rounded prediction equals the actual label.
print(
    'Test accuracy = ',
    accuracy_score(y_true=y_valid, y_pred=prediction),
)

###### Receiver Operating Characteristic (ROC) Curve

In [None]:
# Ref: https://jbhender.github.io/Stats506/F18/GP/Group5.html

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# logit_roc_auc = roc_auc_score(y, pred)
# False/true positive rates over the thresholds derived from the scores.
fpr, tpr, thresholds = roc_curve(y_valid, yhat)

rocFig = plt.figure()
rocAx = rocFig.add_subplot(111)

# Model curve plus the dashed red diagonal as a chance-level reference.
rocAx.plot(fpr, tpr, label='Logistic Regression ')
rocAx.plot([0, 1], [0, 1], 'r--')

rocAx.set_xlim([0.0, 1.0])
rocAx.set_ylim([0.0, 1.05])
rocAx.set_xlabel('False Positive Rate')
rocAx.set_ylabel('True Positive Rate')
rocAx.set_title('Receiver operating characteristic')
rocAx.legend(loc="lower right")

# Write the figure to disk before showing it.
rocFig.savefig('Log_ROC')
plt.show()


In [None]:
# Run the project's forward-selection helper on the training design matrix
# and target, with model_type "logistic"; the result is stored in sout.
# NOTE(review): the selection criterion and the shape of the return value are
# defined in lib.stepwise — confirm there.
from lib import stepwise

sout = stepwise.forwardSelection(X_train, y_train, model_type ="logistic")

In [None]:
# Run the project's backward-selection helper on the training design matrix
# and target, with model_type "logistic"; the result is stored in sout.
# NOTE(review): the selection criterion and the shape of the return value are
# defined in lib.stepwise — confirm there.
from lib import stepwise

sout = stepwise.backwardSelection(X_train, y_train, model_type ="logistic")