In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import json as json

from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE

import util.df_enricher as jn
import util.jndb as db


In [2]:
# Load the enriched card ratings through the project helper in util.df_enricher.
# compute_logreg() filters this object by `card_type` and `attempt_num` and reads
# its feature columns plus `is_correct` (presumably a pandas DataFrame -- TODO confirm).
all_ratings = jn.get_enriched_card_ratings()

In [5]:
def compute_logreg( card_types, output_array, all_ratings, target_tnr=0.8, min_attempts=2, random_state=None ):
    """Fit one SMOTE-balanced logistic regression per card type and record a summary.

    For every entry of ``card_types`` the rows of ``all_ratings`` with that
    ``card_type`` and ``attempt_num > min_attempts`` are split 70/30 into
    train and test sets. The training part is oversampled with SMOTE, a
    ``LogisticRegression`` is fitted on it, and a decision threshold is read
    off the test-set ROC curve: the lowest threshold whose true-negative rate
    (``1 - fpr``) is still at least ``target_tnr``.

    Parameters
    ----------
    card_types : iterable
        Values compared against ``all_ratings.card_type``; one model is
        fitted per value. An empty iterable is a no-op.
    output_array : list
        Mutated in place. One dict is appended per card type with the keys
        ``card_type``, ``min_attempts``, ``features``, ``coefficients``,
        ``intercept``, ``target_tnr``, ``threshold``, ``accuracy``,
        ``accuracy_wrong`` (fraction of class-0 test rows predicted 0) and
        ``accuracy_right`` (fraction of class-1 test rows predicted 1).
    all_ratings : pandas.DataFrame
        Must provide ``card_type``, ``attempt_num``, ``is_correct`` and the
        feature columns listed in ``feature_columns`` below.
    target_tnr : float, default 0.8
        Minimum true-negative rate the chosen threshold must achieve on the
        test split.
    min_attempts : int, default 2
        Exclusive lower bound: only rows with ``attempt_num > min_attempts``
        are used.
    random_state : int, numpy RandomState or None, default None
        Seed forwarded to ``train_test_split`` and ``SMOTE``. ``None`` keeps
        the previous unseeded behaviour; pass an int to make the reported
        coefficients and thresholds reproducible across runs.

    Returns
    -------
    None
        Results are delivered through ``output_array``.
    """
    feature_columns = ['subject_num','difficulty_level','time_spent','attempt_num', 'gap_duration', 'prior_le']

    for card_type in card_types:
        ratings = all_ratings[all_ratings.card_type == card_type]
        ratings = ratings[ratings.attempt_num > min_attempts]

        X = ratings[feature_columns]
        Y = ratings['is_correct']
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=random_state
        )

        # Oversample only the training split so the test split keeps the
        # original class balance.
        smote = SMOTE( random_state=random_state )
        X_train_os, y_train_os = smote.fit_resample( X_train, y_train )

        logreg = LogisticRegression()
        logreg.fit(X_train_os, y_train_os)

        # Score the test split once; column 1 is the second entry of
        # logreg.classes_ (class 1 for 0/1 labels).
        pos_prob = logreg.predict_proba( X_test )[:,1]
        fpr, _, thresholds = roc_curve(y_test, pos_prob)

        # roc_curve returns fpr in increasing and thresholds in decreasing
        # order, so the ROC points that satisfy the TNR target form a prefix;
        # its last element is the lowest threshold still meeting the target.
        meets_target = np.flatnonzero( (1 - fpr) >= target_tnr )
        if meets_target.size:
            optimal_threshold = float( thresholds[meets_target[-1]] )
        else:
            # No point reaches the target (e.g. target_tnr > 1): use the
            # strictest threshold rather than wrapping around to the loosest
            # one via a negative index.
            optimal_threshold = float( thresholds[0] )

        y_pred = (pos_prob >= optimal_threshold).astype(int)

        c_mat = confusion_matrix(y_test, y_pred, labels=[0,1])
        accuracy = float( np.trace(c_mat) / c_mat.sum() )

        # Per-class recall: row 0 holds the true class-0 samples, row 1 the
        # true class-1 samples.
        accuracy_wrong = float( c_mat[0][0] / c_mat[0].sum() )
        accuracy_right = float( c_mat[1][1] / c_mat[1].sum() )

        output = {}
        output[ 'card_type'      ] = card_type
        output[ 'min_attempts'   ] = min_attempts
        output[ 'features'       ] = logreg.feature_names_in_.tolist()
        output[ 'coefficients'   ] = logreg.coef_[0].tolist()
        output[ 'intercept'      ] = float( logreg.intercept_[0] )
        output[ 'target_tnr'     ] = target_tnr
        output[ 'threshold'      ] = optimal_threshold
        output[ 'accuracy'       ] = accuracy
        output[ 'accuracy_wrong' ] = accuracy_wrong
        output[ 'accuracy_right' ] = accuracy_right

        output_array.append( output )

# Fit a model for each of the two card types, keeping rows with attempt_num > 0,
# then print the collected per-type summaries as indented JSON.
output_array = []
compute_logreg(
    card_types=['fib', 'question_answer'],
    output_array=output_array,
    all_ratings=all_ratings,
    min_attempts=0,
)

print(json.dumps(output_array, indent=2))

[
  {
    "card_type": "fib",
    "min_attempts": 0,
    "features": [
      "subject_num",
      "difficulty_level",
      "time_spent",
      "attempt_num",
      "gap_duration",
      "prior_le"
    ],
    "coefficients": [
      0.028874825186980712,
      0.2673876615843136,
      -0.14252402738105294,
      0.06610361732786209,
      -0.007102258395415879,
      0.01760110504710468
    ],
    "intercept": 0.36563139031343583,
    "target_tnr": 0.8,
    "threshold": 0.5379388011593492,
    "accuracy": 0.8134297690474234,
    "accuracy_wrong": 0.8001345895020189,
    "accuracy_right": 0.8152794682145866
  },
  {
    "card_type": "question_answer",
    "min_attempts": 0,
    "features": [
      "subject_num",
      "difficulty_level",
      "time_spent",
      "attempt_num",
      "gap_duration",
      "prior_le"
    ],
    "coefficients": [
      0.05512335740430377,
      0.04783643779600697,
      -0.06469104314115254,
      0.296037436069639,
      -0.008762533089239969,
      0