In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import json as json
import ipywidgets as widgets

from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE

import util.df_enricher as jn
import util.jndb as db


In [2]:
# Load the enriched card-ratings table once via util.df_enricher.
# Later cells filter it by `card_type` / `attempt_num` and read the feature
# columns and `is_correct` from it.
# NOTE(review): presumably a pandas DataFrame -- confirm against util.df_enricher.
all_ratings = jn.get_enriched_card_ratings()

In [3]:
def compute_logreg( card_types, all_ratings, target_tnr=0.8, min_attempts=2, random_state=None ):
    """Fit one logistic-regression model per card type and pick a decision threshold.

    For every card type the ratings are restricted to rows whose `attempt_num`
    is strictly greater than `min_attempts`, split 70/30 into train and test
    parts, the training part is oversampled with SMOTE, and a
    LogisticRegression is fitted to predict `is_correct`. The decision
    threshold is the lowest ROC threshold whose true-negative rate on the
    test part is still >= `target_tnr`.

    Parameters
    ----------
    card_types : iterable of str
        Values of the `card_type` column to build a model for.
    all_ratings : DataFrame
        Must provide `card_type`, `is_correct` and the feature columns
        listed in `feature_cols` below.
    target_tnr : float
        Required true-negative rate (share of wrong answers predicted wrong).
    min_attempts : int
        Only rows with `attempt_num > min_attempts` are used.
    random_state : int or None
        Seed forwarded to the train/test split and to SMOTE. The default
        (None) keeps the original unseeded behaviour; pass an int to make
        the fitted models reproducible.

    Returns
    -------
    dict
        card_type -> dict with the fitted parameters and test metrics.
        Note that 'precision_wrong' / 'precision_right' are computed per
        *actual* class (rows of the confusion matrix), i.e. they are the
        recall of the wrong / right class; the key names are kept for
        backward compatibility.
    """
    feature_cols = ['subject_num','difficulty_level','time_spent','attempt_num', 'gap_duration', 'prior_le']

    models = {}

    for card_type in card_types:
        ratings = all_ratings[all_ratings.card_type == card_type]
        ratings = ratings[ratings.attempt_num > min_attempts]

        X = ratings[feature_cols]
        Y = ratings['is_correct']
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=random_state )

        # Oversample only the training part so the test metrics stay honest.
        smote = SMOTE( random_state=random_state )
        X_train_os, y_train_os = smote.fit_resample( X_train, y_train )

        logreg = LogisticRegression()
        logreg.fit(X_train_os, y_train_os)

        # Probability of class 1 (is_correct), computed once and reused for
        # both the ROC curve and the final predictions.
        p_correct = logreg.predict_proba( X_test )[:,1]
        fpr, tpr, thresholds = roc_curve(y_test, p_correct)

        # fpr is non-decreasing, so TNR = 1 - fpr is non-increasing: find the
        # first ROC point that drops below the target and step back one point.
        first_below = next(
            ( i for i, rate in enumerate(fpr) if 1 - rate < target_tnr ), None )
        if first_below is None:
            optimal_threshold = 1
        else:
            # max(..., 0) stops an unattainable target (first point already
            # below it) from wrapping around to the *last* threshold.
            optimal_threshold = thresholds[ max( first_below - 1, 0 ) ]

        y_pred = ( p_correct >= optimal_threshold ).astype(int)

        # Rows are actual classes, columns are predicted classes.
        c_mat = confusion_matrix(y_test, y_pred, labels=[0,1])
        accuracy = (c_mat[0][0] + c_mat[1][1])/c_mat.sum()

        recall_wrong = (c_mat[0][0])/c_mat[0].sum()
        recall_right = (c_mat[1][1])/c_mat[1].sum()

        models[ card_type ] = {
            'train_set_size'  : X_train_os.shape[0],
            'min_attempts'    : min_attempts,
            'features'        : logreg.feature_names_in_.tolist(),
            'coefficients'    : logreg.coef_[0].tolist(),
            'intercept'       : logreg.intercept_[0],
            'target_tnr'      : target_tnr,
            'threshold'       : optimal_threshold,
            'accuracy'        : accuracy,
            'precision_wrong' : recall_wrong,
            'precision_right' : recall_right,
        }

    return models

# Fit one model per card type; the resulting dict is reused by later cells.
models = compute_logreg(
    card_types=['fib','question_answer'],
    all_ratings=all_ratings,
    target_tnr=0.85,
    min_attempts=2,
)

# Dump the fitted parameters and test metrics as indented JSON.
print( json.dumps( models, indent=2 ) )

{
  "fib": {
    "train_set_size": 31104,
    "min_attempts": 2,
    "features": [
      "subject_num",
      "difficulty_level",
      "time_spent",
      "attempt_num",
      "gap_duration",
      "prior_le"
    ],
    "coefficients": [
      0.047558320181126004,
      0.36129239969863386,
      -0.1444233626305526,
      0.07394685561672215,
      -0.007826866079033799,
      0.028145634612651946
    ],
    "intercept": -1.2700958533576228,
    "target_tnr": 0.85,
    "threshold": 0.6403677389452799,
    "accuracy": 0.7607049428610767,
    "precision_wrong": 0.8503649635036497,
    "precision_right": 0.7533879374534624
  },
  "question_answer": {
    "train_set_size": 14522,
    "min_attempts": 2,
    "features": [
      "subject_num",
      "difficulty_level",
      "time_spent",
      "attempt_num",
      "gap_duration",
      "prior_le"
    ],
    "coefficients": [
      0.09463481839614755,
      0.058812814452439764,
      -0.07971827455006887,
      0.3806343131040331,
      

In [4]:
# Question/answer ratings beyond the fourth attempt, selected with one combined mask.
ratings = all_ratings[
    (all_ratings.card_type == 'question_answer') & (all_ratings.attempt_num > 4)
]

# Display a single sample row; the label must exist in the filtered frame,
# otherwise .loc raises KeyError.
ratings.loc[77067]

syllabus_name                                 Class-9
subject_name                                  Physics
chapter_id                                       1886
chapter_name        3.4 - Laws of Motion-3rd Law (qa)
card_id                                         82290
card_type                             question_answer
difficulty_level                                   26
timestamp                         2023-09-20 20:57:16
time_spent                                         44
rating                                              E
prior_le                                         80.0
rating_num                                          1
is_correct                                          1
attempt_num                                         6
gap_duration                                       34
total_duration                                     86
subject_num                                        10
Name: 77067, dtype: object

In [5]:
class Predictor:
    """Evaluate a fitted logistic-regression model for varying gap durations.

    `model` is a dict providing 'coefficients', 'intercept' and 'threshold'
    (the shape produced by compute_logreg). When it also provides 'features',
    the feature vector is built in that order so that every value is
    multiplied by its own coefficient; otherwise the default order below
    is assumed.
    """

    # Feature order assumed when the model dict carries no 'features' entry.
    DEFAULT_FEATURES = (
        'subject_num', 'difficulty_level', 'time_spent',
        'attempt_num', 'gap_duration', 'prior_le',
    )

    def __init__(self, model):
        """Keep the model parameters and locate `gap_duration` in the feature order."""
        self.model = model
        self.coeffs = model[ 'coefficients' ]
        self.intercept = model[ 'intercept' ]
        self.threshold = model[ 'threshold' ]
        self.feature_names = list( model.get( 'features', self.DEFAULT_FEATURES ) )
        # Slot that predict_outcome overwrites for every evaluated gap.
        self.gap_index = self.feature_names.index( 'gap_duration' )

    def set_parameters( self, subject_num, difficulty_level, time_spent, attempt_num, prior_le ):
        """Fix every feature except `gap_duration`, which starts at 0.

        Must be called before any of the predict_* methods.
        """
        self.subject_num      = subject_num
        self.difficulty_level = difficulty_level
        self.time_spent       = time_spent
        self.attempt_num      = attempt_num
        self.prior_le         = prior_le

        values = {
            'subject_num'      : subject_num,
            'difficulty_level' : difficulty_level,
            'time_spent'       : time_spent,
            'attempt_num'      : attempt_num,
            'gap_duration'     : 0,
            'prior_le'         : prior_le,
        }
        # Align the values with the coefficient order of the model.
        self.features = [ values[name] for name in self.feature_names ]

    def predict_outcome( self, gap_duration):
        """Return `(gap_duration, probability, outcome)` for one gap.

        `probability` is the logistic function of the linear score and
        `outcome` is 1 when it reaches the model threshold, else 0. The
        comparison is `>=` to match how the threshold was applied when the
        model was evaluated in compute_logreg.
        """
        self.features[ self.gap_index ] = gap_duration
        x = np.dot( self.features, self.coeffs ) + self.intercept
        probability = 1/(1 + np.exp(-x))
        return (gap_duration, probability,
                1 if probability >= self.threshold else 0 )

    def predict_outcomes( self, gap_durations ):
        """Return one `predict_outcome` tuple per entry of `gap_durations`."""
        return [ self.predict_outcome( d ) for d in gap_durations ]

    def predict_optimal_gap_duration( self, max_gap=365 ):
        """Return the smallest gap in [0, max_gap) whose predicted outcome is 0.

        Returns `max_gap` when the outcome is 1 for every gap in the range.
        """
        for d in range( max_gap ):
            if self.predict_outcome( d )[2] == 0:
                return d
        return max_gap


In [29]:
# Predictor bound to the question/answer model; plot_prediction reads it as a global.
predictor = Predictor( models['question_answer'] )

# Candidate gap durations 0..364 that every plot evaluates.
x = list( range(365) )

def plot_prediction( subject_num, difficulty_level=26, time_spent=44, attempt_num=7, prior_le=80 ):
    """Plot the predicted probability of a correct answer against the gap duration.

    Uses the notebook-level `predictor` and the gap grid `x`. Besides the
    probability curve, two dotted guides are drawn: the model's decision
    threshold and the first gap at which the predicted outcome becomes 0
    (365 when that never happens within the searched range).
    """
    predictor.set_parameters( subject_num, difficulty_level, time_spent, attempt_num, prior_le )
    probabilities = [ outcome[1] for outcome in predictor.predict_outcomes( x ) ]
    opt_gap = predictor.predict_optimal_gap_duration()

    # Explicit figure/axes so labels and legend attach to this plot only.
    fig, ax = plt.subplots()
    ax.plot( x, probabilities, label='P(is_correct = 1)' )
    ax.plot( [0,365], [predictor.threshold, predictor.threshold], ':',
             label=f'threshold = {predictor.threshold:.3f}' )
    ax.plot( [opt_gap, opt_gap], [0, 1.0], ':',
             label=f'first predicted miss: gap = {opt_gap}' )
    ax.set( xlabel='gap_duration',
            ylabel='predicted probability',
            title='Predicted outcome vs. gap duration' )
    ax.legend()
    plt.show()

# Interactive sliders for plot_prediction: each tuple is (min, max, step) and the
# initial value comes from the function's default; subject_num is pinned to 10.
widgets.interact(
    plot_prediction,
    subject_num=widgets.fixed(10),
    difficulty_level=(10, 100, 1),
    time_spent=(10, 180, 1),
    attempt_num=(1, 10, 1),
    prior_le=(10, 100, 5),
)

interactive(children=(IntSlider(value=26, description='difficulty_level', min=10), IntSlider(value=44, descrip…

<function __main__.plot_prediction(subject_num, difficulty_level=26, time_spent=44, attempt_num=7, prior_le=80)>