In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss

In [4]:
# Seeded source of uniform [0, 1) draws. Using a dedicated, seeded generator
# (instead of the unseeded global one) makes every random decision taken
# through rd() reproducible under Restart & Run All.
SEED = 0
rd = np.random.RandomState(SEED).random_sample

In [5]:
# Load the raw answer log; the first CSV column becomes the row index.
DATA_PATH = "./data/induce-data-2019-07-16.csv"
dataDf = pd.read_csv(DATA_PATH, index_col=0)

In [6]:
# Remove the columns the analysis does not use, then keep a single row per
# (user, step) pair.
# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
dataDf = dataDf.drop(columns=["age", "choice"], errors="ignore")
# groupby(...).first() keeps, per column, the first non-null value of each group.
dataDf = dataDf.groupby(["user", "step"]).first().reset_index()

In [7]:
#dataDf = dataDf[dataDf.topic == "geometry"]

In [8]:
# Preview the first rows of dataDf.
dataDf.head()

Unnamed: 0,user,step,createdAt,question,result,topic,ageGroup
0,10fee518,0,2019-06-17 09:21:02.736000+00:00,C_E_F_T,correct,cards,11-13
1,10fee518,1,2019-06-17 09:21:15.081000+00:00,C_E_F_C,correct,cards,11-13
2,10fee518,2,2019-06-17 09:21:24.037000+00:00,C_E_F_O,correct,cards,11-13
3,10fee518,3,2019-06-17 09:21:33.090000+00:00,A_E_F_T,correct,animals,11-13
4,10fee518,4,2019-06-17 09:21:42.062000+00:00,A_E_F_O,correct,animals,11-13


In [9]:
def computeType(row):
    """Return the phase label of an answer row, derived from ``row.step`` only.

    Steps below 18 are "INTRO", steps 18-35 are "CORE"; from step 36 on, the
    explicitly listed steps are "TRIK" or "FLEX" and every other step is "DELY".
    """
    trikSteps = (36, 38, 40, 45, 47, 49, 54, 56, 58)
    flexSteps = (37, 39, 41, 46, 48, 50, 55, 57, 59)

    if row.step < 18:
        label = "INTRO"
    elif row.step < 36:
        label = "CORE"
    elif row.step in trikSteps:
        label = "TRIK"
    elif row.step in flexSteps:
        label = "FLEX"
    else:
        label = "DELY"
    return label

def computeUniqueId(row):
    """Return ``row.question`` suffixed with the row's phase label, e.g. "C_E_F_T_INTRO"."""
    code = row.question
    return "_".join([code, computeType(row)])

# Add, in this order, the phase-qualified question id and the phase label.
for column, builder in (("uniqueQuestionID", computeUniqueId), ("type", computeType)):
    dataDf[column] = dataDf.apply(builder, axis=1)

dataDf.head()

Unnamed: 0,user,step,createdAt,question,result,topic,ageGroup,uniqueQuestionID,type
0,10fee518,0,2019-06-17 09:21:02.736000+00:00,C_E_F_T,correct,cards,11-13,C_E_F_T_INTRO,INTRO
1,10fee518,1,2019-06-17 09:21:15.081000+00:00,C_E_F_C,correct,cards,11-13,C_E_F_C_INTRO,INTRO
2,10fee518,2,2019-06-17 09:21:24.037000+00:00,C_E_F_O,correct,cards,11-13,C_E_F_O_INTRO,INTRO
3,10fee518,3,2019-06-17 09:21:33.090000+00:00,A_E_F_T,correct,animals,11-13,A_E_F_T_INTRO,INTRO
4,10fee518,4,2019-06-17 09:21:42.062000+00:00,A_E_F_O,correct,animals,11-13,A_E_F_O_INTRO,INTRO


In [10]:
def feature(row):
    """Name the feature a row's question is about.

    For steps below 18 the last character of ``row.question`` selects
    "type" (T), "color" (C) or "orientation" (O); any other letter raises
    KeyError. Every other step is labelled "dual".
    """
    if not (row.step < 18):
        return "dual"
    byLetter = {"T": "type", "C": "color", "O": "orientation"}
    return byLetter[row.question[-1]]

# Label every row with the feature returned by feature().
featureColumn = dataDf.apply(feature, axis=1)
dataDf["feature"] = featureColumn

dataDf.head()

Unnamed: 0,user,step,createdAt,question,result,topic,ageGroup,uniqueQuestionID,type,feature
0,10fee518,0,2019-06-17 09:21:02.736000+00:00,C_E_F_T,correct,cards,11-13,C_E_F_T_INTRO,INTRO,type
1,10fee518,1,2019-06-17 09:21:15.081000+00:00,C_E_F_C,correct,cards,11-13,C_E_F_C_INTRO,INTRO,color
2,10fee518,2,2019-06-17 09:21:24.037000+00:00,C_E_F_O,correct,cards,11-13,C_E_F_O_INTRO,INTRO,orientation
3,10fee518,3,2019-06-17 09:21:33.090000+00:00,A_E_F_T,correct,animals,11-13,A_E_F_T_INTRO,INTRO,type
4,10fee518,4,2019-06-17 09:21:42.062000+00:00,A_E_F_O,correct,animals,11-13,A_E_F_O_INTRO,INTRO,orientation


In [11]:
from datetime import date

# Write the annotated frame to the working directory, with today's date in the file name.
exportPath = "".join(['./induce-data-', str(date.today()), '.csv'])
dataDf.to_csv(exportPath)

In [12]:
# Map every question code to the list of answers a predictor may output for it.
# The last "_"-separated token of the code decides: a single letter means a
# correct/wrong question, a two-letter tag means a choice between two features.
DUAL_ANSWERS = {
    "OT": ["orientation", "type"],
    "CT": ["color", "type"],
    "CO": ["orientation", "color"],
}

acceptedAnswers = {}

for q in dataDf.question.unique():
    tag = q.split("_")[-1]
    if len(tag) == 1:
        acceptedAnswers[q] = ["correct", "wrong"]
    elif tag in DUAL_ANSWERS:
        # Copy so that entries never alias the lookup table.
        acceptedAnswers[q] = list(DUAL_ANSWERS[tag])
    else:
        # `throw` is not a Python name; calling it would surface as a NameError
        # and hide the real problem, so raise a proper exception naming the question.
        raise ValueError("ERROR IN VERIFYING ANSWERS: " + q)

In [13]:
# Display the question -> accepted-answers mapping.
acceptedAnswers

{'C_E_F_T': ['correct', 'wrong'],
 'C_E_F_C': ['correct', 'wrong'],
 'C_E_F_O': ['correct', 'wrong'],
 'A_E_F_T': ['correct', 'wrong'],
 'A_E_F_O': ['correct', 'wrong'],
 'A_E_F_C': ['correct', 'wrong'],
 'G_E_F_C': ['correct', 'wrong'],
 'G_E_F_T': ['correct', 'wrong'],
 'G_E_F_O': ['correct', 'wrong'],
 'A_E_M_T': ['correct', 'wrong'],
 'A_E_M_O': ['correct', 'wrong'],
 'A_E_M_C': ['correct', 'wrong'],
 'G_E_M_O': ['correct', 'wrong'],
 'G_E_M_C': ['correct', 'wrong'],
 'G_E_M_T': ['correct', 'wrong'],
 'C_E_M_O': ['correct', 'wrong'],
 'C_E_M_C': ['correct', 'wrong'],
 'C_E_M_T': ['correct', 'wrong'],
 'C_H_F_CO': ['orientation', 'color'],
 'C_H_F_CT': ['color', 'type'],
 'C_H_F_OT': ['orientation', 'type'],
 'G_H_F_OT': ['orientation', 'type'],
 'G_H_F_CO': ['orientation', 'color'],
 'G_H_F_CT': ['color', 'type'],
 'A_H_F_CT': ['color', 'type'],
 'A_H_F_OT': ['orientation', 'type'],
 'A_H_F_CO': ['orientation', 'color'],
 'C_H_M_CO': ['orientation', 'color'],
 'C_H_M_CT': ['color',

# SPLIT TRAIN / TEST

In [None]:
# Split by user, not by row: one rd() draw per user, in USERS order.
# A draw above 0.2 sends the user to TRAIN, anything else to TEST.
USERS = dataDf.user.unique()

TRAIN_USERS, TEST_USERS = [], []

for u in USERS:
    if rd() > 0.2:
        TRAIN_USERS.append(u)
    else:
        TEST_USERS.append(u)

TRAIN_DF = dataDf.loc[dataDf.user.isin(TRAIN_USERS)]
TEST_DF = dataDf.loc[dataDf.user.isin(TEST_USERS)]

len(TRAIN_USERS), len(TEST_USERS)

# TEST FUNCTION

In [None]:
def test(model):
    """Score a predictor on every row of TEST_DF, user by user.

    ``model(history, current)`` is called with the rows of the same user that
    precede ``current`` and must return ``[proba, prediction]``, where
    ``prediction`` is one of ``acceptedAnswers[current.question]`` and ``proba``
    is the probability given to that prediction.

    Returns ``(LOG_ERR, MSE_ERR, ACC_ERR)``, each averaged over ``len(TEST_DF)``:
    log loss, squared error, and the share of rows where "prediction matches
    the answer" agrees with "proba >= 0.5". The last value counts agreements,
    so despite its name it grows with accuracy.

    Raises Exception when the model predicts an answer that is not accepted
    for the question.
    """
    floor, ceiling = 1e-4, 1 - 1e-4
    logLoss = 0
    squaredErr = 0
    agreements = 0

    for user in TEST_USERS:
        history = [row for _, row in TEST_DF[TEST_DF.user == user].iterrows()]
        for position, current in enumerate(history):
            proba, prediction = model(history[:position], current)

            if prediction not in acceptedAnswers[current.question]:
                raise Exception("Predictor incorrectly assessed possible answers")

            # Keep the probability away from 0 and 1 so the log stays finite.
            if proba < floor:
                p = floor
            elif proba > ceiling:
                p = ceiling
            else:
                p = proba

            matched = prediction == current.result
            if matched:
                logLoss -= np.log(p)
                squaredErr += (1 - p) ** 2
            else:
                logLoss -= np.log(1 - p)
                squaredErr += p ** 2
            if matched == (p >= 0.5):
                agreements += 1

    nRows = len(TEST_DF)
    return logLoss / nRows, squaredErr / nRows, agreements / nRows

In [None]:
# Number of rows in the test frame.
len(TEST_DF)

# DUMMY MODEL

In [None]:
def dummy(seq, question):
    """Ignore the history and return the first accepted answer with probability 0.5."""
    candidates = acceptedAnswers[question.question]
    return [0.5, candidates[0]]

# Score the dummy predictor and print the three metrics with 3 decimals.
LOG_ERR, MSE_ERR, ACC_ERR = test(dummy)
for template, score in zip(("LOG_ERR = %.3f", "MSE_ERR = %.3f", "ACC_ERR = %.3f"),
                           (LOG_ERR, MSE_ERR, ACC_ERR)):
    print(template % score)

# BASELINE MODEL

In [None]:
# Count the training rows per (uniqueQuestionID, result) pair.
_DF = TRAIN_DF.groupby(["uniqueQuestionID", "result"]).count()

# BASELINE_AGG[uniqueQuestionID][result] -> count taken from the "question" column.
BASELINE_AGG = {}
for (questionId, answer), tallies in _DF.iterrows():
    BASELINE_AGG.setdefault(questionId, {})[answer] = tallies.question

def baseline(seq, question):
    """Predict from training-set answer counts, ignoring the user's history.

    Parameters
    ----------
    seq : unused; present so every predictor shares the same signature.
    question : row-like object exposing ``uniqueQuestionID``.

    Returns
    -------
    ``[proba, prediction]`` where ``prediction`` is the first answer stored in
    ``BASELINE_AGG`` for the question and ``proba`` is that answer's share of
    all answers counted for the question.

    Raises
    ------
    KeyError if the question has no entry in ``BASELINE_AGG``.
    """
    counts = BASELINE_AGG[question.uniqueQuestionID]
    prediction = next(iter(counts))
    # Normalise over every observed answer: unpacking exactly two counts
    # would fail for a question with only one (or more than two) answers.
    proba = counts[prediction] / sum(counts.values())
    return [proba, prediction]

# Score the baseline predictor and print the three metrics with 3 decimals.
LOG_ERR, MSE_ERR, ACC_ERR = test(baseline)
for template, score in zip(("LOG_ERR = %.3f", "MSE_ERR = %.3f", "ACC_ERR = %.3f"),
                           (LOG_ERR, MSE_ERR, ACC_ERR)):
    print(template % score)

# BIAS MODEL

In [None]:
# Reshape the training log to one row per user and one column per
# uniqueQuestionID; each cell holds the answer that user gave.
rows = []
for trainUser in TRAIN_USERS:
    answered = TRAIN_DF[TRAIN_DF.user == trainUser]
    record = {"user": trainUser}
    record.update({entry.uniqueQuestionID: entry.result for _, entry in answered.iterrows()})
    rows.append(record)

userDf = pd.DataFrame(rows).set_index("user")
userDf.head()

In [None]:
# Build the conditional-frequency table used by the bias predictors:
#   row    "from:<q1>==<r1>"  -> users who answered r1 to question q1
#   column "to:<q2>==<r2>"    -> share of those users who answered r2 to q2
# Shares are computed among the users of the row that gave any known answer
# to q2; combinations that never occur are filled with 0.
#
# Only descriptive, cell-specific names are assigned here so that re-running
# this cell cannot overwrite the global K (number of states) used by the
# Bayesian cells.
RESULT_LABELS = ["correct", "wrong", "orientation", "type", "color"]

biasRows = []
for q1 in userDf.columns:
    for r1 in RESULT_LABELS:
        conditioned = userDf[userDf[q1] == r1]
        if len(conditioned) == 0:
            continue
        biasRow = {"from": "from:" + q1 + "==" + r1}
        for q2 in userDf.columns:
            # Keep known answers only (drops missing values for unanswered questions).
            observed = [y for y in conditioned[q2].tolist() if y in RESULT_LABELS]
            if len(observed) == 0:
                continue

            for r2 in RESULT_LABELS:
                hits = observed.count(r2)
                if hits > 0:
                    biasRow["to:" + q2 + "==" + r2] = hits / len(observed)

        biasRows.append(biasRow)

BIAS_AGG = pd.DataFrame(biasRows).set_index("from").fillna(0.)
BIAS_AGG.head()

In [None]:
def bias(seq, question):
    """Predict the answer with the highest conditional frequency in BIAS_AGG.

    Every previous answer of the user is tried as the conditioning event and
    every accepted answer as the outcome; the (answer, frequency) pair with the
    largest frequency wins.

    Parameters
    ----------
    seq : previous rows of the same user (each exposing ``uniqueQuestionID``
        and ``result``).
    question : the row to predict (exposing ``question`` and ``uniqueQuestionID``).

    Returns
    -------
    ``[proba, prediction]``. Falls back to ``baseline`` when the history is
    empty or when BIAS_AGG holds no positive frequency for any combination.
    """
    possibleAnswers = acceptedAnswers[question.question]
    if len(seq) == 0:
        return baseline(seq, question)

    bestProba = 0
    bestPrediction = ""
    for previous in seq:
        f = "from:" + previous.uniqueQuestionID + "==" + previous.result
        # A conditioning event never seen in training carries no information.
        if f not in BIAS_AGG.index:
            continue
        for a in possibleAnswers:
            t = "to:" + question.uniqueQuestionID + "==" + a
            # A missing column means the outcome was never observed: frequency 0.
            if t not in BIAS_AGG.columns:
                continue
            p = BIAS_AGG.at[f, t]
            if p > bestProba:
                bestProba = p
                bestPrediction = a

    if bestPrediction == "":
        # Returning "" would be rejected as a non-accepted answer by test().
        return baseline(seq, question)

    return [bestProba, bestPrediction]

# Score the bias predictor; the three metrics are stored in the globals below.
LOG_ERR, MSE_ERR, ACC_ERR = test(bias)

In [None]:
# Print the metrics currently held in LOG_ERR / MSE_ERR / ACC_ERR with 3 decimals.
for template, score in zip(("LOG_ERR = %.3f", "MSE_ERR = %.3f", "ACC_ERR = %.3f"),
                           (LOG_ERR, MSE_ERR, ACC_ERR)):
    print(template % score)

# BIAS MODEL FIXED

In [None]:
def _buildMatch():
    """Build the table pairing a question id with the earlier question used to predict it.

    For each topic letter (A, C, G), in this order:
      * <t>_E_M_<f>_INTRO  -> <t>_E_F_<f>_INTRO   for f in C, O, T
      * <t>_H_F_<p>_DELY   -> <t>_H_F_<p>_CORE    for p in CO, CT, OT
        <t>_H_F_<p>_TRIK   -> <t>_H_F_<p>_CORE
      * <t>_H_M_<p>_CORE   -> <t>_H_F_<p>_CORE    for p in CO, CT, OT
        <t>_H_M_<p>_FLEX   -> <t>_H_F_<p>_TRIK
    """
    pairs = ("CO", "CT", "OT")
    table = {}
    for topic in ("A", "C", "G"):
        for single in ("C", "O", "T"):
            table["%s_E_M_%s_INTRO" % (topic, single)] = "%s_E_F_%s_INTRO" % (topic, single)
        for pair in pairs:
            core = "%s_H_F_%s_CORE" % (topic, pair)
            table["%s_H_F_%s_DELY" % (topic, pair)] = core
            table["%s_H_F_%s_TRIK" % (topic, pair)] = core
        for pair in pairs:
            table["%s_H_M_%s_CORE" % (topic, pair)] = "%s_H_F_%s_CORE" % (topic, pair)
            table["%s_H_M_%s_FLEX" % (topic, pair)] = "%s_H_F_%s_TRIK" % (topic, pair)
    return table


match = _buildMatch()

def fixed(seq, question):
    """Predict from the single earlier question that ``match`` pairs with this one.

    The prediction is always the first accepted answer; its probability is the
    BIAS_AGG frequency of that answer given the user's answer to the paired
    question.

    Parameters
    ----------
    seq : previous rows of the same user (each exposing ``uniqueQuestionID``
        and ``result``).
    question : the row to predict (exposing ``question`` and ``uniqueQuestionID``).

    Returns
    -------
    ``[proba, prediction]``. Falls back to ``baseline`` when the question has
    no entry in ``match``, when the paired question is absent from ``seq``, or
    when the user's answer to it was never observed in training.
    """
    possibleAnswers = acceptedAnswers[question.question]
    prediction = possibleAnswers[0]

    reference = match.get(question.uniqueQuestionID)
    if reference is None:
        return baseline(seq, question)

    # First earlier row for the paired question; None instead of IndexError
    # when the history does not contain it.
    previous = next((p for p in seq if p.uniqueQuestionID == reference), None)
    if previous is None:
        return baseline(seq, question)

    f = "from:" + previous.uniqueQuestionID + "==" + previous.result
    if f not in BIAS_AGG.index:
        return baseline(seq, question)

    t = "to:" + question.uniqueQuestionID + "==" + prediction
    # A missing column means the outcome was never observed: frequency 0.
    proba = BIAS_AGG.at[f, t] if t in BIAS_AGG.columns else 0.
    return [proba, prediction]

# Score the fixed-pairing predictor; the three metrics are stored in the globals below.
LOG_ERR, MSE_ERR, ACC_ERR = test(fixed)

In [None]:
# Print the metrics currently held in LOG_ERR / MSE_ERR / ACC_ERR with 3 decimals.
for template, score in zip(("LOG_ERR = %.3f", "MSE_ERR = %.3f", "ACC_ERR = %.3f"),
                           (LOG_ERR, MSE_ERR, ACC_ERR)):
    print(template % score)

# BAYESIAN

In [None]:
# Training rows used to fit the states: CORE and TRIK questions only.
_DF = TRAIN_DF.loc[TRAIN_DF.type.isin(["CORE", "TRIK"])]

# Number of latent states.
K = 12
# One slot per (topic letter, feature pair); answers[i] is the answer whose
# probability is stored in slot i of a state.
indexes = [topic + "_" + pair for topic in ("A", "C", "G") for pair in ("CO", "CT", "OT")]
answers = ["color", "color", "orientation"] * 3
# Each state starts as 9 random probabilities, one per slot.
states = []
for _ in range(K):
    states.append([rd() for _ in range(9)])
userStates = dict()
prior = [0] * len(states)

def computeProba(state, question):
    """Probability that ``state`` assigns to the answer recorded in ``question.result``.

    The slot is found from the first and fourth "_"-separated tokens of
    ``question.question``. The stored probability is clamped to
    [1e-5, 1 - 1e-5]; it is returned as is when the recorded answer equals
    ``answers[slot]`` and as its complement otherwise.
    """
    topic, _, _, features = question.question.split("_")
    slot = indexes.index(topic + "_" + features)
    clamped = max(1e-5, min(1 - 1e-5, state[slot]))
    if answers[slot] == question.result:
        return clamped
    return 1 - clamped

def expectation():
    """E-step: store in ``userStates`` each training user's normalised weight per state.

    A state's weight for a user is the product of the probabilities that
    state gives to each of the user's rows in ``_DF``; the weights are then
    divided by their sum.
    """
    for u in TRAIN_USERS:
        history = [row for _, row in _DF[_DF.user == u].iterrows()]
        likelihoods = []
        for s in states:
            perAnswer = np.array([computeProba(s, q) for q in history])
            # Multiply the probabilities through the sum of their logs.
            likelihoods.append(np.exp(sum(np.log(perAnswer))))
        likelihoods = np.array(likelihoods)
        userStates[u] = likelihoods / sum(likelihoods)
    
def maximization():
    """M-step: re-estimate every slot of every state from the weighted answers.

    For each row of ``_DF`` and each state i, the user's weight
    ``userStates[user][i]`` is added to the slot's total, and also to its
    "matched" sum when the row's answer equals ``answers[slot]``. The new slot
    value is matched / total clamped to [1e-5, 1 - 1e-5], or 0.5 when the slot
    received no weight.

    The sums live in local accumulators and ``states`` is only written at the
    very end, so an interrupt or an exception in the middle of the pass leaves
    the current model untouched instead of half-replaced by accumulators.
    """
    nSlots = len(indexes)
    matched = [[0.0] * nSlots for _ in states]
    total = [[0.0] * nSlots for _ in states]

    for _rowId, question in _DF.iterrows():
        topic, _, _, features = question.question.split("_")
        slot = indexes.index(topic + "_" + features)
        isKeyed = question.result == answers[slot]
        weights = userStates[question.user]
        for i in range(len(states)):
            w = weights[i]
            total[i][slot] += w
            if isKeyed:
                matched[i][slot] += w

    # Write back in place so existing references to the inner lists stay valid.
    for i, s in enumerate(states):
        for k in range(nSlots):
            if total[i][k] > 0:
                s[k] = max(1e-5, min(1 - 1e-5, matched[i][k] / total[i][k]))
            else:
                s[k] = 0.5
    
def loop():
    """Run 16 expectation/maximization rounds, then return the state prior.

    The prior is the sum of all users' state weights in ``userStates``,
    divided by its total. Progress is printed for every round.
    """
    for iteration in range(1, 17):
        print("ITERATION %d . . ." % iteration, end=" , ")
        print("EXP . . .", end=" , ")
        expectation()
        print("MAX . . .")
        maximization()

    print("Computing prior . . .")
    mass = np.zeros(len(states))
    for weights in userStates.values():
        mass += weights
    mass /= sum(mass)
    return mass

In [None]:
# Run the EM rounds and keep the returned state prior.
prior = loop()

In [None]:
# Preview the first rows of the training frame.
TRAIN_DF.head()

In [None]:
def computeTransitions():
    """Estimate, per FLEX question, a K x K state-transition table from the training users.

    For each user, a posterior over states is computed from their CORE/TRIK
    rows and ``prior``. For each of their FLEX rows, the normalised per-state
    probability of the recorded answer is added to every row ``stateFrom`` of
    that question's table, weighted by the posterior of ``stateFrom``. Each
    table row is finally divided by its sum.

    Returns a dict mapping the FLEX question code to a list of K arrays of
    length K.
    """
    CORE_DF = TRAIN_DF[TRAIN_DF.type.isin(["CORE", "TRIK"])]
    FLEX_DF = TRAIN_DF[TRAIN_DF.type.isin(["FLEX"])]
    flexQuestions = FLEX_DF.question.unique()

    transitions = {
        flex: [np.array([0] * K) for _ in range(K)]
        for flex in flexQuestions
    }

    for user in TRAIN_USERS:
        evidence = [row for _, row in CORE_DF[CORE_DF.user == user].iterrows()]
        logLikelihood = np.array([
            sum(np.log(np.array([computeProba(s, q) for q in evidence])))
            for s in states
        ])
        posterior = np.exp(logLikelihood + np.log(prior))
        posterior /= sum(posterior)

        for _, questionFlex in FLEX_DF[FLEX_DF.user == user].iterrows():
            landing = np.array([computeProba(s, questionFlex) for s in states])
            landing /= sum(landing)
            table = transitions[questionFlex.question]
            for stateFrom in range(K):
                table[stateFrom] = table[stateFrom] + posterior[stateFrom] * landing

    for flex in flexQuestions:
        for tableRow in transitions[flex]:
            tableRow /= sum(tableRow)

    return transitions


def randomTransitions():
    """Return, per FLEX question of the training set, a random K x K transition table.

    Every table is a list of K arrays of K draws from ``rd``; each array is
    divided by its sum.
    """
    FLEX_DF = TRAIN_DF[TRAIN_DF.type.isin(["FLEX"])]
    flexQuestions = FLEX_DF.question.unique()

    transitions = {}
    for flex in flexQuestions:
        table = []
        for _ in range(K):
            table.append(np.array([rd() for _ in range(K)]))
        transitions[flex] = table

    for table in transitions.values():
        for tableRow in table:
            tableRow /= sum(tableRow)
    return transitions


# Use randomly generated transition tables for the FLEX questions.
# NOTE(review): computeTransitions() is defined in this cell but is not called
# here — confirm that random transitions are what the evaluation should use.
TRANSITIONS = randomTransitions()
TRANSITIONS

In [None]:
def bayesian(seq, question):
    """Predict with the latent-state model.

    A posterior over states is computed from ``prior`` and the user's previous
    CORE and TRIK answers. The prediction is always ``answers[slot]`` for the
    question's slot; its probability is:

    * CORE / TRIK questions: the posterior-weighted slot probability;
    * FLEX questions: the same, after pushing the posterior through the
      question's table in ``TRANSITIONS``.

    Parameters
    ----------
    seq : previous rows of the same user (each exposing ``type``, ``question``
        and ``result``).
    question : the row to predict (exposing ``step``, ``type`` and ``question``).

    Returns
    -------
    ``[proba, prediction]``. Falls back to ``baseline`` for steps below 18,
    for any other question type, and for FLEX questions that have no table
    in ``TRANSITIONS``.
    """
    if question.step < 18:
        return baseline(seq, question)

    # The type label is spelled "TRIK" (see computeType); filtering on "TRICK"
    # would silently drop those answers from the evidence.
    biasQuestions = [q for q in seq if q.type in ["CORE", "TRIK"]]
    _p = [[computeProba(s, q) for q in biasQuestions] for s in states]
    _p = np.exp(np.array([sum(np.log(np.array(q))) for q in _p]) + np.log(prior))
    _p /= sum(_p)

    [topic, _, _, features] = question.question.split("_")
    probaIndex = indexes.index(topic + "_" + features)
    prediction = answers[probaIndex]

    if question.type in ["CORE", "TRIK"]:
        proba = sum([_p[i] * s[probaIndex] for i, s in enumerate(states)])
        return [proba, prediction]

    if question.type == "FLEX":
        t = TRANSITIONS.get(question.question)
        if t is None:
            # No table for this question (e.g. it never appeared in training).
            return baseline(seq, question)
        proba = 0
        for i, _d in enumerate(t):
            proba += sum([_p[i] * _d[k] * s[probaIndex] for k, s in enumerate(states)])
        return [proba, prediction]

    return baseline(seq, question)


# Score the Bayesian predictor and print the three metrics with 3 decimals.
LOG_ERR, MSE_ERR, ACC_ERR = test(bayesian)
for template, score in zip(("LOG_ERR = %.3f", "MSE_ERR = %.3f", "ACC_ERR = %.3f"),
                           (LOG_ERR, MSE_ERR, ACC_ERR)):
    print(template % score)

In [None]:
# Print the metrics currently held in LOG_ERR / MSE_ERR / ACC_ERR with 3 decimals.
for template, score in zip(("LOG_ERR = %.3f", "MSE_ERR = %.3f", "ACC_ERR = %.3f"),
                           (LOG_ERR, MSE_ERR, ACC_ERR)):
    print(template % score)

In [None]:
# Print the states as a text table: a header row with the 9 slot names, then
# one row per state with its slot probabilities (3 decimals), separated by rules.
RULE = "-------------------------------------------------------------------------"

print(RULE)
print("|", end=" ")
for column in range(9):
    print(indexes[column], end="  | ")

for s in states:
    print()
    print(RULE)
    print("|", end=" ")
    for value in s:
        print("%.3f" % value, end=" | ")
print()
print(RULE)


In [None]:
# TODO : This code is broken
#plt.hist([userStates.values()], bins=[-0.25 + 0.5 * k for k in range(2 * len(states))])

In [None]:
# Display every user's state weights as stored by expectation().
userStates

In [None]:
# Display the slot values of every state.
states

In [None]:
# Preview the first rows of dataDf.
dataDf.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Preview the first rows of the conditional-frequency table.
BIAS_AGG.head()

In [None]:
# Plot one BIAS_AGG column: the frequency of answering "color" to A_H_M_CO_CORE
# for every conditioning row of the table, with vertical tick labels.
fig, ax = plt.subplots(figsize=(27, 12))
ax.plot(BIAS_AGG["to:A_H_M_CO_CORE==color"])
#ax.plot(BIAS_AGG["to:A_H_M_CO_CORE==orientation"])
plt.xticks(rotation='vertical')
plt.show()