# Case studies

In [89]:
import numpy as np
import pandas as pd
import re

In [52]:
# --- Data locations -------------------------------------------------------
DATADIR = "../usb/releases/20201018/"
CLIENTSFILE = "clients.csv.gz"
# NOTE(review): the dropout labels are read from a different release directory
# than the client answers — confirm this is intended.
DROPOUTDATADIR = "../usb/releases/20200302/"
DROPOUTFILE = "dropout.csv.gz"

# --- Column names ---------------------------------------------------------
CLIENT = "client"
CLIENTID = "clientID"
ANSWERID = "answerId"
ANSWERTEXT = "answerText"
QUESTIONNUMBER = "questionNumber"
DROPOUT = "dropout"

# --- Values of the DROPOUT column (kept as strings) -------------------------
DROPOUTCODE = "1"
COMPLETERCODE = "2"

In [13]:
# Read the dropout labels and split the client IDs into the two groups.
dropout = pd.read_csv(DROPOUTDATADIR + DROPOUTFILE)
dropouts, completers = (
    list(dropout.loc[dropout[DROPOUT] == group_code, CLIENTID])
    for group_code in (DROPOUTCODE, COMPLETERCODE)
)

In [210]:
# Group sizes, displayed as (number of dropouts, number of completers).
len(dropouts),len(completers)

(653, 359)

## Case study 1: compare metadata of dropouts and completers

In [185]:
# Answer IDs of the questionnaire items used as features.
GENDER = "geslacht"
AGE = "leeftijd"
# NOTE(review): "opleidng" (sic) is the ID that is matched against the data;
# do not "correct" the spelling without checking the answer IDs in the data.
EDUCATION = "opleidng"
DAY1 = "dag1"
YESNOIDS = ["dagritme","dsm2","dsm3","dsm4","dsm5","dsm6","dsm7","dsm8","dsm9","dsm11",
            "medicijn","drugs","eetdrang","insult","psych"]

# Answer texts of the gender question.
MAN = "Man"
WOMAN = "Vrouw"

# Key in CONVERSION shared by all yes/no questions listed in YESNOIDS.
YESNO = "YESNO"

# Feature names in the column order produced by make_numeric().
# Defined after DAY1 so this cell also runs on a fresh kernel.
COLUMNS = [GENDER, EDUCATION, DAY1, AGE] + YESNOIDS

# Answer text -> numeric value, per question type.
# Defined after MAN/WOMAN so this cell also runs on a fresh kernel.
CONVERSION = {
    GENDER: {WOMAN: 0, MAN: 1},
    EDUCATION: {"Basisschool": 0, "LBO/MAVO": 1, "MBO": 2, "HAVO/VWO": 3, "HBO": 4, "WO": 5,
                "REMOVED": np.nan},
    YESNO: {"Nee": 0, "Ja": 1},
}

def _is_answer_to(answer_id, question_id):
    """Return True if answer_id is question_id itself or its "0"/"t0"-suffixed variant."""
    return answer_id in (question_id, question_id + "0", question_id + "t0")


def make_numeric(df):
    """Convert long-format answer rows into one numeric feature row per client.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per answer; must provide the CLIENT, ANSWERID and ANSWERTEXT columns.

    Returns
    -------
    list of list
        One row per client, in order of first appearance in ``df``. Column 0 is
        gender, 1 education, 2 the day-1 number, 3 age, and 4 onwards the
        YESNOIDS questions in order. Cells without a (parsable) answer stay
        ``np.nan``. When a client answers the same question more than once,
        the last row wins.

    Raises
    ------
    KeyError
        If a gender, education or yes/no answer text has no entry in CONVERSION.
    """
    fixed_columns = 4  # gender, education, day 1 and age precede the yes/no columns
    row_nbrs = {}
    data_table = []
    # Walk the three needed columns directly; building a row Series with
    # df.iloc[i] for every answer is much slower and yields the same values.
    for client, answer_id, answer_text in zip(df[CLIENT], df[ANSWERID], df[ANSWERTEXT]):
        if client not in row_nbrs:
            row_nbrs[client] = len(data_table)
            data_table.append((fixed_columns + len(YESNOIDS)) * [np.nan])
        features = data_table[row_nbrs[client]]
        if _is_answer_to(answer_id, GENDER):
            features[0] = CONVERSION[GENDER][answer_text]
        if _is_answer_to(answer_id, EDUCATION):
            features[1] = CONVERSION[EDUCATION][answer_text]
        if answer_id == DAY1:
            # Take the first word after the colon: a number, the lower bound of
            # a range such as "11 t/m 15", or "Niet" which is stored as 0.
            try:
                number_word = answer_text.split(":")[1].split()[0]
                if number_word == "Niet":
                    number_word = "0"
                features[2] = int(number_word)
            except (AttributeError, IndexError, ValueError):
                # Missing (NaN) or unparsable answer: best effort, keep NaN.
                pass
        if _is_answer_to(answer_id, AGE):
            try:
                # Keep only the digits of the answer text.
                features[3] = int(re.sub(r"\D", "", answer_text))
            except (TypeError, ValueError):
                # Missing (NaN) answer or no digits at all: keep NaN.
                pass
        for j, yes_no_id in enumerate(YESNOIDS):
            if _is_answer_to(answer_id, yes_no_id):
                features[fixed_columns + j] = CONVERSION[YESNO][answer_text]

    return data_table

In [155]:
# Load all client answers, then keep the rows of each labelled group.
client_data = pd.read_csv(DATADIR + CLIENTSFILE)
dropout_data = client_data.loc[client_data[CLIENT].isin(dropouts)]
completer_data = client_data.loc[client_data[CLIENT].isin(completers)]

In [138]:
# Collect the distinct answer texts per answer ID (in first-seen order) and
# print the IDs that have exactly two different answers.
answer_texts = {}
# One pass over the two columns instead of two dropout_data.iloc[i] row
# lookups per answer.
for answer_id, answer_text in zip(dropout_data[ANSWERID], dropout_data[ANSWERTEXT]):
    seen_texts = answer_texts.setdefault(answer_id, [])
    if answer_text not in seen_texts:
        seen_texts.append(answer_text)
for answer_id, seen_texts in answer_texts.items():
    if len(seen_texts) == 2:
        print(answer_id, seen_texts)

geslacht ['Vrouw', 'Man']
dagritme ['Ja', 'Nee']
verhaal [nan, 'REMOVED']
dsm1 ['Nee', 'Ja']
dsm2 ['Ja', 'Nee']
dsm3 ['Ja', 'Nee']
dsm4 ['Ja', 'Nee']
dsm5 ['Nee', 'Ja']
dsm6 ['Nee', 'Ja']
dsm7 ['Nee', 'Ja']
dsm8 ['Ja', 'Nee']
dsm9 ['Nee', 'Ja']
dsm10 ['Nee', 'Ja']
dsm11 ['Nee', 'Ja']
medicijn ['Nee', 'Ja']
medi3 [nan, 'REMOVED']
medi4 [nan, 'REMOVED']
medi5 [nan, 'REMOVED']
medivr3 [nan, 'REMOVED']
medivr5 [nan, 'REMOVED']
medidos5 [nan, 'REMOVED']
drugs ['Nee', 'Ja']
drugs2 [nan, 'REMOVED']
drugs3 [nan, 'REMOVED']
drugs4 [nan, 'REMOVED']
drugs5 [nan, 'REMOVED']
drugsvk3 [nan, 'REMOVED']
drugsvk4 [nan, 'REMOVED']
drugsvk5 [nan, 'REMOVED']
drugsln2 [nan, 'REMOVED']
drugsln3 [nan, 'REMOVED']
drugsln4 [nan, 'REMOVED']
drugsln5 [nan, 'REMOVED']
eetdrang ['Nee', 'Ja']
insult ['Nee', 'Ja']
delirium ['Nee', 'Ja']
psych ['Nee', 'Ja']
psywaar3 [nan, 'REMOVED']
psywaar4 [nan, 'REMOVED']
psywaar5 [nan, 'REMOVED']
psyvoor4 [nan, 'REMOVED']
psyvoor5 [nan, 'REMOVED']
psytijd3 [nan, 'REMOVED']
psytij

In [116]:
# Number of dropout rows per distinct answer text of the "dag1" question.
dag1_answers = dropout_data[dropout_data[ANSWERID] == "dag1"]
dag1_answers.groupby(ANSWERTEXT)[CLIENT].count()

answerText
dinsdag 01 november (Gisteren): Niet gedronken     1
dinsdag 01 september (Gisteren): 2                 1
dinsdag 01 september (Gisteren): Niet gedronken    1
dinsdag 03 januari (Gisteren): Niet gedronken      1
dinsdag 03 november (Gisteren): 11 t/m 15          1
                                                  ..
zondag 29 maart (Gisteren): 7                      1
zondag 29 maart (Gisteren): 9                      1
zondag 29 maart (Gisteren): Niet gedronken         1
zondag 30 augustus (Gisteren): 4                   1
zondag 30 januari (Gisteren): Niet gedronken       1
Name: client, Length: 399, dtype: int64

In [50]:
# Number of completer rows per distinct answer text of question number 1.
first_question_answers = completer_data[completer_data[QUESTIONNUMBER] == 1]
first_question_answers.groupby(ANSWERTEXT)[CLIENT].count()

answerText
Man      140
Vrouw    219
Name: client, dtype: int64

In [168]:
# Keep only the answers to the feature questions, then build one numeric
# feature row per client for each group.
feature_ids = [GENDER, EDUCATION, DAY1, AGE] + YESNOIDS
dropout_table, completer_table = (
    make_numeric(group_data[group_data[ANSWERID].isin(feature_ids)])
    for group_data in (dropout_data, completer_data)
)

In [193]:
# Feature rows of both groups (dropouts first) with the matching class labels.
X = [*dropout_table, *completer_table]
y = [DROPOUTCODE] * len(dropout_table) + [COMPLETERCODE] * len(completer_table)

In [207]:
def column_average(table,column_id):
    """Return the mean of one column of a list-of-rows table, rounded to 2 decimals.

    NaN cells are left out of the average.
    """
    observed = []
    for record in table:
        value = record[column_id]
        if np.isnan(value):
            continue
        observed.append(value)
    mean_value = np.average(observed)
    return round(mean_value, 2)

In [208]:
from sklearn.feature_selection import f_classif
import numpy as np
from sklearn.impute import SimpleImputer

# Fill missing feature values with the column mean, run the ANOVA F-test per
# feature, and list the features by ascending p-value as
# name -> (p-value, completer average, dropout average).
# NOTE(review): this assumes the imputer keeps every column, so index i in p
# lines up with COLUMNS[i] — verify if a feature can be missing for all clients.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
X_imputed = imp.fit(X).transform(X)
F, p = f_classif(X_imputed, y)
ranked_ids = sorted(range(0, len(p)), key=lambda i: p[i])
{
    COLUMNS[i]: (p[i], column_average(completer_table, i), column_average(dropout_table, i))
    for i in ranked_ids
}

{'leeftijd': (4.6322007232081313e-07, 47.43, 43.56),
 'dag1': (5.5049954236910925e-05, 4.93, 6.33),
 'opleidng': (0.001102226979458426, 3.23, 2.89),
 'geslacht': (0.0023978688268680983, 0.39, 0.51),
 'dagritme': (0.004488127845409525, 0.8, 0.71),
 'drugs': (0.008044668229341712, 0.07, 0.13),
 'dsm11': (0.039930332834349436, 0.55, 0.62),
 'dsm2': (0.04944763686709503, 0.95, 0.91),
 'psych': (0.0653998948081495, 0.64, 0.57),
 'dsm7': (0.07401935906521508, 0.23, 0.29),
 'dsm5': (0.1001909920413593, 0.29, 0.35),
 'dsm8': (0.10474548451394548, 0.36, 0.42),
 'dsm9': (0.12421897072520555, 0.32, 0.37),
 'insult': (0.16899087407186258, 0.01, 0.03),
 'eetdrang': (0.29018157129260574, 0.11, 0.13),
 'medicijn': (0.3115643877908566, 0.52, 0.56),
 'dsm6': (0.3380403597181849, 0.87, 0.84),
 'dsm3': (0.46550863926864083, 0.59, 0.62),
 'dsm4': (0.800440951589716, 0.84, 0.84)}

## Sandbox

In [169]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Sandbox: Gaussian naive Bayes on the iris data, half of it held out for testing.
# NOTE: this rebinds the notebook-level names X and y used by the case study above.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.5
)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [170]:
# Classes predicted by the naive Bayes model for the held-out samples.
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1,
       1, 2, 0, 0, 2, 1, 0, 0, 1])

In [172]:
# True classes of the held-out samples, for comparison with y_pred.
y_test


array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,
       1, 2, 0, 0, 2, 1, 0, 0, 1])