# Case studies

In [1]:
import numpy as np
import pandas as pd
import re
import sys

In [2]:
DATADIR = "../usb/releases/20201018/"
DROPOUTDATADIR = "../usb/releases/20200302/"
CLIENTSFILE = "clients.csv.gz"
DROPOUTFILE = "dropout.csv.gz"
ANSWERID = "answerId"
ANSWERTEXT = "answerText"
ANSWERTITLE = "answerTitle"
CLIENT = "client"
CLIENTID = "clientID"
DROPOUT = "dropout"
QUESTIONNUMBER = "questionNumber"
REMOVED = "REMOVED"
COMPLETERCODE = "2"
DROPOUTCODE = "1"

In [3]:
def read_dropout_ids():
    dropout = pd.read_csv(DROPOUTDATADIR+DROPOUTFILE)
    dropout_ids = list(dropout[dropout[DROPOUT]==DROPOUTCODE][CLIENTID])
    completer_ids = list(dropout[dropout[DROPOUT]==COMPLETERCODE][CLIENTID])
    return(dropout_ids,completer_ids)

In [4]:
dropout_ids,completer_ids = read_dropout_ids()
len(dropout_ids),len(completer_ids)

(653, 359)

## Case study 1: compare metadata of dropouts and completers

In [74]:
GENDER = "geslacht"
AGE = "leeftijd"
EDUCATION = "opleidng"
DAY1 = "dag1"
MAN = "Man"
WOMAN = "Vrouw"
ANSWER = "answer"
COUNT = "count"
YESNO = "YESNO"
YESNOIDS = ["dagritme","dsm2","dsm3","dsm4","dsm5","dsm6","dsm7","dsm8","dsm9","dsm11",
            "medicijn","drugs","eetdrang","insult","delirium","psych","tabak","canna",
            "coca","speed","xtc","ghb","opiat","sleep","gok","behversl","halluci",
            "suicide","wanen","benniet"]
COLUMNS = [{ANSWER:answer, COUNT:1} for answer in [GENDER,EDUCATION,DAY1,AGE]+YESNOIDS]
CONVERSION = { GENDER: {WOMAN:0,MAN:1},
               EDUCATION: {"Basisschool":0,"LBO/MAVO":1,"MBO":2,"HAVO/VWO":3,"HBO":4,"WO":5,"REMOVED":np.nan},
               YESNO: {"Nee":0,"Ja":1}}

def make_numeric(df):
    row_nbrs = {}
    data_table = []
    for i in range(0,len(df)):
        row = df.iloc[i]
        client = row[CLIENT]
        if not client in row_nbrs:
            row_nbrs[client] = len(data_table)
            data_table.append((4+len(YESNOIDS))*[np.nan])
        if row[ANSWERID] == GENDER or row[ANSWERID] == GENDER+"0" or row[ANSWERID] == GENDER+"t0":
            data_table[row_nbrs[client]][0] = CONVERSION[GENDER][row[ANSWERTEXT]]
        if row[ANSWERID] == EDUCATION  or row[ANSWERID] == EDUCATION+"0" or row[ANSWERID] == EDUCATION+"t0":
            data_table[row_nbrs[client]][1] = CONVERSION[EDUCATION][row[ANSWERTEXT]]
        if row[ANSWERID] == DAY1:
            try:
                number_word = row[ANSWERTEXT].split(":")[1].split()[0]
                if number_word == "Niet": number_word = "0"
                data_table[row_nbrs[client]][2] = int(number_word)
            except: pass
        if row[ANSWERID] == AGE or row[ANSWERID] == AGE+"0" or row[ANSWERID] == AGE+"t0":
            try:
                data_table[row_nbrs[client]][3] = int(re.sub("\D","",row[ANSWERTEXT]))
            except: pass
        for j in range(0,len(YESNOIDS)):
            if row[ANSWERID] == YESNOIDS[j] or row[ANSWERID] == YESNOIDS[j]+"0" or row[ANSWERID] == YESNOIDS[j]+"t0":
                data_table[row_nbrs[client]][4+j] = CONVERSION[YESNO][row[ANSWERTEXT]]

    return(data_table)

In [6]:
def read_data(dropout_ids,completer_ids):
    client_data = pd.read_csv(DATADIR+CLIENTSFILE)
    dropout_data = client_data[client_data[CLIENT].isin(dropout_ids)]
    completer_data = client_data[client_data[CLIENT].isin(completer_ids)]
    return(dropout_data,completer_data)

In [7]:
dropout_data,completer_data = read_data(dropout_ids,completer_ids)

In [148]:
def verify_data_completeness(dropout_data,completer_data,dropout_ids,completer_ids):
    dropout_data_ids = list(dropout_data[CLIENT])
    for client_id in dropout_ids:
        if client_id not in dropout_data_ids:
            print(f"missing dropout client id: {client_id}")

    completer_data_ids = list(completer_data[CLIENT])
    for client_id in completer_ids:
        if client_id not in completer_data_ids:
            print(f"missing completer client id: {client_id}")
            
verify_data_completeness(dropout_data,completer_data,dropout_ids,completer_ids)

missing dropout client id: AdB1263
missing dropout client id: AdB1469


### 1.1 Experiment with binary answer classes

In [260]:
def cleanup_answer_text(text):
    if pd.isna(text): return(text)
    text = str(text).lower()
    text = re.sub("\s+"," ",text)
    text = text.strip()
    return(text)

def normalize_answer_id(answer_id, first_answer_id):
    if first_answer_id == GESLACHT: 
        new_answer_id = answer_id
    elif first_answer_id == GESLACHT0:
        if re.search("0h$",answer_id):
            new_answer_id = re.sub("0h$","h",answer_id)
        else:
            new_answer_id = re.sub("0$","",answer_id)
    elif first_answer_id == GESLACHTT0: 
        new_answer_id = re.sub("t0$","",answer_id)
    else: 
        sys.exit(f"unknown first answer id: {first_answer_id}!")
    if re.search("^(goTo[0-9]|ltgeslacht1|doel)$",answer_id):
        return("")
    if (first_answer_id != GESLACHT and new_answer_id == answer_id and 
        not answer_id == EXCEPTIONANSWERID and not re.search(NONQUESTIONS,answer_id)):
        sys.exit(f"first answer id {first_answer_id} did not change {answer_id}!")
    return(new_answer_id)

In [261]:
def find_binary_answer_classes(dropout_data,completer_data):
    answer_texts = {}
    questionnaire_types = {}
    data = pd.concat([dropout_data,completer_data])
    for i in range(0,len(data)):
        if not client_id in questionnaire_types:
            questionnaire_types[client_id] = data.iloc[i][ANSWERID]
        answer_text = cleanup_answer_text(data.iloc[i][ANSWERTEXT])
        answer_title = cleanup_answer_text(data.iloc[i][ANSWERTITLE])
        if pd.isna(answer_title): answer_title = ""
        answer_id = normalize_answer_id(data.iloc[i][ANSWERID],questionnaire_types[client_id])
        answer_key = answer_id+"#"+answer_title
        if not answer_key in answer_texts.keys(): answer_texts[answer_key] = []
        if not answer_text in answer_texts[answer_key] and not pd.isna(answer_text) and not answer_text == "removed":
            answer_texts[answer_key].append(answer_text)
    results = []
    for answer_key in answer_texts:
        if len(answer_texts[answer_key]) == 2: 
            results.append((answer_key,answer_texts[answer_key]))
    return(results)

In [263]:
results = find_binary_answer_classes(dropout_data,completer_data)

geslacht# ['vrouw', 'man']
dagritme# ['ja', 'nee']
dsm1#1.heb je in de afgelopen 12 maanden gemerkt dat je veel meer alcohol nodig begon te hebben om hetzelfde effect te bereiken of dat dezelfde hoeveelheid minder effect had dan voorheen? ['nee', 'ja']
dsm2#2.heb je in de afgelopen 12 maanden het verlangen gehad om te stoppen of zonder succes geprobeerd te stoppen of minderen met alcohol? ['ja', 'nee']
dsm3#3.heb je in de afgelopen 12 maanden veel tijd besteed aan het gebruik, verkrijgen, of bijkomen van de effecten van alcohol? ['ja', 'nee']
dsm4#4.heb je in de afgelopen 12 maanden vaak alcohol in grotere hoeveelheden of langer gebruikt dan je van plan was, of het moeilijk gevonden te stoppen met het gebruik van alcohol voor je dronken was? ['ja', 'nee']
dsm5#5.voelde je je in de afgelopen 12 maanden ziek of onwel bij het stoppen of minderen met alcohol of gebruikte je alcohol om deze gevoelens te voorkomen? ['nee', 'ja']
dsm6#6.ging je in de afgelopen 12 maanden door met het gebruik 

In [24]:
dropout_table = make_numeric(dropout_data[dropout_data[ANSWERID].isin(COLUMNS)])
completer_table = make_numeric(completer_data[completer_data[ANSWERID].isin(COLUMNS)])

In [76]:
from sklearn.feature_selection import f_classif
import numpy as np
from sklearn.impute import SimpleImputer

def run_anova(dropout_table,completer_table):
    X = dropout_table+completer_table
    y = len(dropout_table)*[DROPOUTCODE]+len(completer_table)*[COMPLETERCODE]
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(X)
    F,p_values = f_classif(imp.transform(X),y)
    return(p_values)

def column_average(table,column_id):
    return(round(np.average([row[column_id] for row in table if not np.isnan(row[column_id])]),2))

def sort_p_values(p_values,column_names,dropout_table,completer_table):
    return({column_names[i][ANSWER]:(p_values[i],column_names[i][COUNT],column_average(completer_table,i),column_average(dropout_table,i)) 
            for i in sorted(range(0,len(p_values)),key=lambda i:p_values[i])})

In [25]:
p_values = run_anova(dropout_table,completer_table)
sort_p_values(p_values,COLUMNS,dropout_table,completer_table)

{'leeftijd': (4.6322007232081313e-07, 47.43, 43.56),
 'dag1': (5.5049954236910925e-05, 4.93, 6.33),
 'opleidng': (0.001102226979458426, 3.23, 2.89),
 'geslacht': (0.0023978688268680983, 0.39, 0.51),
 'dagritme': (0.004488127845409525, 0.8, 0.71),
 'drugs': (0.008044668229341712, 0.07, 0.13),
 'dsm11': (0.039930332834349436, 0.55, 0.62),
 'dsm2': (0.04944763686709503, 0.95, 0.91),
 'psych': (0.0653998948081495, 0.64, 0.57),
 'dsm7': (0.07401935906521508, 0.23, 0.29),
 'dsm5': (0.1001909920413593, 0.29, 0.35),
 'dsm8': (0.10474548451394548, 0.36, 0.42),
 'dsm9': (0.12421897072520555, 0.32, 0.37),
 'insult': (0.16899087407186258, 0.01, 0.03),
 'eetdrang': (0.29018157129260574, 0.11, 0.13),
 'medicijn': (0.3115643877908566, 0.52, 0.56),
 'dsm6': (0.3380403597181849, 0.87, 0.84),
 'dsm3': (0.46550863926864083, 0.59, 0.62),
 'delirium': (0.7124196594796546, 0.05, 0.06),
 'dsm4': (0.800440951589716, 0.84, 0.84)}

### 1.2 Experiment with all answer classes

In [207]:
GESLACHT = "geslacht"
GESLACHT0 = "geslacht0"
GESLACHTT0 = "geslachtt0"
NONQUESTIONS = "^(goTo[0-9]|ltgeslacht1|doel)$"
EXCEPTIONANSWERID = "mdoel"
MINANSWERCOUNT = 10
MAXPVALUE = 0.01
ANSWER = "answer"
COUNT = "count"

def get_binary_answers(data):
    questionnaire_types = {}
    answers_per_client = {}
    for i in range(0,len(data)):
        client_id = data.iloc[i][CLIENT]
        if not client_id in questionnaire_types:
            questionnaire_types[client_id] = data.iloc[i][ANSWERID]
            answers_per_client[client_id] = []
        answer_id = normalize_answer_id(data.iloc[i][ANSWERID], questionnaire_types[client_id])
        if re.search(NONQUESTIONS,answer_id): continue
        answer_text = cleanup_answer_text(data.iloc[i][ANSWERTEXT])
        answer_title = cleanup_answer_text(data.iloc[i][ANSWERTITLE])
        if not pd.isna(answer_text) and answer_text != "removed":
            if pd.isna(answer_title): answer_title = ""
            answers_per_client[client_id].append(answer_id+"#"+answer_title+"#"+answer_text)
    return(answers_per_client, questionnaire_types)

def count_answers(binary_answers):
    answer_counts = {}
    for client_id in binary_answers:
        for answer in binary_answers[client_id]:
            if answer in answer_counts:
                answer_counts[answer] += 1
            else:
                answer_counts[answer] = 1
    return({answer:answer_counts[answer] for answer in sorted(answer_counts.keys(),key=lambda a:answer_counts[a],reverse=True)})

def make_binary_table(data,binary_answers):
    answer_counts = count_answers(binary_answers)
    binary_table = []
    answers_used = []
    for client_id in data[CLIENT].unique():
        binary_table.append([])
        for answer in answer_counts:
            if answer_counts[answer] >= MINANSWERCOUNT:
                if answer in binary_answers[client_id]:
                    binary_table[-1].append(1)
                else:
                    binary_table[-1].append(0)
    for answer in answer_counts:
        if answer_counts[answer] >= MINANSWERCOUNT:
            answers_used.append({ANSWER:answer, COUNT:answer_counts[answer]})
    return(binary_table, answers_used)

def convert_data_to_binary(dropout_data,completer_data):
    all_data = pd.concat([dropout_data,completer_data])
    binary_answers, questionnaire_types = get_binary_answers(all_data)
    dropout_table_binary, answers_used = make_binary_table(dropout_data,binary_answers)
    completer_table_binary, answers_used = make_binary_table(completer_data,binary_answers)
    return(dropout_table_binary, completer_table_binary, answers_used, binary_answers, questionnaire_types)

def select_p_values(p_values,column_names,dropout_table,completer_table):
    return({column_names[i][ANSWER]:(p_values[i],column_average(completer_table,i),column_average(dropout_table,i)) 
            for i in sorted(range(0,len(p_values)),key=lambda i:p_values[i])
            if column_average(completer_table,i) < column_average(dropout_table,i) and p_values[i] < MAXPVALUE})

In [208]:
dropout_table_binary, completer_table_binary, answers_used, binary_answers, questionnaire_types = convert_data_to_binary(dropout_data,completer_data)

In [151]:
len(dropout_table_binary[0]),len(completer_table_binary[0]),len(answers_used), len(binary_answers)

(1078, 1078, 1078, 1010)

In [115]:
p_values = run_anova(dropout_table_binary,completer_table_binary)

In [192]:
list(sort_p_values(p_values,answers_used,dropout_table_binary,completer_table_binary).items())[:10]

316


[('opiat##nee', (5.940101708792509e-17, 240, 0.09, 0.32)),
 ('opiath##0', (5.940101708792509e-17, 240, 0.09, 0.32)),
 ('mateicn10#10.had je gebrek aan onderdak of had je problemen met huisvesting?#niet / geen',
  (6.94472981470106e-17, 236, 0.09, 0.31)),
 ('national##nederlands', (8.719316831085801e-17, 239, 0.09, 0.32)),
 ('gokken##nee', (9.843857591931877e-17, 716, 0.87, 0.62)),
 ('mateicn13#13.had je er moeite mee voor een veilige slaapplaats of voor beschermende kleding te zorgen?#niet / geen',
  (1.277725495106288e-16, 238, 0.09, 0.32)),
 ('##-1', (1.5524613433478835e-16, 341, 0.08, 0.3)),
 ('ghb##nee', (3.9803276216290714e-16, 235, 0.09, 0.31)),
 ('ghbh##0', (3.9803276216290714e-16, 235, 0.09, 0.31)),
 ('cultherk##nederlands', (5.793707821605359e-16, 234, 0.09, 0.31))]

In [197]:
def print_answer_ids_freqs(dropout_data,completer_data):
    all_data = pd.concat([dropout_data,completer_data])
    for answer_combi in ["opiat##nee","opiath##0","mateicn10#10.had je gebrek aan onderdak of had je problemen met huisvesting?#niet / geen",
                         "national##nederlands","gokken##nee"]:
        answer = answer_combi.split("#")[0]
        print(answer,len(all_data[all_data["answerId"]==answer]),len(all_data[all_data["answerId"]==answer+"0"]),len(all_data[all_data["answerId"]==answer+"t0"]))
        
print_answer_ids_freqs(dropout_data,completer_data)

opiat 0 245 0
opiath 0 0 0
mateicn10 245 245 0
national 0 245 0
gokken 706 0 38


In [198]:
dropout_predictors = list(select_p_values(p_values,answers_used,dropout_table_binary,completer_table_binary).keys())
print(len(dropout_predictors))

316


In [211]:
def get_dropout_predictor_scores(data,dropout_predictors,binary_answers,questionnaire_types):
    scores = []
    for client_id in data[CLIENT].unique():
        score = 0
        for predictor in dropout_predictors:
            if predictor in binary_answers[client_id]: 
                score += 1
        scores.append((score,client_id,questionnaire_types[client_id]))
    return(scores)

In [248]:
dropout_predictor_scores_dropout = get_dropout_predictor_scores(dropout_data,dropout_predictors,binary_answers,questionnaire_types)
dropout_predictor_scores_completer = get_dropout_predictor_scores(completer_data,dropout_predictors,binary_answers,questionnaire_types)
print([score_tuple[0] for score_tuple in sorted(dropout_predictor_scores_dropout,key=lambda s:s[0],reverse=True)])
print([score_tuple[0] for score_tuple in sorted(dropout_predictor_scores_completer,key=lambda s:s[0],reverse=True)])

[119, 118, 117, 116, 116, 115, 115, 115, 114, 114, 114, 114, 113, 113, 113, 112, 112, 112, 112, 112, 111, 111, 111, 111, 111, 111, 110, 110, 110, 110, 110, 110, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 100, 100, 100, 100, 100, 100, 100, 99, 99, 99, 99, 99, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97, 96, 96, 96, 96, 96, 94, 94, 93, 92, 91, 91, 90,

In [223]:
print(f"selected 104: 3 completed: {(104-3)/104}")
print(f"selected 245: 32 completed: {(245-32)/245}")

selected 104: 3 completed: 0.9711538461538461
selected 245: 32 completed: 0.8693877551020408


In [244]:
groups = pd.DataFrame.from_dict(questionnaire_types,orient="index").groupby(0).groups
{g:len(groups[g]) for g in groups}

{'geslacht': 706, 'geslacht0': 266, 'geslachtt0': 38}

## Sandbox

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [None]:
y_pred

In [None]:
y_test
