# Tactus metadata

Extract therapy result assessments from the metadata

In [None]:
import csv
import gzip
import numpy as np
import os
import pandas as pd
import re
import sys
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

sys.path.append("/home/erikt/projects/e-mental-health/enron/orange-hackathon/orangehackathon/libs")
import tactusloaderLIB
import OWEmailSorterLIB
import markduplicatesLIB
import removemarkedtextLIB
import LIWCLIB

%matplotlib inline

In [None]:
DIRECTORY = "/home/erikt/projects/e-mental-health/usb/tmp/20190917/"

All clients (1986) have filled in an intake form. However, there are three different formats of the intake form, which we distinguish by the label of question 1 about gender:
1. 1-geslacht0: 1045 clients
2. 1-geslacht: 897 clients
3. 1-geslachtt0: 44 clients

In [None]:
VARIANTS = ["1-geslacht0","1-geslacht","1-geslachtt0"]
FILENAMEPREFIX = "AdB"

def countFiles(directory,prefix):
    """Count the entries of directory whose name matches prefix at the start.

    prefix is used as a regular expression, anchored with "^".
    """
    nbrOfMatches = 0
    for entryName in os.listdir(directory):
        if re.search("^"+prefix,entryName):
            nbrOfMatches += 1
    return(nbrOfMatches)

# Count the client files of every intake form variant and plot the totals
nbrOfFiles = []
for variant in VARIANTS:
    nbrOfFiles.append(countFiles(DIRECTORY+variant,FILENAMEPREFIX))
plt.bar(VARIANTS,nbrOfFiles)
plt.title("Number of clients per type of intake form")
nbrOfFiles

Few clients fill in one of the four subsequent questionnaires: half-way, end, after three months and after six months. We want to use the first two to measure their progress in the therapy, so we check how frequently these forms are present.

## Get questionnaire info

In [None]:
import sys
sys.path.insert(1, '/home/erikt/project/e-mental-health/data-processing')
import tactus2table

In [None]:
TITLE = "0-title"
TUSSENMETING = "Lijst tussenmeting"
NAMETING = "Lijst nameting"

# Walk over all client files of every intake form variant and record, per
# questionnaire found, whether it is a tussenmeting or a nameting form.
tussenmeting = []
nameting = []
questionnairesCountTussenmeting = dict.fromkeys(VARIANTS,0)
questionnairesCountNameting = dict.fromkeys(VARIANTS,0)
listVariants = {}
for variant in VARIANTS:
    variantDirectory = DIRECTORY+"/"+variant
    for inFileName in os.listdir(variantDirectory):
        if not re.search("^"+FILENAMEPREFIX,inFileName):
            continue
        # remember in which variant directory the client file was found
        listVariants[inFileName] = variant
        root = tactus2table.readRootFromFile(variantDirectory+"/"+inFileName)
        questionnaires = tactus2table.getQuestionnaires(root,inFileName)
        for questionnaire in questionnaires:
            title = questionnaire[TITLE]
            if title == TUSSENMETING:
                questionnairesCountTussenmeting[variant] += 1
                tussenmeting.append(inFileName)
            elif title == NAMETING:
                questionnairesCountNameting[variant] += 1
                nameting.append(inFileName)

In [None]:
questionnairesCountTussenmeting,questionnairesCountNameting

In [None]:
len(tussenmeting) == 519 and len(nameting) == 312

## Get start years

In [None]:
STARTDATE = "./Treatment/StartDate"
ENDDATE = "./Treatment/EndDate"

def getStartEndDate(inFileName):
    """Read the treatment start and end date from a gzipped XML client file.

    Returns a tuple (startDate,endDate) of stripped strings. A date is the
    empty string when its element is missing or has no text.
    """
    # the with statement closes the file even when reading fails
    with gzip.open(inFileName,"rt",encoding="utf-8") as inFile:
        root = ET.fromstring(inFile.read())
    # findtext returns None for a missing element and "" for an element
    # without text; both cases end up as the empty string
    startDate = (root.findtext(STARTDATE) or "").strip()
    endDate = (root.findtext(ENDDATE) or "").strip()
    return(startDate,endDate)
    
def getStartEndYears(directory):
    """Collect the start and end year of every client file in directory.

    Only files whose name matches FILENAMEPREFIX at the start are read.
    Returns two dictionaries (startYears,endYears) keyed by file name; the
    year is taken from the first four characters of the date and is 0 when
    the date is empty.
    """
    startYears = {}
    endYears = {}
    for inFileName in sorted(os.listdir(directory)):
        if re.search("^"+FILENAMEPREFIX,inFileName):
            # os.path.join works with and without a trailing slash on directory
            startDate,endDate = getStartEndDate(os.path.join(directory,inFileName))
            startYears[inFileName] = int(startDate[0:4]) if startDate != "" else 0
            endYears[inFileName] = int(endDate[0:4]) if endDate != "" else 0
    return(startYears,endYears)

startYears,endYears = getStartEndYears(DIRECTORY)

In [None]:
# Plot the number of treatments per start year, leaving out year 0
knownStartYears = [year for year in startYears.values() if year > 0]
years,counts = np.unique(knownStartYears,return_counts=True)
plt.title("start years")
myplot = plt.bar(list(years),list(counts))

In [None]:
# Count per intake form variant how many clients have a start year and
# how many have an end year
listStarted = dict.fromkeys(VARIANTS,0)
for fileName,startYear in startYears.items():
    if startYear > 0:
        listStarted[listVariants[fileName]] += 1
listEnded = dict.fromkeys(VARIANTS,0)
for fileName,endYear in endYears.items():
    if endYear > 0:
        listEnded[listVariants[fileName]] += 1

listStarted,sum(listStarted.values()) == 923,listEnded,sum(listEnded.values()) == 691

## Get client status

In [None]:
STATUS = "./Treatment/Status"
# NOTE(review): 'N;' looks like a typo for 'N' — confirm before using STATUSCODES
STATUSCODES = { 'Dropout':'D','FollowUp':'F','NotStarted':'N;','Part1':'1','Part2':'2','RctControlGroup':'R'}

def getStatus(inFileName):
    """Read the treatment status from a gzipped XML client file.

    Returns the stripped text of the first STATUS element, or the empty
    string when the element is missing or has no text.
    """
    # the with statement closes the file even when reading fails
    with gzip.open(inFileName,"rt",encoding="utf-8") as inFile:
        root = ET.fromstring(inFile.read())
    # findtext returns None for a missing element and "" for an empty one
    return((root.findtext(STATUS) or "").strip())

def getStatuses(directory):
    """Collect the treatment status of every client file in directory.

    Only files whose name matches FILENAMEPREFIX at the start are read.
    Returns a dictionary that maps the file name to its status string.
    """
    statuses = {}
    for inFileName in sorted(os.listdir(directory)):
        if re.search("^"+FILENAMEPREFIX,inFileName):
            # os.path.join works with and without a trailing slash on directory
            statuses[inFileName] = getStatus(os.path.join(directory,inFileName))
    return(statuses)

statuses = getStatuses(DIRECTORY)

## Create bar graph

In [None]:
barWidth = 0.23
index = np.arange(len(VARIANTS))
startedValues = list(listStarted.values())
tussenmetingValues = list(questionnairesCountTussenmeting.values())
nametingValues = list(questionnairesCountNameting.values())

fig,ax = plt.subplots(figsize=(12,4))
# write the value above every bar: one group per variant, four bars per group
labelSeries = [nbrOfFiles,startedValues,tussenmetingValues,nametingValues]
for i in range(0,len(nbrOfFiles)):
    for barId,seriesValues in enumerate(labelSeries):
        ax.text(barId*barWidth+i-0.2*barWidth,seriesValues[i]+10,str(seriesValues[i]))
# draw the four bar series next to each other, shifted by one bar width each
barSeries = [("intake",nbrOfFiles),
             ("start date",listStarted.values()),
             ("tussenmeting",questionnairesCountTussenmeting.values()),
             ("nameting",questionnairesCountNameting.values())]
for barId,(barLabel,barHeights) in enumerate(barSeries):
    plt.bar(index+barId*barWidth,barHeights,barWidth,label=barLabel)
plt.xticks(index+barWidth,VARIANTS)
plt.title("Number of responses per type of intake form")
plot = plt.legend(prop={"size":12})

## Count emails (outputs three lines) 

Since the response frequencies for clients with intake format "1-geslacht" are a lot higher than those of the other two formats, it makes sense to focus on this format. Let's check how many of those 897 clients sent at least one email to the counselor.

In [None]:
CLIENT = "CLIENT"
FIELDIDFROM = 1
FIELDIDTEXT = 5
SHORTPROGRAMME = "Duurt circa zes weken"
SELFHELP1 = "zelfhulpvariant"
SELFHELP2 = "zelf aan de slag te gaan"
SELFHELP3 = "zelfhulpprogramma"

# For every client file: count the mails sent by the client and check if any
# mail text mentions the short programme or one of the self help phrases.
listClientSentEmail = {}
listShortProgramme= []
listSelfHelp = []
listVariants = {}
for variant in VARIANTS:
    variantDirectory = DIRECTORY+"/"+variant
    for inFileName in os.listdir(variantDirectory):
        if not re.search("^"+FILENAMEPREFIX,inFileName):
            continue
        listVariants[inFileName] = variant
        root = tactus2table.readRootFromFile(variantDirectory+"/"+inFileName)
        emails = tactus2table.getEmailData(root,inFileName)
        nbrOfClientEmails = 0
        shortProgramme = False
        selfHelp = False
        for email in emails:
            if email[FIELDIDFROM] == CLIENT:
                nbrOfClientEmails += 1
            emailText = email[FIELDIDTEXT]
            if re.search(SHORTPROGRAMME,emailText):
                shortProgramme = True
            if any(re.search(phrase,emailText)
                   for phrase in (SELFHELP1,SELFHELP2,SELFHELP3)):
                selfHelp = True
        if nbrOfClientEmails > 0:
            listClientSentEmail[inFileName] = nbrOfClientEmails
        # a client is listed as self help only when not in the short programme
        if shortProgramme:
            listShortProgramme.append(inFileName)
        elif selfHelp:
            listSelfHelp.append(inFileName)
    # running totals after each variant
    print(variant,len(listClientSentEmail),len(listShortProgramme),len(listSelfHelp))

## File selection inspection

We want to know which clients were selected for analysis. Here we store an overview of all aspects of the data files. First we read the file containing the list of 990 clients selected by our student.

In [None]:
STUDENTFILE = "/home/erikt/projects/e-mental-health/usb/releases/20200302/dropout.csv.gz"
DROPOUT = "dropout"
CLIENTID = "clientID"

# Read the student's labels: keep the clients labeled "1" or "2" and store
# them under the client id without the (-an).xml(.gz) file name suffix.
studentSelection = []
dropout = {}
# the with statement closes the file even when reading fails
with gzip.open(STUDENTFILE,"rt",encoding="utf-8") as inFile:
    for row in csv.DictReader(inFile):
        # dots are escaped so that only a literal ".xml"/".gz" suffix is removed
        clientId = re.sub(r"(-an)?\.xml(\.gz)?$","",row[CLIENTID])
        if row[DROPOUT] in ("1","2"):
            studentSelection.append(clientId)
            dropout[clientId] = row[DROPOUT]

(len(studentSelection) == 1012)

The student did not receive all intake forms. We check for which clients an intake form was available and for which not.

In [None]:
INTAKEFILE = "/home/erikt/projects/e-mental-health/usb/releases/20200121/intake.csv.gz"
IDFIELD = "0-id"

# Collect the ids of all clients present in the intake file.
# The with statement closes the file even when reading fails.
with gzip.open(INTAKEFILE,"rt",encoding="utf-8") as inFile:
    intakeForms = [row[IDFIELD] for row in csv.DictReader(inFile)]

(len(intakeForms) == 1940)

Check the relation between the clients that have filled in the tussenmeting and nameting questionnaires and the dropout labels.

In [None]:
GZEXTENSION = ".gz"
XMLEXTENSION = ".xml"
ANOSTRING = "-an"
TOTAL = "total"
UNKNOWN = "unknown"
DROPOUT = "dropout"
FINISHER = "finisher"

def shortenFileName(fileName):
    """Strip the ".gz", ".xml" and "-an" suffixes (in that order) from fileName.

    Each suffix is matched literally and only at the end of the name, so
    other parts of the name that happen to resemble a suffix are kept.
    """
    for suffix in (GZEXTENSION,XMLEXTENSION,ANOSTRING):
        # re.escape: the dot in the suffix must not match an arbitrary character
        fileName = re.sub(re.escape(suffix)+"$","",fileName)
    return(fileName)

def tk(key):
    """Translate a one-character code to its count label; unknown codes raise KeyError."""
    labels = {"T":TOTAL,"U":UNKNOWN,"2":FINISHER,"1":DROPOUT}
    return(labels[key])

def countAssessments(fileNames):
    """Count the student's labels of the clients in fileNames.

    File names are shortened first; clients without a label in dropout are
    skipped. Returns the counts per label plus the total.
    """
    labelCounts = {FINISHER:0,DROPOUT:0,UNKNOWN:0,TOTAL:0}
    for longName in fileNames:
        shortName = shortenFileName(longName)
        if shortName in dropout:
            labelCounts[tk(dropout[shortName])] += 1
            labelCounts[TOTAL] += 1
    return(labelCounts)

# all labeled clients
countsAssessed = {FINISHER:0,DROPOUT:0,UNKNOWN:0,TOTAL:0}
for label in dropout.values():
    countsAssessed[tk(label)] += 1
    countsAssessed[TOTAL] += 1
# labeled clients that sent at least one mail
countsSentMail = countAssessments(
    [fileName for fileName in listClientSentEmail if listClientSentEmail[fileName] > 0])
# labeled clients with a start year
countsStarted = countAssessments(
    [fileName for fileName in startYears if startYears[fileName] > 0])
# labeled clients per tussenmeting and nameting questionnaire
countsTussenmeting = countAssessments(tussenmeting)
countsNameting = countAssessments(nameting)

In [None]:
pd.DataFrame({"assessed":countsAssessed,"sent mail":countsSentMail,"started":countsStarted,"tussenmeting":countsTussenmeting,"nameting":countsNameting}).T

What are the differences between our selection (791 clients) and the one of the student (1012)? Which clients that we selected were not chosen by the student? Which clients did the student choose but we did not?

In [None]:
def shortenFileName(inFileName):
    """Remove a trailing "(-an).xml(.gz)" suffix from inFileName, if present."""
    # dots are escaped so that only a literal ".xml"/".gz" suffix is removed
    return(re.sub(r"(-an)?\.xml(\.gz)?$","",inFileName))

def extendFileName(inFileName):
    """Return inFileName with the suffix ".xml.gz" appended."""
    # plain concatenation: the regex "$" also matches before a trailing
    # newline, which would insert the suffix twice
    return(inFileName+".xml.gz")

# Compare our selection (start year present and at least one client mail)
# with the selection of the student.
studentSelectionSet = set(studentSelection)  # constant-time membership tests

# clients that we selected but the student did not
countedExtra = 0
for fileName in startYears:
    if startYears[fileName] > 0 and listClientSentEmail.get(fileName,0) > 0 and \
       not shortenFileName(fileName) in studentSelectionSet:
        countedExtra += 1

# clients that the student selected but we did not
countedMissed = 0
for fileName in studentSelection:
    longFileName = extendFileName(fileName)
    if startYears.get(longFileName,0) == 0 or \
       listClientSentEmail.get(longFileName,0) == 0:
        countedMissed += 1
        if dropout[fileName] == "2": print("finisher",fileName)

countedExtra,countedMissed

We did not select any client that the student did not choose as well. However, the student did select 221 clients that, according to the metadata (field StartDate) never started with the treatment. We inspected a few of them and found that they either followed another treatment programme (short or self help) or did not start with the intensive programme. 216 clients were labeled as dropout and five as finisher. These five were also checked and found to be irrelevant for this study. 

## Data selection

Here we save our list of selected clients, based on the StartDate field in the metadata.

In [None]:
DROPOUT = "dropout"
FILE = "file"
UNKNOWN = "?"

# Write one row per client: the student's label for the selected clients
# (start year present and at least one client mail), UNKNOWN for the others.
# The with statement closes the file even when writing fails; newline="" is
# what the csv module asks for when opening a file for a writer.
with open("out.csv","w",newline="") as outFile:
    csvwriter = csv.DictWriter(outFile,[FILE,DROPOUT])
    csvwriter.writeheader()
    for fileName in startYears:
        shortFileName = shortenFileName(fileName)
        selected = startYears[fileName] > 0 and \
                   listClientSentEmail.get(fileName,0) > 0
        # selected clients without a label of the student also get UNKNOWN
        # instead of raising a KeyError
        label = dropout.get(shortFileName,UNKNOWN) if selected else UNKNOWN
        csvwriter.writerow({FILE:shortFileName,DROPOUT:label})

## Therapy completion assessment by student

The student regarded therapies as completed when the patient had received an email with the subject *Afsluiting*. Let's check which clients satisfy this criterion. First read all mails via Orange.

In [None]:
ANODIRECTORY = "/home/erikt/projects/e-mental-health/usb/releases/20191217/"

def convertFileName(fileName):
    """Return fileName with every match of the pattern ANOSTRING removed."""
    anoPattern = re.compile(ANOSTRING)
    return(anoPattern.sub("",fileName))

# Load the mails of every patient id, sort them, mark duplicated text and
# strip the marked text; store the result under the non-anonymised file name.
allMails = {}
for patientId in range(1,1988):
    fileName = tactusloaderLIB.makeFileName(str(patientId))
    try:
        mails = tactusloaderLIB.processFile(ANODIRECTORY,fileName+".gz")
        if len(mails[0]) > 0:
            sortedMails = OWEmailSorterLIB.filterEmails(mails[0],filter_asc=True)
            markedMails = markduplicatesLIB.processCorpus(sortedMails)
            strippedMails = removemarkedtextLIB.processCorpus(markedMails)
            allMails[convertFileName(fileName)+GZEXTENSION] = strippedMails
    except Exception as error:
        # Exception instead of a bare except: a keyboard interrupt still stops
        # the loop, and the cause of the problem is shown
        print("problem processing file",fileName,repr(error))

In [None]:
len(allMails) == 1982

In [None]:
def getXmlText(inFileName,query):
    """Return the text of every element matching query in a gzipped XML file.

    query is an ElementTree path evaluated from the root element. The list
    holds None for matching elements without text and is empty when nothing
    matches.
    """
    # the with statement closes the file even when reading fails
    with gzip.open(inFileName,"rt",encoding="utf-8") as inFile:
        root = ET.fromstring(inFile.read())
    return([xmlDataItem.text for xmlDataItem in root.findall(query)])

In [None]:
MAILSUBJECT = "./Messages/Message/Subject"
MAILBODY = "./Messages/Message/Body"
MAILWORDPOS = "afsluit"
MAILWORDNEG = "niet actief"
STOP = "stop"
MINMAILNBR = 16
ANOSTRING = "-an" 
SUBJECTID = 2
    
def getCompletions(directory):
    """Check the mail subjects of every client for the two subject keywords.

    Only files in directory that match FILENAMEPREFIX and have mails in
    allMails are examined. Returns four dictionaries keyed by file name:
    whether a subject contains MAILWORDPOS, whether a subject contains
    MAILWORDNEG, and for each keyword the 1-based position of the last
    mail whose subject contains it.
    """
    afsluitings = {}
    nietactiefs = {}
    afsluitingIds = {}
    nietactiefIds = {}
    for inFileName in sorted(os.listdir(directory)):
        if not re.search("^"+FILENAMEPREFIX,inFileName) or \
           not inFileName in allMails:
            continue
        clientMails = allMails[inFileName]
        afsluitings[inFileName] = False
        nietactiefs[inFileName] = False
        for mailId in range(0,len(clientMails)):
            subject = clientMails[mailId].metas[SUBJECTID].lower()
            if re.search(MAILWORDPOS,subject):
                afsluitings[inFileName] = True
                afsluitingIds[inFileName] = mailId+1
            if re.search(MAILWORDNEG,subject):
                nietactiefs[inFileName] = True
                nietactiefIds[inFileName] = mailId+1
    return(afsluitings,nietactiefs,afsluitingIds,nietactiefIds)

afsluitings,nietactiefs,afsluitingIds,nietactiefIds = getCompletions(DIRECTORY)
len([inFileName for inFileName in afsluitings if afsluitings[inFileName]])

In [None]:
# One bar chart per subject keyword: how often the keyword was found in the
# n-th mail of a client
for plotTitle,mailIds in (("mail subject = afsluiting: count per n-th mail",afsluitingIds),
                          ("mail subject = niet actief: count per n-th mail",nietactiefIds)):
    (values,counts) = np.unique(list(mailIds.values()),return_counts=True)
    fig,ax = plt.subplots(figsize=(15,4))
    plt.title(plotTitle)
    myplot = plt.bar(list(values),list(counts))

In [None]:
# Show the clients whose "niet actief" mail comes after more than 100 mails
for fileName,mailNbr in nietactiefIds.items():
    if mailNbr > 100:
        print(mailNbr,fileName)

In [None]:
TREATMENTPHRASE = "een geschikte kandidaat om te"
TEXTID = 3

# Mark for every client file whether one of its mails contains TREATMENTPHRASE.
longTreatment = {}
for inFileName in sorted(os.listdir(DIRECTORY)):
    if re.search("^"+FILENAMEPREFIX,inFileName):
        # clients without an entry in allMails are marked False instead of
        # raising a KeyError
        clientMails = allMails.get(inFileName,[])
        # any() stops at the first mail that contains the phrase
        longTreatment[inFileName] = any(
            re.search(TREATMENTPHRASE,clientMails[i].metas[TEXTID].lower())
            for i in range(0,len(clientMails)))

In [None]:
# Count for the clients labeled "2" how many have the long treatment mark
countsLongTreatment = {str(True):0,str(False):0,TOTAL:0}
for fileName,label in dropout.items():
    if label != "2":
        continue
    countsLongTreatment[str(longTreatment[fileName+".xml.gz"])] += 1
    countsLongTreatment[TOTAL] += 1
countsLongTreatment

## Save analysis to file data-processing/out.csv

In [None]:
def shortenFileName(inFileName):
    """Remove a trailing "(-an).xml(.gz)" suffix from inFileName, if present."""
    # dots are escaped so that only a literal ".xml"/".gz" suffix is removed
    return(re.sub(r"(-an)?\.xml(\.gz)?$","",inFileName))

selectionChoices = {}

# sets make the membership tests in the loop below constant-time
studentSelectionSet = set(studentSelection)
intakeFormsSet = set(intakeForms)
shortProgrammeSet = set(listShortProgramme)
selfHelpSet = set(listSelfHelp)
tussenmetingSet = set(tussenmeting)
nametingSet = set(nameting)

outFieldNames = ["file","student","selected","status","started",\
            "ended","afsluiting","nietactief","intake","clientMail","shortProgramme",\
            "selfHelp","tussenmeting","nameting","dropout","variant"]

# Write one row with all collected information per client file and count the
# combinations of afsluiting, started and dropout label in selectionChoices.
# The with statement closes the file even when writing fails; newline="" is
# what the csv module asks for when opening a file for a writer.
with open("out.csv","w",newline="") as outFile:
    csvwriter = csv.DictWriter(outFile,outFieldNames)
    csvwriter.writeheader()

    for inFileName in sorted(os.listdir(DIRECTORY)):
        if not re.search("^"+FILENAMEPREFIX,inFileName):
            continue
        shortFileName = shortenFileName(inFileName)
        row = {
            "file":shortFileName,
            "student":int(shortFileName in studentSelectionSet),
            # selected: the client sent mail and is neither in the short
            # programme list nor in the self help list
            "selected":int(inFileName in listClientSentEmail and
                           not inFileName in shortProgrammeSet and
                           not inFileName in selfHelpSet),
            # the status is stored as read, including "NotStarted"
            "status":statuses[inFileName],
            "started":int(startYears.get(inFileName,0) > 0),
            "ended":int(endYears.get(inFileName,0) > 0),
            "afsluiting":int(bool(afsluitings.get(inFileName,False))),
            "nietactief":int(bool(nietactiefs.get(inFileName,False))),
            "intake":int(shortFileName in intakeFormsSet),
            "clientMail":listClientSentEmail.get(inFileName,0),
            "shortProgramme":int(inFileName in shortProgrammeSet),
            "selfHelp":int(inFileName in selfHelpSet),
            "tussenmeting":int(inFileName in tussenmetingSet),
            "nameting":int(inFileName in nametingSet),
            "dropout":dropout.get(shortFileName,"?"),
            "variant":listVariants[inFileName],
        }
        csvwriter.writerow(row)
        selectionChoicesKey = \
            str(row["afsluiting"])+str(row["started"])+str(row["dropout"])
        selectionChoices[selectionChoicesKey] = \
            selectionChoices.get(selectionChoicesKey,0)+1
        # row["clientMail"] is 0 for clients without mail, so this print
        # cannot raise a KeyError
        if selectionChoicesKey == "111":
            print(selectionChoicesKey,inFileName,row["clientMail"])

In [None]:
# Show the count per selection key, followed by three checks on the clients
# with a start year
for key in sorted(selectionChoices):
    print(key,selectionChoices[key])
startersWithoutMail = [fileName for fileName,year in startYears.items()
                       if year > 0 and not fileName in listClientSentEmail]
print("number of starting clients not sending an email:",\
      len(startersWithoutMail))
startedShortProgramme = [fileName for fileName in listShortProgramme
                         if startYears[fileName] > 0]
print("suspected short programme clients that started therapy:",\
      len(startedShortProgramme))
startedSelfHelp = [fileName for fileName in listSelfHelp
                   if startYears[fileName] > 0]
print("suspected self assessment clients that started therapy:",\
      len(startedSelfHelp))

The student selected 769 (421+348) clients that started the treatment, that is clients for which a date was present in the metadata field *StartDate*. The student also selected 219 (214+5) clients that, according to the metadata, did not start the treatment. In order to assess the quality of the metadata, we inspected the client in this group that had sent the most emails (83) and indeed found that these mails were not treatment-related. It seems safe to use the value of the metadata field *StartDate* as an indication of the client starting the intensive treatment.

The student ignored 995 (841+154) clients. 841 had not started treatment but 154 had started the treatment. However, of those 154, no emails were available for 132 clients. For 21 of the other clients, the student did not have access to an intake form. The reason of exclusion for the remaining single client (926) is unclear.

**SUMMARY:** Since 923 (348+421+154) clients started the treatment (in the intensive programme) and 132 of those did not respond to the counselor, we have **791 (923-132) treatments available for analysis**.

Of those 791 treatments, the student assessed 348 as completed and 421 as early dropout. This means that **22 treatments are unassessed with respect to dropout**. The student has not assessed these 22 treatments. Except for one case (926), this choice was made because at the time there were no intake forms available for these clients.

## Therapy success assessment

In [None]:
COMPLETED = "C"
UNKNOWN = "U"

combinations = {}

# Build a four-part key per client: nameting present (1/0), start year
# present (1/0), afsluiting mail found (C/-) and the label in dropout.
for fileName in afsluitings:
    shortName = shortenFileName(fileName)
    # clients without a label get UNKNOWN; this is stored in dropout itself
    dropout.setdefault(shortName,UNKNOWN)
    key = "".join([
        "1" if fileName in nameting else "0",
        "1" if fileName in startYears and startYears[fileName] > 0 else "0",
        COMPLETED if afsluitings[fileName] else "-",
        dropout[shortName]])
    combinations[key] = combinations.get(key,0)+1
    # show the clients with an afsluiting mail that have label 1
    if re.search("C1$",key):
        print(key,fileName)
    
combinations

In the keys, *1* and *2* at the final position indicate the dropout status of clients as assessed by the student, where *1* stands for dropout and *2* stands for therapy completion. *U* indicates that the student has not assessed the client. *C* at the penultimate position in a key means that we assessed the client as a completer. A hyphen (*-*) represents all other cases (dropout and unassessed).

Category *1C2* should contain 348 entries but lacked six (count: 342). One (982) was caused by the keyword *afsluiting* being misspelled, which we corrected in the data file. The other five did not include an email with the subject *afsluiting* (category: 1-2; clients: 15, 203, 474, 685 and 1932). However, two clients had filled in the nameting form (474 and 685).

~~We defined the completion criterion as: **the client has started (StartDate is filled) and has received an email with the string *afsluit* in the Subject or has filled in a nameting questionnaire**.~~

In [None]:
# Step by step: all clients -> clients with a start year -> of those, the
# clients that sent mail -> split on whether an afsluiting mail was found
initialClientCount = sum(nbrOfFiles)
print("Initial number of clients:",initialClientCount)

starters = [fileName for fileName,year in startYears.items() if year > 0]
nbrOfStarters = len(starters)
print("Number of starting clients:",nbrOfStarters,\
      "(excluded:",initialClientCount-nbrOfStarters,")")

mailers = [fileName for fileName in starters if fileName in listClientSentEmail]
nbrOfMailers = len(mailers)
print("Number of mailing clients :",nbrOfMailers,\
      "(excluded:",nbrOfStarters-nbrOfMailers,")")

nbrOfCompleters = sum(1 for fileName in mailers if afsluitings[fileName])
nbrOfDropouts = nbrOfMailers-nbrOfCompleters
print("      Number of completers:",nbrOfCompleters)
print("        Number of dropouts:",nbrOfDropouts)

## Manual labeling

AdB0445 1 

AdB0461 1

AdB0509 1

AdB0709 1

AdB0853 2

AdB0873 1

AdB0926 2

AdB0942 2

AdB0950 -

AdB1102 2

AdB1343 2

AdB1398 1

AdB1419 1

AdB1491 1 (final mails missing?)

AdB1532 2

AdB1536 1

AdB1612 2

AdB1620 1

AdB1645 2

AdB1744 1

AdB1771 2

AdB1849 1

AdB1871 1

## Not used for now...

Of the intermediate questionnaire, question 8 (8-intefft) is most interesting: *Do you consider this Internet therapy to be an effective method for changing alcohol-related habits?* (*Vind je internetbehandeling een effectieve methode om je drinkgewoonte te veranderen?*) A positive answer signals a positive effect of the therapy on the client. Let's see how they responded to this question.

In [None]:
VARIANT = "1-geslacht"
QUESTION8 = "8-intefft"
QUESTION9 = "9-inteff1"

# Count the answers to the effectiveness question in the tussenmeting forms;
# the answer is read from QUESTION8 or, when that is absent, from QUESTION9
responses = {}
variantDirectory = DIRECTORY+"/"+VARIANT
for inFileName in os.listdir(variantDirectory):
    if not re.search("^"+FILENAMEPREFIX,inFileName):
        continue
    root = tactus2table.readRootFromFile(variantDirectory+"/"+inFileName)
    questionnaires = tactus2table.getQuestionnaires(root,inFileName)
    for questionnaire in questionnaires:
        if questionnaire[TITLE] != TUSSENMETING:
            continue
        if QUESTION8 in questionnaire:
            question = QUESTION8
        elif QUESTION9 in questionnaire:
            question = QUESTION9
        else:
            print("no inteff question in",inFileName)
            continue
        response = questionnaire[question]
        responses[response] = responses.get(response,0)+1

responses

We expected an overwhelmingly positive response (because the people for whom the therapy did not work could have stopped). The answers are indeed mostly positive but, interestingly, 150 of the 470 clients responded with *I don't know* (*ik weet het niet*). Four clients filled in a different form where the question had number 9 instead of 8.

In [None]:
inFileName = "AdB0010.xml.gz"
root = tactus2table.readRootFromFile(DIRECTORY+"/"+VARIANT+"/"+inFileName)
questionnaires = tactus2table.getQuestionnaires(root,inFileName)
questionnaires

In [None]:
# NOTE(review): FILE is set to "file" in the data selection cell, so this
# reads DIRECTORY+"/file" — confirm the intended CSV path
data = pd.read_csv(DIRECTORY+"/"+FILE)

In [None]:
FIELDNAME = "14-drugs"
# Remove the phrase "leeftijd in jaren" and the spaces around it from every value
# NOTE(review): the result is called ages but FIELDNAME is "14-drugs" —
# confirm the intended column
ages = [re.sub(" *leeftijd in jaren *","",data[FIELDNAME][rowId])
        for rowId in range(0,len(data))]
agesPD = pd.Series(ages)

In [None]:
data["30-week2"].value_counts().sort_index().plot(kind="bar",figsize=(14,6))


In [None]:
# end form: 18-weekn 19-weekn

In [None]:
[x for x in data]


In [None]:
data["30-week2"]