# T-score

This notebook contains a pipeline for comparing the vocabulary of two sets of Tactus emails with each other using the t-score. The goal is to find tokens which appear more frequently in one set than in the other, and vice versa. This notebook reuses much of the preprocessing of the notebook liwc.py in this directory.

The first code block specifies the required libraries. This includes some general Python libraries and some specific libraries developed in our research project. These project-specific libraries can be found in the folder orangehackathon/libs.

In [None]:
import csv
import re
import sys
import time

sys.path.append("../libs/")
import tactusloaderLIB
import OWEmailSorterLIB
import markduplicatesLIB
import removemarkedtextLIB
import LIWCLIB

The next code block specifies the location of the therapy files.

In [None]:
# directory passed to tactusloaderLIB.processFile when loading the gzipped therapy files
DIRECTORY = "/home/erikt/projects/e-mental-health/usb/releases/20191217"

One Python function was developed for storing the results of the data analysis (saveResults). In Orange3 the module SaveData can be used for this task. (saveResults might not be necessary for this notebook tscore.ipynb.)

In [None]:
# name of the csv file written by saveResults when no file name is given
DEFAULTOUTFILE = "out.csv"

# names of the data fields read from (and written for) each mail
FIELDNAMEDATE = "date"
FIELDNAMEFROM = "from"
FIELDNAMEFILE = "file"
FIELDNAMENBROFMAILS = "nbr of mails"
NBROFMATCHES = "Number of matches"

# possible senders of a mail; only mails sent by FROMTARGET are processed
CLIENT = "CLIENT"
COUNSELOR = "COUNSELOR"
FROMTARGET = CLIENT

# data selection settings
# PROCESSALLFEATURES: write all columns of the data instead of STUDENTFEATURENAMES
# AVERAGEROWS: write one averaged row per file instead of one row per mail
# NBROFKEPTROWS: use only the first N mails of FROMTARGET per file (0: no limit)
# MINNBROFMATCHES: skip mails with fewer matches than this number (0: no minimum)
PROCESSALLFEATURES = True
AVERAGEROWS = False
NBROFKEPTROWS = 4
MINNBROFMATCHES = 50
STUDENTFEATURENAMES = [
    FIELDNAMEFILE,
    FIELDNAMEFROM,
    FIELDNAMENBROFMAILS,
    "4 i",
    "7 shehe",
    "8 they",
    "31 posemo",
    "32 negemo",
    "50 cogproc",
    "51 insight",
    "52 cause",
    "54 tentat",
    "90 focuspast",
    "91 focuspresent",
    "92 focusfuture",
]

def addZero(string):
    """Return string, prefixed with "0" characters until it counts at least two characters."""
    nbrOfMissingChars = 2-len(string)
    if nbrOfMissingChars <= 0:
        return(string)
    return("0"*nbrOfMissingChars+string)

def time2str(timeObj):
    """Format a time structure as "YYYY-MM-DD hh:mm:ss".

    timeObj: object with the integer attributes tm_year, tm_mon, tm_mday,
        tm_hour, tm_min and tm_sec, for example the result of time.localtime().
    Returns the formatted string; every part except the year is zero-padded
    to two digits.
    """
    # local names avoid hiding the time module, which the rest of the file uses
    datePart = "{0}-{1:02d}-{2:02d}".format(timeObj.tm_year,timeObj.tm_mon,timeObj.tm_mday)
    timePart = "{0:02d}:{1:02d}:{2:02d}".format(timeObj.tm_hour,timeObj.tm_min,timeObj.tm_sec)
    return(datePart+" "+timePart)

def floatPrecision5(number):
    """Round a float to five decimals; return any other value unchanged.

    number: value of any type.
    Returns a float rounded to five decimals when number is a float (or an
    instance of a float subclass), otherwise number itself.
    """
    # isinstance also accepts subclasses of float, which an exact type
    # comparison would leave unrounded; int and bool are not floats
    if not isinstance(number,float): return(number)
    return(float("{0:.5f}".format(number)))

def saveResults(allLiwcResults,fileName=DEFAULTOUTFILE):
    """Write the data of the mails sent by FROMTARGET to a csv file.

    allLiwcResults: sequence of tables, one per therapy file. Each table is
        iterated row by row and rows are indexed by field name; with
        PROCESSALLFEATURES set, the first table must offer domain.variables
        and domain.metas (presumably Orange3 tables - confirm with the caller).
    fileName: name of the csv file; an existing file is overwritten.

    Nothing is written when allLiwcResults is empty. With AVERAGEROWS set,
    one row per table is written in which the numeric fields hold the average
    over the selected mails; otherwise one row per selected mail is written.
    A mail is selected when it is among the first NBROFKEPTROWS mails of
    FROMTARGET (0: no limit) and has at least MINNBROFMATCHES matches
    (0: no minimum). The file field of every visited row is changed in place:
    the text matching "-an.xml.gz" is removed.
    """
    if len(allLiwcResults) == 0: return
    # field names starting with a number and white space ("31 posemo") hold numeric scores
    numericFieldName = re.compile(r"^\d+\s")
    fieldNames = STUDENTFEATURENAMES
    if PROCESSALLFEATURES:
        fieldNames = [x.name for x in allLiwcResults[0].domain.variables]
        fieldNames += [x.name for x in allLiwcResults[0].domain.metas]
        fieldNames += [FIELDNAMENBROFMAILS]
    # the csv module expects files opened with newline=""; the with statement closes the file
    with open(fileName,"w",newline="") as csvFile:
        csvwriter = csv.DictWriter(csvFile,fieldnames=fieldNames)
        csvwriter.writeheader()
        for liwcResults in allLiwcResults:
            if AVERAGEROWS:
                rowCounter = 0
                # number of mails that were actually added to the sums in row
                nbrOfSummedRows = 0
                row = {}
                for liwcResultsRow in liwcResults:
                    liwcResultsRow[FIELDNAMEFILE] = re.sub("-an.xml.gz","",str(liwcResultsRow[FIELDNAMEFILE]))
                    if liwcResultsRow[FIELDNAMEFROM] == FROMTARGET:
                        rowCounter += 1
                        nbrOfMatches = 0
                        if NBROFMATCHES in liwcResultsRow: nbrOfMatches = int(liwcResultsRow[NBROFMATCHES])
                        if (NBROFKEPTROWS == 0 or rowCounter <= NBROFKEPTROWS) and \
                           (MINNBROFMATCHES == 0 or nbrOfMatches >= MINNBROFMATCHES):
                            nbrOfSummedRows += 1
                            for fieldName in fieldNames:
                                if fieldName == FIELDNAMEDATE:
                                    row[fieldName] = time2str(time.localtime(liwcResultsRow[fieldName].value))
                                elif not numericFieldName.match(fieldName):
                                    # best effort: fields missing from the data (like the
                                    # added mail count column) are left out of the row
                                    try: row[fieldName] = liwcResultsRow[fieldName].value
                                    except Exception: pass
                                elif fieldName in row:
                                    row[fieldName] += floatPrecision5(liwcResultsRow[fieldName].value)
                                else:
                                    row[fieldName] = floatPrecision5(liwcResultsRow[fieldName].value)
                if len(row) > 0:
                    # divide by the number of mails in the sum: this stays correct
                    # when NBROFKEPTROWS is 0 or when mails were skipped for
                    # having too few matches
                    for fieldName in row:
                        if numericFieldName.match(fieldName) and nbrOfSummedRows > 0:
                            row[fieldName] = floatPrecision5(row[fieldName]/nbrOfSummedRows)
                    row[FIELDNAMENBROFMAILS] = rowCounter
                    csvwriter.writerow(row)
            else:
                rowCounter = 0
                for liwcResultsRow in liwcResults:
                    liwcResultsRow[FIELDNAMEFILE] = re.sub("-an.xml.gz","",str(liwcResultsRow[FIELDNAMEFILE]))
                    if liwcResultsRow[FIELDNAMEFROM] == FROMTARGET:
                        rowCounter += 1
                        nbrOfMatches = liwcResultsRow[NBROFMATCHES]
                        if (NBROFKEPTROWS == 0 or rowCounter <= NBROFKEPTROWS) and \
                           (MINNBROFMATCHES == 0 or nbrOfMatches >= MINNBROFMATCHES):
                            # start from an empty row for every mail so that a failed
                            # lookup cannot inherit the value of the previous mail
                            row = {}
                            for fieldName in fieldNames:
                                if fieldName == FIELDNAMEDATE:
                                    row[fieldName] = time2str(time.localtime(liwcResultsRow[fieldName].value))
                                elif not numericFieldName.match(fieldName):
                                    # best effort: fields missing from the data are left empty
                                    try: row[fieldName] = liwcResultsRow[fieldName].value
                                    except Exception: pass
                                else:
                                    row[fieldName] = floatPrecision5(liwcResultsRow[fieldName].value)
                            if len(row) > 0: csvwriter.writerow(row)

We will compare the texts in emails from clients that finished the treatment versus clients that dropped out. Thus we need the metadata which specifies the results of the therapy for each client.

In [None]:
import gzip

# location and layout of the gzipped csv file with the therapy result per client
DIRDROPOUT = "/home/erikt/projects/e-mental-health/usb/releases/20200218"
FILEDROPOUT = "dropoutAUKE.csv.gz"
DELIMITER = ","
FIELDNAMEDROPOUT = "dropout"
FIELDNAMETEXT = "text"
FIELDNAMECLIENTID = "clientID"

# dropout maps each client id to its dropout code, kept as the string found in the file
dropout = {}
# the with statement closes the file, also when reading or parsing fails
with gzip.open(DIRDROPOUT+"/"+FILEDROPOUT,"rt",encoding="utf-8") as inFile:
    csvreader = csv.DictReader(inFile,delimiter=DELIMITER)
    for row in csvreader: dropout[row[FIELDNAMECLIENTID]] = row[FIELDNAMEDROPOUT]

Finally there is a loop which loads each available therapy file and runs the Orange3 pipeline on it. The Orange3 pipeline contains these parts:

1. tactusloader: determine file name and read its contents
2. sortMails: sort the mails from the file chronologically
3. markduplicates: mark the parts of the mail text included from an earlier mail
4. removemarkedtext: remove the marked text from the mail


In [None]:
# maximum number of client mails used from each therapy file
MAXMAILS = 4

allLiwcResults = []
# mailTexts[n] collects the mail text of all clients with dropout code n;
# only the codes 1 and 2 are selected below, so mailTexts[0] stays empty
mailTexts = ["","",""]
for patientId in range(1,1988):
    fileName = tactusloaderLIB.makeFileName(str(patientId))
    fileNameId = re.sub("-an.xml$","",fileName)
    if fileNameId in dropout and dropout[fileNameId] in ("1","2"):
        mailText = ""
        try:
            mails = tactusloaderLIB.processFile(DIRECTORY,fileName+".gz")
            if len(mails) > 0:
                sortedMails = OWEmailSorterLIB.filterEmails(mails[0],filter_asc=True)
                markedMails = markduplicatesLIB.processCorpus(sortedMails)
                strippedMails = removemarkedtextLIB.processCorpus(markedMails)
                mailCounter = 0
                for strippedMail in strippedMails:
                    if strippedMail[FIELDNAMEFROM] == CLIENT and mailCounter < MAXMAILS:
                        # the space keeps the last token of a mail apart from the
                        # first token of the next one when the text is split later
                        mailText += " "+str(strippedMail[FIELDNAMETEXT])
                        mailCounter += 1
        # Exception instead of a bare except: interrupting the notebook
        # (KeyboardInterrupt) must stop the loop rather than skip a file
        except Exception:
            print("problem processing file",fileName)
            continue
        # mailText starts with a space, so the texts of two clients never run together
        mailTexts[int(dropout[fileNameId])] += mailText

In [None]:
# show the number of characters collected for each dropout code
for collectedText in mailTexts: print(len(collectedText))

Convert the text to the data format of the t-score script: /home/erikt/projects/newsgac/fasttext-runs/tscore.py

In [None]:
# keys of the data structure built by makeTscoreData
NBROFTOKENS = "totalFreq"
NBROFTYPES = "nbrOfWords"
WORDFREQS = "wordFreqs"

def makeTscoreData(text):
    """Count the white space separated tokens of text.

    Returns a dictionary with the total number of tokens (NBROFTOKENS), the
    number of different tokens (NBROFTYPES) and the frequency of each
    different token (WORDFREQS).
    """
    tokens = text.split()
    wordFreqs = {}
    for token in tokens:
        wordFreqs[token] = wordFreqs.get(token,0)+1
    return({ NBROFTOKENS:len(tokens), NBROFTYPES:len(wordFreqs), WORDFREQS:wordFreqs })

# token counts of the two client groups: mailTexts is indexed by dropout code,
# so tscoreData1 covers the clients with code 1 and tscoreData2 those with code 2
tscoreData1 = makeTscoreData(mailTexts[1])
tscoreData2 = makeTscoreData(mailTexts[2])

In [None]:
sys.path.append("/home/erikt/projects/newsgac/fasttext-runs")
import tscore
import operator

# compute the scores before opening the file, so that a failure here does not
# leave a truncated out.csv behind
tscores = tscore.computeTscore(tscoreData1,tscoreData2)
# the csv module expects files opened with newline=""; the with statement
# closes the file, also when writing fails
with open("out.csv","w",newline="") as outFile:
    csvwriter = csv.DictWriter(outFile,["token","tscore","freqDropouts","freqFinishers"])
    csvwriter.writeheader()
    # one line per token, ordered from the lowest to the highest t-score; the
    # loop names must not hide the tscore module or the tuple builtin
    for token,tokenScore in sorted(tscores.items(), key=operator.itemgetter(1)):
        # tokens that are missing from one of the two texts get frequency 0
        frequency1 = tscoreData1[WORDFREQS].get(token,0)
        frequency2 = tscoreData2[WORDFREQS].get(token,0)
        csvwriter.writerow({"token":token,"tscore":tokenScore,"freqDropouts":frequency1,"freqFinishers":frequency2})