# Fasttext experiments with tactus data

First import required modules: general Python modules and modules from Orange3

In [None]:
import csv
import fasttext
import gzip
import numpy as np
import os
import pandas as pd
import re
import sys

sys.path.append("/home/erikt/projects/e-mental-health/enron/orange-hackathon/orangehackathon/libs")
import tactusloaderLIB
import OWEmailSorterLIB
import markduplicatesLIB
import removemarkedtextLIB
import LIWCLIB

Next, read the mails from the data set via Orange 3 and store them in the data structure allMails. This takes several minutes so we do not want to do this often.

In [None]:
# Directory that holds the Tactus data files (presumably the anonymized release)
ANODIRECTORY = "/home/erikt/projects/e-mental-health/usb/releases/20191217/"
# File name parts that shortenFileName() removes to obtain a client id
ANOSTRING = "-an"
GZEXTENSION = ".gz"
XMLEXTENSION = ".xml"

# From here on relative file names are resolved against ANODIRECTORY
os.chdir(ANODIRECTORY)

def shortenFileName(fileName):
    """Return fileName without the parts GZEXTENSION, XMLEXTENSION and ANOSTRING.

    The three parts are removed as literal text, in this order, wherever they
    occur in the name. They are deliberately not used as regular expressions:
    the leading "." of ".gz" and ".xml" would then match any character, so that
    for example "igz" in "bigzip" would be removed as well.
    """
    for namePart in (GZEXTENSION,XMLEXTENSION,ANOSTRING):
        fileName = fileName.replace(namePart,"")
    return(fileName)

In [None]:
LASTFILENBR = 1987

def getAllTactusMails():
    """Read the mails of all patients and return them as a dictionary.

    For every patient number from 1 up to and including LASTFILENBR the file
    name is built with tactusloaderLIB.makeFileName() plus GZEXTENSION. Files
    that exist in ANODIRECTORY are processed; when the first element of the
    result is not empty it is passed through OWEmailSorterLIB.filterEmails(),
    markduplicatesLIB.processCorpus() and removemarkedtextLIB.processCorpus(),
    and the outcome is stored under the shortened file name.

    Progress is printed for every 100th patient number and the names of the
    missing files are printed at the end.
    """
    mailsPerClient = {}
    absentFiles = []
    for patientNbr in range(1,LASTFILENBR+1):
        if patientNbr % 100 == 0:
            print(patientNbr,end=" ")
        fileName = tactusloaderLIB.makeFileName(str(patientNbr))+GZEXTENSION
        if not os.path.isfile(ANODIRECTORY+fileName):
            absentFiles.append(fileName)
            continue
        mails = tactusloaderLIB.processFile(ANODIRECTORY,fileName)
        if len(mails[0]) == 0:
            continue
        # processing chain: sort, mark duplicate text, remove the marked text
        sortedMails = OWEmailSorterLIB.filterEmails(mails[0],filter_asc=True)
        markedMails = markduplicatesLIB.processCorpus(sortedMails)
        mailsPerClient[shortenFileName(fileName)] = removemarkedtextLIB.processCorpus(markedMails)
    if absentFiles:
        print("\nmissing files:",absentFiles)
    return(mailsPerClient)

# Load all mails once: according to the note above this takes several minutes
allMails = getAllTactusMails()

We tried to store the allMails data structure so that it did not need to be computed every time this notebook is started. But there is not enough space available on our encrypted usb stick (required: 100+Mb). Furthermore, Orange3's from_file routine (from orangecontrib.text.corpus.Corpus) does not work, which makes reading back the data complicated. 

In [None]:
STORAGEDIR = "/home/erikt/projects/e-mental-health/usb/tactus/"
CSVEXTENSION = ".csv"

# Storage experiment: save the corpus of the first client only (note the break)
for clientId in allMails:
    fileName = STORAGEDIR+clientId+CSVEXTENSION
    allMails[clientId].save(fileName)
    break

# Read-back experiment: load the first stored csv file and split it in
# variables and metas (note the break)
for fileName in os.listdir(STORAGEDIR):
    # compare the extension literally and at the end of the name: a regular
    # expression search for ".csv" also accepts names like "x.csv.bak" and
    # then builds the name of a file that does not exist
    if fileName.endswith(CSVEXTENSION):
        clientId = fileName[:-len(CSVEXTENSION)]
        df = pd.read_csv(STORAGEDIR+fileName)
        # row 0 is used as domain description and the data starts at row 2;
        # presumably rows 0 and 1 are the extra header lines written by save()
        variables = df.loc[2:,["date","from","to"]].to_numpy()
        # NOTE(review): the slice "date":"from" may not cover the column "to"
        # selected above - confirm against the column order of the csv file
        variablesDomain = df.loc[0,"date":"from"]
        metas = df.loc[2:,"file":].to_numpy()
        metasDomain = df.loc[0,"file":]
        break

We also need labels for the data. We will use the dropout labels provided by a student's project

In [None]:
SELECTEDFILE = "/home/erikt/projects/e-mental-health/usb/releases/20200305/selected.csv.gz"
DROPOUT = "dropout"
FILE = "file"
NBROFCLIENTS = 791

# dropouts maps a client id to its dropout label; only the labels "1" and "2"
# are kept (the notes further down refer to them as 1/dropout and 2/finisher)
dropouts = {}
# the with statement closes the file, also when reading a row fails
with gzip.open(SELECTEDFILE,"rt",encoding="utf-8") as inFile:
    csvreader = csv.DictReader(inFile)
    for row in csvreader:
        # strip "-an", ".xml" and ".gz" from the end of the file name; the dots
        # are escaped so that they only match a literal "."
        row[FILE] = re.sub(r"(-an)?\.xml(\.gz)?$","",row[FILE])
        if row[DROPOUT] in ("1","2"):
            dropouts[row[FILE]] = row[DROPOUT]

# sanity check shown as cell output: did we read the expected number of clients?
len(dropouts) == NBROFCLIENTS

Fasttext operates on files so we should store our data in a file to enable fasttext to access it.

In [None]:
import random

# Values of the "from" field that the selection functions compare against
CLIENT = "CLIENT"
COUNSELOR = "COUNSELOR"
FROMFIELD = "from"
SPACE = " "
# Prefix put in front of the label at the start of every line of a fastText data file
LABELPREFIX = "__label__"
OUTFILENAME = "fasttext.txt"
# Number of client mails per client that is passed to the mail selection functions
NBROFCLIENTMAILS = 4

def getFieldId(corpus,fieldName):
    """Return the position of the meta attribute called fieldName.

    The attributes in corpus.domain.metas are compared by their string
    representation. When several attributes match, the position of the last
    one is returned; when none matches the result is -1.
    """
    positions = [position for position,meta in enumerate(corpus.domain.metas)
                 if str(meta) == fieldName]
    if not positions:
        return(-1)
    return(positions[-1])

def getLastCounselorMailId(allMails,clientId):
    """Return the negative index of the last mail in allMails[clientId] sent by the counselor.

    The mails are inspected from the last one (index -1) back to the first
    one. The program is stopped with sys.exit() when no mail has the value
    COUNSELOR in its "from" field.
    """
    nbrOfMails = len(allMails[clientId])
    for stepsFromEnd in range(1,nbrOfMails+1):
        if allMails[clientId][-stepsFromEnd][FROMFIELD] == COUNSELOR:
            return(-stepsFromEnd)
    sys.exit("getLastCounselortMailId: client "+clientId+" did not receive any emails!")

def selectLastCounselorMail(allMails):
    """Select the last counselor mail of every client that has a dropout label.

    Returns a tuple (texts,labels): texts holds subject and text of the mail,
    joined by a space, and labels holds the matching values from dropouts.
    Clients without an entry in dropouts are skipped. The positions of the
    metas "subject" and "text" are looked up in the corpus of the first client
    and reused for all clients.
    """
    texts = []
    labels = []
    if len(allMails) == 0:
        return(texts,labels)
    referenceCorpus = allMails[next(iter(allMails))]
    subjectId = getFieldId(referenceCorpus,"subject")
    textId = getFieldId(referenceCorpus,"text")
    for clientId in allMails:
        if clientId not in dropouts:
            continue
        lastMail = allMails[clientId][getLastCounselorMailId(allMails,clientId)]
        subject = lastMail.metas[subjectId]
        mailText = lastMail.metas[textId]
        labels.append(dropouts[clientId])
        texts.append(subject+" "+mailText)
    return(texts,labels)

def getNextClientMailId(allMails,clientId,startClientMailId):
    """Return the index of the first client mail after position startClientMailId.

    The search runs over allMails[clientId] and looks for the value CLIENT in
    the "from" field. The result is -1 when no further client mail exists.
    """
    candidateId = startClientMailId+1
    nbrOfMails = len(allMails[clientId])
    while candidateId < nbrOfMails:
        if allMails[clientId][candidateId][FROMFIELD] == CLIENT:
            return(candidateId)
        candidateId += 1
    return(-1)

def selectFirstClientMails(allMails,nbrOfMails):
    """Select the first nbrOfMails client mails of every client that has a dropout label.

    Returns a tuple (texts,labels). For each client the subject and the text
    of the selected mails are concatenated into one string, each followed by a
    space; labels holds the matching values from dropouts. Clients without an
    entry in dropouts and clients for which the string stays empty are
    skipped. The positions of the metas "subject" and "text" are looked up in
    the corpus of the first client and reused for all clients.
    """
    texts = []
    labels = []
    if len(allMails) == 0:
        return(texts,labels)
    referenceCorpus = allMails[next(iter(allMails))]
    subjectId = getFieldId(referenceCorpus,"subject")
    textId = getFieldId(referenceCorpus,"text")
    for clientId in allMails:
        if clientId not in dropouts:
            continue
        data = ""
        previousMailId = -1
        nbrSelected = 0
        while nbrSelected < nbrOfMails:
            nextMailId = getNextClientMailId(allMails,clientId,previousMailId)
            if nextMailId < 0:
                # the client wrote fewer than nbrOfMails mails
                break
            mail = allMails[clientId][nextMailId]
            data += mail.metas[subjectId]+SPACE+mail.metas[textId]+SPACE
            nbrSelected += 1
            previousMailId = nextMailId
        if len(data) > 0:
            labels.append(dropouts[clientId])
            texts.append(data)
    return(texts,labels)

def storeTactusDataInFastTextFile(X,y,outFileName):
    """Write the texts in X with the labels in y to a file in fastText format.

    X:           list of texts
    y:           list of labels, aligned with X
    outFileName: name of the file to write; an existing file is overwritten

    Every output line holds LABELPREFIX plus the label, a space and the text.
    The program is stopped with sys.exit() when X and y differ in length.
    Returns an empty tuple.
    """
    if len(X) != len(y): 
        sys.exit("storeTactusDataInFastTextFile(): incompatable lengths of X and y")
    # the with statement closes the file, also when writing fails half-way
    with open(outFileName,"w") as outFile:
        for text,label in zip(X,y):
            # fastText reads one example per line: a line break inside a text
            # would split the example and leave the remainder without a label
            flatText = str(text).replace("\r"," ").replace("\n"," ")
            print(LABELPREFIX+str(label),flatText,file=outFile)
    return()

def shuffleXy(X,y):
    """Return copies of X and y that are shuffled with the same random permutation.

    X: list of data items
    y: list of labels, aligned with X

    The input lists are left untouched: a shuffled list of positions is used
    to build two new lists, so that X[i] and y[i] stay paired. (Removing the
    chosen items from X and y one by one would empty the lists of the caller
    and takes quadratic time.)

    Returns the tuple (shuffledX,shuffledY). The program is stopped with
    sys.exit() when X and y differ in length.
    """
    if len(X) != len(y): sys.exit("shuffleXy(): incompatable lengths of X and y")
    positions = list(range(len(X)))
    random.shuffle(positions)
    shuffledX = [X[position] for position in positions]
    shuffledY = [y[position] for position in positions]
    return(shuffledX,shuffledY)

Now we can start a fasttext experiment using this data set

In [None]:
# Number of folds of the cross-validation
NBROFFOLDS = 5

# Select the data, shuffle it and compute the start position of every fold;
# the final boundary equals len(X)
selectedData,selectedLabels = selectFirstClientMails(allMails,NBROFCLIENTMAILS)
X,y = shuffleXy(selectedData,selectedLabels)
foldBoundaries = [round(foldNbr*len(X)/NBROFFOLDS) for foldNbr in range(NBROFFOLDS+1)]

In [None]:
# Training parameters: vector dimension and the range of epoch counts to try
DIM = 300
EPOCHSTART = 30
EPOCHEND = 31
EPOCHSTEP = 5
# Names of the data files that are rewritten for every fold
TRAINFILE = "fasttext-train.txt"
TESTFILE = "fasttext-test.txt"
# Pretrained vector files; only needed for the optional pretrainedVectors argument
WIKIFILENAME = "wiki.nl.vec"
CCFILENAME = "cc.nl.300.vec"
WIKIDIR = "/home/erikt/projects/newsgac/fasttext-runs/"

def fasttextPredict(X,y,foldBoundaries,epochStart,epochEnd,epochStep):
    """Run a cross-validation with fastText and return the predicted labels.

    X:              list of texts
    y:              list of labels, aligned with X
    foldBoundaries: list of positions; fold f is used as test data and covers
                    X[foldBoundaries[f]:foldBoundaries[f+1]]
    epochStart, epochEnd, epochStep: range of epoch counts; one model is
                    trained per fold for every value

    Returns a dictionary that maps str(epoch) to the list of predicted labels
    (without LABELPREFIX) of all folds, in the order of X.
    """
    epochs = range(epochStart,epochEnd,epochStep)
    predictions = {str(epoch):[] for epoch in epochs}
    for startTest,endTest in zip(foldBoundaries,foldBoundaries[1:]):
        storeTactusDataInFastTextFile(X[startTest:endTest],y[startTest:endTest],TESTFILE)
        storeTactusDataInFastTextFile(X[:startTest]+X[endTest:],y[:startTest]+y[endTest:],TRAINFILE)
        # the test file does not change between epochs: read it only once
        testTexts = []
        with open(TESTFILE,"r") as testFile:
            for line in testFile:
                tokens = line.strip().split()
                # drop the label at the start of the line; a blank line has no tokens
                if tokens and tokens[0].startswith(LABELPREFIX): tokens.pop(0)
                testTexts.append(" ".join(tokens))
        for epoch in epochs:
            # train on the file that was written above: the name
            # ANODIRECTORY+TRAINFILE only refers to that file as long as the
            # working directory is ANODIRECTORY
            # optional extra argument: pretrainedVectors=WIKIDIR+WIKIFILENAME
            model = fasttext.train_supervised(TRAINFILE,dim=DIM,epoch=epoch)
            for text in testTexts:
                # predict() returns (labels,scores); keep the first label
                predictions[str(epoch)].append(model.predict(text)[0][0].replace(LABELPREFIX,""))
    return(predictions)

In [None]:
# Cross-validate for every epoch count from EPOCHSTART up to (not including) EPOCHEND
predictions = fasttextPredict(X,y,foldBoundaries,EPOCHSTART,EPOCHEND,EPOCHSTEP)

In [None]:
import numpy as np

# Count how often every label occurs in y; the key '0' holds the number of all items
uniqueCounts = np.unique(y,return_counts=True)
totals = dict(zip(list(uniqueCounts[0]),list(uniqueCounts[1])))
totals['0'] = len(y)

def evaluate(predictions,y):
    """Print the accuracy per epoch and return a confusion table of the last epoch.

    predictions: dictionary that maps an epoch name to a list of predicted
                 labels, aligned with y
    y:           list of gold labels ('1' or '2')

    For every epoch one line is printed with the fraction of correct
    predictions over all items (key '0') and per gold label (keys '1' and
    '2'); the fraction is nan for a label that does not occur in y. The label
    totals are counted from y itself, so the figures are also right when y
    was replaced after a global count was made.

    Returns a DataFrame for the last epoch in predictions with the gold
    labels as rows and the predicted labels as columns, or None when
    predictions is empty.
    """
    labelTotals = {'1':0,'2':0}
    for label in y:
        labelTotals[label] = labelTotals.get(label,0)+1
    labelTotals['0'] = len(y)
    counts = None
    for epoch in predictions:
        counts = {'0':0,'1':0,'2':0}
        for predicted,gold in zip(predictions[epoch],y):
            if predicted == gold:
                counts['0'] += 1
                counts[gold] += 1
        print(epoch,end = " # ")
        for key in counts:
            fraction = counts[key]/labelTotals[key] if labelTotals[key] > 0 else float("nan")
            print(key,":",round(fraction,3),end="; ",sep="")
        print()
    if counts is None:
        return(None)
    # return the table: as a bare expression inside the function it was discarded
    return(pd.DataFrame([[counts['1'],labelTotals['1']-counts['1']],
                         [labelTotals['2']-counts['2'],counts['2']]],
                        columns=['1','2'],index=['1','2']))

# Report the fraction of correct predictions per epoch, overall and per label
evaluate(predictions,y)

When trying to predict dropout based on the final counselor mail, using a Wikipedia dictionary did not improve accuracy (83.7% vs 85.2%), at least not when training 30 epochs.

## Model explanation

Find out which tokens contribute to which classes.

In [None]:
# Train one model on all data. Train on the file that was just written: the
# name ANODIRECTORY+TRAINFILE only refers to that file as long as the working
# directory is ANODIRECTORY
storeTactusDataInFastTextFile(X,y,TRAINFILE)
model = fasttext.train_supervised(TRAINFILE,dim=DIM,epoch=EPOCHSTART)

# Ask the model for a label for every distinct token:
# predictions[label][token] holds the score that predict() returned
seen = {}
predictions = {}
for mailText in X:
    for token in mailText.split():
        if token in seen:
            continue
        seen[token] = True
        labels,scores = model.predict(token)
        for label,score in zip(labels,scores):
            predictions.setdefault(label,{})[token] = score

In [None]:
# Number of distinct tokens for which the model returned label 1 and label 2
len(predictions["__label__1"]),len(predictions["__label__2"])

In [None]:
# Rank the tokens of label 1 by descending score and show the first 20
sorted1 = dict(sorted(predictions["__label__1"].items(), key=lambda item: item[1], reverse=True))
dict(list(sorted1.items())[0:20])

In [None]:
# Rank the tokens of label 2 by descending score and show the first 20
sorted2 = dict(sorted(predictions["__label__2"].items(), key=lambda item: item[1], reverse=True))
dict(list(sorted2.items())[0:20])

There are many more tokens that trigger the largest class (1/dropout) than the smallest (2/finisher): about 3200 vs about 80 for the final counselor mail. The tokens with the highest scores according to fastText do not seem very interesting.

## Notes

1. about the data: some clients did not even do the first assignment: answering questions from the counselor. There probably is not enough material to properly analyze them. This assumption should be checked and, if found to be correct, a further selection should be made.
2. it could be a good idea to not select the first four client mails but only one: the, presumably extensive, introductory email in which the clients describe themselves

In [None]:
# Minimum number of words of a mail to count as a long mail
CUTOFF = 1000

# Count per dropout label the clients whose longest mail among their first
# NBROFCLIENTMAILS client mails has at least CUTOFF words
counter = {'1':0,'2':0}
for clientId in allMails:
    if clientId not in dropouts:
        continue
    maxLength = 0
    maxLengthPos = -1
    clientMailCounter = 0
    for m in range(0,len(allMails[clientId])):
        if allMails[clientId][m][FROMFIELD] == CLIENT:
            clientMailCounter += 1
            nbrOfWords = len(str(allMails[clientId][m]["text"]).split())
            # use the shared constant instead of a literal 4 so that this
            # count follows the number of mails used by the selection functions
            if nbrOfWords > maxLength and clientMailCounter <= NBROFCLIENTMAILS: 
                maxLength = nbrOfWords
                maxLengthPos = clientMailCounter
    if maxLength >= CUTOFF: 
        counter[dropouts[clientId]] += 1
        # debugging aid: print(clientId,dropouts[clientId],maxLength,maxLengthPos)
counter

In [None]:
def countWordsInMails(clientId):
    """Draw a bar plot with the number of words of every client mail of clientId.

    The mails are read from the global allMails; only mails with the value
    CLIENT in the "from" field are counted. Words are the whitespace-separated
    parts of the string representation of the "text" field.
    """
    corpus = allMails[clientId]
    nbrOfWords = [len(str(corpus[m]["text"]).split())
                  for m in range(len(corpus))
                  if corpus[m][FROMFIELD] == CLIENT]
    pd.DataFrame(nbrOfWords).plot.bar()

# Example: plot the mail lengths of a single client
countWordsInMails("AdB0023")

In [None]:
def selectByLongestMails(allMails,nbrOfMails):
    """Select the long mails among the first nbrOfMails client mails of every labeled client.

    Despite its name the function does not rank the mails by length: it
    inspects the first nbrOfMails client mails of a client and keeps the text
    of those with at least CUTOFF words. Subjects are not included.

    Returns a tuple (texts,labels). For each client the kept texts are
    concatenated into one string, each followed by a space; labels holds the
    matching values from dropouts. Clients without an entry in dropouts and
    clients without a long mail are skipped. The position of the meta "text"
    is looked up in the corpus of the first client and reused for all clients.
    """
    selectedData = []
    selectedLabels = []
    if len(allMails) == 0:
        return(selectedData,selectedLabels)
    firstClient = next(iter(allMails))
    textId = getFieldId(allMails[firstClient],"text")
    for clientId in allMails:
        if clientId not in dropouts:
            continue
        data = ""
        startClientMailId = -1
        clientMailCounter = 0
        while clientMailCounter < nbrOfMails:
            clientMailId = getNextClientMailId(allMails,clientId,startClientMailId)
            if clientMailId < 0:
                # the client wrote fewer than nbrOfMails mails
                break
            mailText = allMails[clientId][clientMailId].metas[textId]
            if len(mailText.split()) >= CUTOFF: data += mailText+SPACE
            # short mails are skipped but still count as inspected
            clientMailCounter += 1
            startClientMailId = clientMailId
        if len(data) > 0:
            selectedLabels.append(dropouts[clientId])
            selectedData.append(data)
    return(selectedData,selectedLabels)

# Repeat the experiment setup with the long mails only: select, shuffle and
# compute the start position of every fold
selectedData,selectedLabels = selectByLongestMails(allMails,NBROFCLIENTMAILS)
X,y = shuffleXy(selectedData,selectedLabels)
foldBoundaries = [round(foldNbr*len(X)/NBROFFOLDS) for foldNbr in range(NBROFFOLDS+1)]

In [None]:
predictions = fasttextPredict(X,y,foldBoundaries,5,50,5)
# Bring the global label totals in line with the y of this experiment before
# evaluating: they were last computed for the previous data selection
uniqueCounts = np.unique(y,return_counts=True)
totals = dict(zip(list(uniqueCounts[0]),list(uniqueCounts[1])))
totals['0'] = len(y)
evaluate(predictions,y)