# Fasttext experiments with tactus data

First, import the required modules: general Python modules, fasttext, and the library modules of the orange-hackathon project.

In [1]:
import csv
import fasttext
import gzip
import os
import re
import sys

sys.path.append("/home/erikt/projects/e-mental-health/enron/orange-hackathon/orangehackathon/libs")
import tactusloaderLIB
import OWEmailSorterLIB
import markduplicatesLIB
import removemarkedtextLIB
import LIWCLIB

Next, read the mails from the data set via the orange-hackathon libraries and store them in the data structure allMails. This takes several minutes, so we do not want to do it often.

In [2]:
# Directory passed to tactusloaderLIB.processFile() as the location of the data
# files (presumably the anonymized release -- TODO confirm)
ANODIRECTORY = "/home/erikt/projects/e-mental-health/usb/releases/20191217/"
# Substring removed from file names by shortenFileName()
ANOSTRING = "-an"
# Extension appended to the file name that is passed to tactusloaderLIB.processFile()
GZEXTENSION = ".gz"
# Extension removed from file names by shortenFileName()
XMLEXTENSION = ".xml"

# From here on, relative file names (such as OUTFILENAME) are resolved against ANODIRECTORY
os.chdir(ANODIRECTORY)

def shortenFileName(fileName):
    """Return fileName with the ANOSTRING and XMLEXTENSION substrings removed.

    Both substrings are removed literally, wherever they occur in the name.
    Plain string replacement is used instead of re.sub() because the "." in
    XMLEXTENSION is a wildcard when used as a regular expression: the pattern
    ".xml" would also delete sequences such as "axml" from the name.
    """
    return(fileName.replace(ANOSTRING,"").replace(XMLEXTENSION,""))

In [3]:
def getAllTactusMails():
    """Read and preprocess the mails of all patients.

    For every patient id from 1 up to and including 1987, the file named by
    tactusloaderLIB.makeFileName() (plus GZEXTENSION) is loaded from
    ANODIRECTORY. When the first element returned by processFile() is not
    empty, it is passed through OWEmailSorterLIB.filterEmails(),
    markduplicatesLIB.processCorpus() and removemarkedtextLIB.processCorpus(),
    in that order.

    Files that cannot be processed are reported on standard output and skipped.

    Returns:
        A dict that maps the shortened file name (see shortenFileName) to the
        processed mails of that file.
    """
    allMails = {}
    for patientId in range(1,1988):
        # progress indicator: print every 100th patient id on a single line
        if patientId % 100 == 0: print(patientId,end=" ")
        fileName = tactusloaderLIB.makeFileName(str(patientId))
        try:
            mails = tactusloaderLIB.processFile(ANODIRECTORY,fileName+GZEXTENSION)
            if len(mails[0]) > 0:
                sortedMails = OWEmailSorterLIB.filterEmails(mails[0],filter_asc=True)
                markedMails = markduplicatesLIB.processCorpus(sortedMails)
                strippedMails = removemarkedtextLIB.processCorpus(markedMails)
                allMails[shortenFileName(fileName)] = strippedMails
        except Exception as error:
            # Catch Exception rather than using a bare except: the run takes
            # minutes, and a bare except would also swallow KeyboardInterrupt,
            # making it impossible to stop the loop. The error is reported so
            # that the reason a file was skipped is not lost.
            print("problem processing file",fileName,repr(error))
    return(allMails)

# Loading is disabled by default because it takes several minutes:
# allMails stays empty unless the call on the next line is uncommented.
allMails = {}
# allMails = getAllTactusMails()

We also need labels for the data. We will use the dropout labels provided by a student's project. Only files with dropout label 1 or 2 are kept.

In [4]:
# Gzipped csv file; the columns named by FILE and DROPOUT are read from it
SELECTEDFILE = "/home/erikt/projects/e-mental-health/usb/releases/20200305/selected.csv.gz"
DROPOUT = "dropout"
FILE = "file"

# Maps the shortened file name to its dropout label; only labels "1" and "2" are stored
dropout = {}
# The with statement closes the file even when reading or parsing a row fails
with gzip.open(SELECTEDFILE,"rt",encoding="utf-8") as inFile:
    csvreader = csv.DictReader(inFile)
    for row in csvreader:
        # Strip an optional "-an" and the ".xml" or ".xml.gz" extension from the
        # end of the name. The dots are escaped so that they match a literal "."
        # only, not an arbitrary character.
        row[FILE] = re.sub(r"(-an)?\.xml(\.gz)?$","",row[FILE])
        if row[DROPOUT] in ("1","2"):
            dropout[row[FILE]] = row[DROPOUT]

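Fasttext operates on files, so we should store our data in a file to enable fasttext to access it. For every client with a dropout label we store one line: the label, followed by the subject and the text of the client's last mail.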

In [5]:
# Index used to select a client's mail in storeTactusDataInFastTextFile(); -1 picks the last one
LASTMAILID = -1
# String written directly in front of the dropout label on every output line
LABELPREFIX = "__label__"
# Name of the output file, opened relative to the current working directory
OUTFILENAME = "fasttext.txt"

def getFieldId(corpus,fieldName):
    """Return the position of the meta column named fieldName in corpus.domain.metas.

    A column matches when its string representation equals fieldName. If
    several columns match, the position of the last one is returned; if none
    matches, the result is -1.
    """
    matchingPositions = [position for position,meta in enumerate(corpus.domain.metas) if str(meta) == fieldName]
    if not matchingPositions:
        return(-1)
    return(matchingPositions[-1])

def storeTactusDataInFastTextFile(allMails):
    """Write the last mail of every labeled client to OUTFILENAME for fasttext.

    Each output line consists of LABELPREFIX directly followed by the client's
    dropout label, then the subject and the text of the client's last mail
    (index LASTMAILID), separated by single spaces. Clients without an entry
    in dropout are skipped. When allMails is empty nothing is written and the
    output file is not touched.

    Args:
        allMails: dict that maps client names to their mails, as returned by
            getAllTactusMails().
    """
    if not allMails:
        return
    # The meta column positions are looked up once, in the first client's mails
    firstClient = next(iter(allMails))
    subjectId = getFieldId(allMails[firstClient],"subject")
    textId = getFieldId(allMails[firstClient],"text")
    # NOTE(review): getFieldId() returns -1 when a column is missing, which
    # would silently select the last meta column here -- confirm that both
    # columns always exist.

    # with: the file is closed even when writing a line fails.
    # utf-8: written explicitly to match the encoding used for reading the
    # labels, instead of depending on the platform default.
    with open(OUTFILENAME,"w",encoding="utf-8") as outFile:
        for client in allMails:
            if client in dropout:
                lastMail = allMails[client][LASTMAILID]
                print(LABELPREFIX+str(dropout[client]),lastMail.metas[subjectId],lastMail.metas[textId],file=outFile)
    
# Writes nothing while allMails is empty (the function only writes when allMails has entries)
storeTactusDataInFastTextFile(allMails)

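Now we can start a fasttext experiment using this data set. The cell below trains on fasttext-train.txt and evaluates on fasttext-test.txt in the data directory; how these two files were derived from fasttext.txt is not shown here.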

In [13]:
# Value passed to fasttext as the dim argument
DIM = 300
# Not used in this cell: the loop below tries a range of epoch values instead
EPOCH = 18
# Names of the training and test file, both located in ANODIRECTORY
TRAIN = "fasttext-train.txt"
TEST = "fasttext-test.txt"
# Not used in this cell; presumably the pretrained vector files of the
# "with pretrained vectors" runs and their directory -- TODO confirm
WIKIFILENAME = "wiki.nl.vec"
CCFILENAME = "cc.nl.300.vec"
WIKIDIR = "/home/erikt/projects/newsgac/fasttext-runs/"

# Build both paths once instead of on every iteration
trainPath = os.path.join(ANODIRECTORY,TRAIN)
testPath = os.path.join(ANODIRECTORY,TEST)

# Train a fresh model for each epoch count from 1 to 29 and print its test result
for epoch in range(1,30):
    model = fasttext.train_supervised(trainPath,dim=DIM,epoch=epoch)
    print(epoch,model.test(testPath))

1 (158, 0.5379746835443038, 0.5379746835443038)
2 (158, 0.5316455696202531, 0.5316455696202531)
3 (158, 0.5316455696202531, 0.5316455696202531)
4 (158, 0.5316455696202531, 0.5316455696202531)
5 (158, 0.5316455696202531, 0.5316455696202531)
6 (158, 0.5316455696202531, 0.5316455696202531)
7 (158, 0.6139240506329114, 0.6139240506329114)
8 (158, 0.7341772151898734, 0.7341772151898734)
9 (158, 0.8227848101265823, 0.8227848101265823)
10 (158, 0.8481012658227848, 0.8481012658227848)
11 (158, 0.8544303797468354, 0.8544303797468354)
12 (158, 0.8607594936708861, 0.8607594936708861)
13 (158, 0.8670886075949367, 0.8670886075949367)
14 (158, 0.8734177215189873, 0.8734177215189873)
15 (158, 0.8860759493670886, 0.8860759493670886)
16 (158, 0.8924050632911392, 0.8924050632911392)
17 (158, 0.8860759493670886, 0.8860759493670886)
18 (158, 0.8924050632911392, 0.8924050632911392)
19 (158, 0.8987341772151899, 0.8987341772151899)
20 (158, 0.8924050632911392, 0.8924050632911392)
21 (158, 0.879746835443038, 0

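Summary of results. Each line shows the number of epochs, followed by the fasttext test output: (number of test examples, precision, recall).

without pretrained vectors: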
18 (158, 0.8987341772151899, 0.8987341772151899)
with pretrained vectors (wiki):
3 (158, 0.8607594936708861, 0.8607594936708861)
5 (158, 0.8734177215189873, 0.8734177215189873)
7 (158, 0.8607594936708861, 0.8607594936708861)
10 (158, 0.8670886075949367, 0.8670886075949367)
15 (158, 0.8544303797468354, 0.8544303797468354)
with pretrained vectors (cc): CRASH (memory problem)