In [None]:
import csv
import sys

# Directories of the two mail collections (named AS and ES in this notebook).
ASDIR = "../usb/ovk/data/eriktks/AS/text"
ESDIR = "../usb/ovk/data/eriktks/ES/text"
REVERSEDFILENAME = "reversed.txt"  # not referenced in the cells shown here
# Name of the mail CSV file that is read from each of the directories above.
MAILFILENAME = "mails.csv"
SEPARATOR = ","  # field delimiter passed to csv.DictReader
# Column names looked up in the mail CSV file.
CLIENT = "client-id"
COUNSELOR = "counselor"  # not referenced in the cells shown here
DATE = "date"
SENDER = "sender"

In [None]:
def safeOpenFile(fileName,mode):
    """Open fileName with the given mode and return the file object.

    On any failure the process is terminated through sys.exit with an
    "error: cannot open file ..." message instead of propagating the exception.
    """
    try:
        handle = open(fileName,mode)
    except Exception as error:
        sys.exit("error: cannot open file "+fileName+": "+str(error))
    else:
        return handle

def readFirstMails(inDir):
    """Return the earliest mail sent by each client, read from inDir/MAILFILENAME.

    Only rows whose SENDER column equals "CLIENT" are considered. For every
    client id the row with the smallest non-empty DATE value is kept; a row
    with an empty date is kept only as long as no dated row for that client
    has been seen.

    Dates are compared as plain strings.
    NOTE(review): this assumes the date format sorts lexicographically in
    chronological order -- confirm against the data.

    Parameters:
        inDir: directory containing the mail CSV file.

    Returns:
        dict mapping client id to a dict copy of the selected CSV row.

    Exits (sys.exit) when the file cannot be opened or a row cannot be
    processed, e.g. because an expected column is missing.
    """
    mails = {}
    inFileName = inDir+"/"+MAILFILENAME
    # The context manager closes the file on every path, including sys.exit below.
    with safeOpenFile(inFileName,"r") as inFile:
        csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)
        for row in csvReader:
            try:
                client = row[CLIENT]
                date = row[DATE]
                sender = row[SENDER]
                if sender != "CLIENT":
                    continue
                if client not in mails:
                    mails[client] = dict(row)
                elif date != "":
                    storedDate = mails[client][DATE]
                    # An undated stored mail must not block a dated one: "" never
                    # compares greater than a date string, so test it explicitly.
                    if storedDate == "" or storedDate > date:
                        mails[client] = dict(row)
            except Exception as e: sys.exit("error: "+str(e))
    return mails

# Load the first client mail per client for each collection.
# These calls read from disk when the cell is executed.
asFirstMails = readFirstMails(ASDIR)
esFirstMails = readFirstMails(ESDIR)

In [None]:
def textToTokenDict(text):
    """Return a dict mapping each distinct whitespace-separated token of text to True."""
    return dict.fromkeys(text.split(),True)
    
def makeFeatureDict(mailData):
    """Return the distinct tokens found in the "text" field of all mails as a list.

    Despite the name, the result is a list; tokens appear in the order in
    which they were first inserted while iterating over mailData.
    """
    vocabulary = {}
    for mail in mailData.values():
        # update() leaves keys that are already present at their position
        vocabulary.update(textToTokenDict(mail["text"]))
    return list(vocabulary)

# Token vocabulary over both collections. In the merged dict an ES entry
# replaces an AS entry that has the same client id.
featureList = makeFeatureDict({**asFirstMails,**esFirstMails})

In [None]:
# File that receives the 1.0/0.0 feature lines, one line per mail.
OUTFILENAME="data2tsne.txt"
        
def writeFeatureValueLine(mailTokenDict,featureList,outFile):
    """Write one line to outFile with a 1.0/0.0 flag per entry of featureList.

    A flag is 1.0 when the feature is a key of mailTokenDict, otherwise 0.0.
    Every flag, including the last one, is followed by a single space.
    """
    flags = ["1.0" if feature in mailTokenDict else "0.0" for feature in featureList]
    print("".join(flag+" " for flag in flags),file=outFile)

def writeFeatureValues(mailData,featureList,outFileName):
    """Write one feature line per mail in mailData to outFileName.

    Parameters:
        mailData: dict whose values are dicts with a "text" entry.
        featureList: list of tokens defining the columns of each line.
        outFileName: path of the file to (over)write.

    Lines are written in the iteration order of mailData. The process exits
    (via safeOpenFile) when the output file cannot be opened.
    """
    # The context manager closes the file even if a mail lacks "text" or a write fails.
    with safeOpenFile(outFileName,"w") as outFile:
        for mail in mailData.values():
            mailTokenDict = textToTokenDict(mail["text"])
            writeFeatureValueLine(mailTokenDict,featureList,outFile)

# Merge both collections (ES entries win on duplicate client ids) and write
# the feature lines for the merged set to OUTFILENAME.
selectedData = {**asFirstMails,**esFirstMails}
writeFeatureValues(selectedData,featureList,OUTFILENAME)

In [None]:
# CSV file read by readMetaData.
OVKMETAFILE = "../usb/ovk/data/eriktks/spss/opve.csv"
# Column used as key of the dictionaries built by readMetaData.
IDFIELDNAME = "onderzoeksnummer1"
# Column whose stripped value is stored in exitData.
EXITFIELDNAME = "Redenstoppen"
# Re-assigned here with the same value as in the first configuration cell.
SEPARATOR = ","

def readMetaData():
    """Read OVKMETAFILE and return (exitData, cesdDiff, mhcDiff).

    All three dictionaries are keyed by the value of the IDFIELDNAME column.

    Returns:
        exitData: stripped value of the EXITFIELDNAME column for every row.
        cesdDiff: int(CESD_TOT_t1) - int(CESD_TOT_t0), only for rows where
            neither value is "NA".
        mhcDiff: float(MHCtot_t1) - float(MHCtot_t0), only for rows where
            neither value is "NA".

    Raises:
        KeyError / ValueError when a column is missing or a value other than
        "NA" cannot be converted to a number.
    """
    exitData = {}
    cesdDiff = {}
    mhcDiff = {}
    # The context manager closes the file even when a conversion below raises.
    with safeOpenFile(OVKMETAFILE,"r") as inFile:
        for row in csv.DictReader(inFile,delimiter=SEPARATOR):
            rowId = row[IDFIELDNAME]
            exitData[rowId] = row[EXITFIELDNAME].strip()
            if row["CESD_TOT_t0"] != "NA" and row["CESD_TOT_t1"] != "NA":
                cesdDiff[rowId] = int(row["CESD_TOT_t1"])-int(row["CESD_TOT_t0"])
            if row["MHCtot_t0"] != "NA" and row["MHCtot_t1"] != "NA":
                mhcDiff[rowId] = float(row["MHCtot_t1"])-float(row["MHCtot_t0"])
    return exitData,cesdDiff,mhcDiff

# Module-level metadata tables; the test* predicates read these globals.
exitData,cesdDiff,mhcDiff = readMetaData()

In [None]:
# File that receives one 1.0/0.0 label line per client.
LABELFILE = "data2tsne.txt.labels"

def test(client):
    """Label predicate applied by writeLabels; currently delegates to testMhc."""
    return testMhc(client)
def testCesd(client):
    """Return True when client has an entry in cesdDiff that is greater than zero."""
    # a missing client falls back to 0, which does not satisfy the comparison
    return cesdDiff.get(client,0) > 0
def testExit(client):
    """Return True when client has a non-empty entry in exitData."""
    # a missing client falls back to "", which is treated like an empty entry
    return exitData.get(client,"") != ""
def testMhc(client):
    """Return True when client has an entry in mhcDiff that is below zero."""
    # a missing client falls back to 0, which does not satisfy the comparison
    return mhcDiff.get(client,0) < 0

def writeLabels(mailData,outFileName):
    """Write one label line per key of mailData to outFileName.

    The line is "1.0" when test(client) is true and "0.0" otherwise; lines
    follow the iteration order of mailData.
    """
    outFile = safeOpenFile(outFileName,"w")
    for client in mailData:
        label = "1.0" if test(client) else "0.0"
        print(label,file=outFile)
    outFile.close()

# Labels are written by iterating selectedData, the same dict that was
# iterated for the feature lines, so label lines follow the same client order.
writeLabels(selectedData,LABELFILE)