# Visualization of OVK data

First, read the data from various anonymized files and store them in six lists: 

1. mails: text of the emails
2. senders: sender of a mail: counselor or client
3. nbrOfChars: length of a mail in characters
4. counselor: id of the counselor
5. ids: id of the client
6. treatments: type of treatment, either AS (autobiographical writing) or ES (expressive writing)

For comparison, we include two additional data sets beside the two OVK data sets (AS and ES): biblical text from 1888 and newspaper text from 1985 (NRC). Each article is treated as an email (list mails). Dummy values have been used for the other five data lists.

In [None]:
import csv
import os
import re
import sys
import xml.etree.ElementTree as ET

# Input files (CSV). INFILENAMES selects which of them are actually read.
ASFILE = "../usb/ovk/data/eriktks/AS/text/AS-mails.csv"
ESFILE = "../usb/ovk/data/eriktks/ES/text/ES-mails.csv"
# The bible path is called BIBLEFILE because the name BIBLE is assigned the
# label "BIBLE" a few lines further down; sharing one name made the path
# unreachable.
BIBLEFILE = "../usb/ovk/data/eriktks/othertexts/bible.csv"
NEWSGAC = "../usb/ovk/data/eriktks/othertexts/newsgac.csv"
TACTUS = "../usb/output/emails-all.csv"
INFILENAMES = [ ASFILE,ESFILE ] # alternatives: [ ASFILE,ESFILE,BIBLEFILE,NEWSGAC ] or [ TACTUS ]

# Labels (presumably values found in the sender column -- verify against the data).
CLIENT = "CLIENT"
BIBLE = "BIBLE"
NEWSPAPER = "NEWSPAPER"

# Column names used to index the CSV rows.
COUNSELOR = "counselor"
GENDER = "GeslachtA"
ID = "client-id"
NBROFCHARS = "nbrOfCharsInWords"
NBROFSENTS = "nbrOfSents"
NBROFWORDS = "nbrOfWords"
SENDER = "sender"
SEPARATOR = ","
TEXT = "text"

# Selection settings for the reading loop.
MINWORDS = 0      # a mail is kept only if it has more than MINWORDS words
MAXMAILS = 1      # maximum number of mails kept per id; <= 0 means no limit
TARGET="CLIENT"   # only rows whose sender column equals TARGET are kept

# Read the selected mails of every input file into six parallel lists.
# For TACTUS input only mails, senders, ids and treatments are filled;
# nbrOfChars and counselors get no entries for such rows.
(counselors,ids,mails,nbrOfChars,senders,treatments) = ([],[],[],[],[],[])
firstMailSeen = {}  # ids whose first matching mail was skipped (TARGET == "COUNSELOR" only)
nbrOfMails = {}     # number of mails kept per id
for inFileName in INFILENAMES:
    try: inFile = open(inFileName,"r")
    except Exception as e: sys.exit("cannot read file "+inFileName+": "+str(e))
    with inFile:  # the file is closed even when a bad row aborts the run
        csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)
        for row in csvReader:
            try:
                # The checks are evaluated lazily, in this order, so that a
                # column is only accessed when the earlier checks passed.
                if not (inFileName == TACTUS or int(row[NBROFWORDS]) > MINWORDS):
                    continue
                if row[SENDER] != TARGET:
                    continue
                thisId = row[ID]
                if MAXMAILS > 0 and nbrOfMails.get(thisId, 0) >= MAXMAILS:
                    continue
                if TARGET == "COUNSELOR" and MAXMAILS > 0 and thisId not in firstMailSeen:
                    # With a mail limit, the first counselor mail of each id
                    # is only registered, not stored.
                    firstMailSeen[thisId] = True
                    continue
                mails.append(row[TEXT])
                senders.append(row[SENDER])
                nbrOfMails[thisId] = nbrOfMails.get(thisId, 0) + 1
                if inFileName != TACTUS:
                    nbrOfChars.append(int(row[NBROFCHARS]))
                    counselors.append(row[COUNSELOR])
                ids.append(thisId)
                # Every file other than ASFILE is labelled "ES".
                treatments.append("AS" if inFileName == ASFILE else "ES")
            except (KeyError, TypeError, ValueError):
                # Missing column, short row (None value) or non-numeric count.
                sys.exit("unexpected row in file "+inFileName+": "+str(row))

Check if we want to have separate emails or have all emails for one person collapsed in one text

In [None]:
COLLAPSE = True

if COLLAPSE:
    # Merge all mails that share an id into one text. The first mail of an
    # id supplies the counselor, sender and treatment of the merged entry;
    # later mails only extend the text and the character count.
    (newCounselors,newIds,newMails,newNbrOfChars,newSenders,newTreatments) = ([],[],[],[],[],[])
    collapse = {}  # id -> position of that id in the new* lists
    for i in range(0,len(ids)):
        if ids[i] in collapse:
            thisId = collapse[ids[i]]  # despite its name this is a list position
            newMails[thisId] += " "+mails[i]
            # nbrOfChars may be shorter than ids (the branch below already
            # allows for that), so only add a count when both sides exist.
            if i < len(nbrOfChars) and thisId < len(newNbrOfChars):
                newNbrOfChars[thisId] += nbrOfChars[i]
        else:
            collapse[ids[i]] = len(newIds)
            if i < len(counselors): newCounselors.append(counselors[i])
            newIds.append(ids[i])
            newMails.append(mails[i])
            if i < len(nbrOfChars): newNbrOfChars.append(nbrOfChars[i])
            newSenders.append(senders[i])
            newTreatments.append(treatments[i])
    counselors = newCounselors
    ids = newIds
    mails = newMails
    nbrOfChars = newNbrOfChars
    senders = newSenders
    treatments = newTreatments

Read the meta data: exit phrase and the scores CESD and MHC. If the exit phrase is non-empty the patient has abandoned the treatment. The lower the CESD (depression) score, the better. The higher the MHC (mental health) score, the better.

In [None]:
OVKMETAFILE = "../usb/ovk/data/eriktks/spss/opve.csv"
IDFIELDNAME = "onderzoeksnummer1"
EXITFIELDNAME = "Redenstoppen"

# Per-client meta data, all keyed by the value of the IDFIELDNAME column.
exitData = {}  # content of the EXITFIELDNAME column
cesdDiff = {}  # CESD_TOT_t1 minus CESD_TOT_t0
mhcDiff = {}   # MHCtot_t1 minus MHCtot_t0
gender ={}     # content of the GENDER column
try: inFile = open(OVKMETAFILE,"r")
except Exception as e: sys.exit("cannot read file "+OVKMETAFILE+": "+str(e))
with inFile:  # guarantees the file is closed, also when a row cannot be parsed
    csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)
    for row in csvReader:
        clientId = row[IDFIELDNAME]
        exitData[clientId] = row[EXITFIELDNAME]
        gender[clientId] = row[GENDER]
        # A difference is stored only when both measurements are available
        # (the value "NA" marks a missing score).
        if row["CESD_TOT_t0"] != "NA" and row["CESD_TOT_t1"] != "NA":
            cesdDiff[clientId] = int(row["CESD_TOT_t1"])-int(row["CESD_TOT_t0"])
        if row["MHCtot_t0"] != "NA" and row["MHCtot_t1"] != "NA":
            mhcDiff[clientId] = float(row["MHCtot_t1"])-float(row["MHCtot_t0"])

Next, count the tokens in the mails. We use a standard Python library for this, TfidfVectorizer, which normalizes the counts with respect to the lengths of the mails and prefers tokens that appear in a few mails over tokens that appear in every mail.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Smallest and largest n-gram size; getTokens() in a later cell uses the
# same two limits.
NGRAMMIN = 1
NGRAMMAX = 2

# Build the TF-IDF table with one row per text in mails. min_df and max_df
# are given as floats; see the TfidfVectorizer documentation for how float
# bounds are interpreted.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(NGRAMMIN, NGRAMMAX),
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(mails)

The AS and ES contain 2,000 mails and 25,000 different tokens. Therefore the previous analysis resulted in a table with 2,000 rows and 25,000 columns. We use principal component analysis to summarize this table to 2,000 rows and 4 columns.

In [None]:
from sklearn.decomposition import PCA

# Reduce the TF-IDF table to four principal components per text.
pca = PCA(n_components=4)
# Convert the matrix to a dense array once and reuse it for both steps;
# the conversion used to be executed twice.
denseMatrix = tfidf_matrix.toarray()
pca.fit(denseMatrix)
newSpace = pca.transform(denseMatrix)
del denseMatrix  # only newSpace is needed from here on

Next the mails can be shown in a graph. Each row in the table corresponds with a mail. The values in the columns can be used as x-coordinates and y-coordinates to position the mails in the graph. We use the first two column values because they are expected to contain the most important information for creating interesting groups of mails in the graph.

In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt
import random
import re

# Plot settings.
THRESHOLD = 5        # only passed to str.format() calls in makePlot()
RANDOMFACTOR = 0.00  # scale of the random offset added to each coordinate
DOTSIZE = 5          # used as marker size and as picker value of the dots

# Ids and coordinates of the two groups shown in the plot:
# G1 = testFunction() is true for the id, G2 = all other ids.
idsG1, idsG2 = [], []
xG1, xG2 = [], []
yG1, yG2 = [], []
counselorDict = {}   # id -> counselor of that id

def testFunction(thisId):
    return(stoppedTreatment(thisId))

def stoppedTreatment(thisId):
    if thisId == "NAME": return("Treatment progress")
    elif thisId == "GROUP1": return("abandoned treatment")
    elif thisId == "GROUP2": return("finished treatment")
    else: return(re.search(r"[a-zA-Z]",exitData[thisId]))

def moreDepressed(thisId):
    """Grouping criterion: did the CESD score of the client increase?

    The special arguments "NAME", "GROUP1" and "GROUP2" return the title of
    the criterion and the labels of the two groups. For any other argument
    the function returns True if thisId has an entry in cesdDiff that is
    larger than zero and False otherwise (also when there is no entry).
    """
    if thisId == "NAME": return("Depression progress (CESD)")
    elif thisId == "GROUP1": return("more depressed")
    elif thisId == "GROUP2": return("less depressed")
    # Test the id that was passed in rather than the global ids[i].
    else: return(thisId in cesdDiff and cesdDiff[thisId] > 0)

def worseMental(thisId):
    """Grouping criterion: did the MHC score of the client decrease?

    The special arguments "NAME", "GROUP1" and "GROUP2" return the title of
    the criterion and the labels of the two groups. For any other argument
    the function returns True if thisId has an entry in mhcDiff that is
    smaller than zero and False otherwise (also when there is no entry).
    """
    if thisId == "NAME": return("Mental health progress (MHC)")
    elif thisId == "GROUP1": return("mental health deteriorated")
    elif thisId == "GROUP2": return("mental health improved")
    # Test the id that was passed in rather than the global ids[i].
    else: return(thisId in mhcDiff and mhcDiff[thisId] < 0)

def clientGender(thisId):
    """Grouping criterion: is the gender entry of the client "man"?

    The special arguments "NAME", "GROUP1" and "GROUP2" return the title of
    the criterion and the labels of the two groups. For any other argument
    the gender entry of thisId is compared with the string "man".
    """
    labels = {"NAME": "Gender", "GROUP1": "man", "GROUP2": "woman"}
    if thisId in labels:
        return labels[thisId]
    return gender[thisId] == "man"

def false(thisId):
    """Grouping criterion that puts every id in the second group.

    The special arguments "NAME", "GROUP1" and "GROUP2" return the title of
    the criterion and the labels of the two groups; every other argument
    yields False.
    """
    for key, label in (("NAME", "No test criterion"),
                       ("GROUP1", "false"),
                       ("GROUP2", "true")):
        if thisId == key:
            return label
    return False

# Distribute the texts over the two plot groups: group 1 holds the ids for
# which testFunction() is true, group 2 all other ids. The first two
# columns of newSpace serve as x and y coordinate; each coordinate gets a
# random offset scaled by RANDOMFACTOR.
random.seed()
# The index keeps the name i: it is a module-level name that other code in
# this notebook may read.
for i in range(len(newSpace)):
    clientId = ids[i]
    if testFunction(clientId):
        xTarget, yTarget, idTarget = xG1, yG1, idsG1
    else:
        xTarget, yTarget, idTarget = xG2, yG2, idsG2
    xTarget.append(newSpace[i][0] + random.random()*RANDOMFACTOR)
    yTarget.append(newSpace[i][1] + random.random()*RANDOMFACTOR)
    idTarget.append(clientId)
    counselorDict[clientId] = counselors[i]
    
def makePlot(colorG1,colorG2):
    """Draw a scatter plot of the two groups and make the dots clickable.

    colorG1 and colorG2 are the colors of the dots of group 1 (xG1, yG1)
    and group 2 (xG2, yG2). The legend shows the label and the size of each
    group. Clicking on dots replaces the plot title by a list of
    id:number-of-mails:counselor entries for the selected dots.
    """
    EXPERIMENT = testFunction("NAME")

    def pickScatter(event):
        # event.ind holds the positions of the picked dots within the
        # scatter collection event.artist.
        groupIds = idsG2 if event.artist == g2Dots else idsG1
        entries = []
        for position in event.ind:
            dataId = groupIds[position]
            entries.append(dataId+":"+str(nbrOfMails[dataId])+":"+counselorDict[dataId])
        plt.gca().set_title(EXPERIMENT+" ["+",".join(entries)+"]",fontsize=12)

    plt.figure(figsize=(9,5))
    plt.gca().set_title(EXPERIMENT)
    # The labels used to end in ")".format(THRESHOLD); the literal ")"
    # contains no replacement field, so that call never changed anything.
    g1Dots = plt.scatter(xG1,yG1,s=DOTSIZE,color=colorG1,picker=DOTSIZE,
                         label=testFunction("GROUP1")+" ("+str(len(idsG1))+")")
    g2Dots = plt.scatter(xG2,yG2,s=DOTSIZE,color=colorG2,picker=DOTSIZE,
                         label=testFunction("GROUP2")+" ("+str(len(idsG2))+")")
    plt.legend(fontsize=8)
    plt.gcf().canvas.mpl_connect("pick_event",pickScatter)

makePlot("blue","red")
plt.savefig("image.png")

The cloud contains different colors for clients that completed the treatment (red) and clients that stopped the treatment (blue). Some blue dots are grouped in a separate area but some are placed among the red dots. We now can do two things:

* we can build a model for all the blue dots (or all the red dots). But this model will need to separate the difficult cases as well: the blue dots among the red dots
* we can build a model for the blue dots that are in a separate area (top right). Actually this PCA analysis already is such a model but we want to see if we can define the group with lexical features and then check if the chosen features make sense

In [None]:
# plot distribution of article lengths in words
# add ot, newspaper
# collect counselor texts of pca1 > 0.1, pca1 < 0.1; compute t-scores

# Limits on the first two PCA coordinates; in this cell they are only
# referenced by a commented-out selection condition.
THRESHOLDX, THRESHOLDY = 0.2, 0.0

def getTokens(text):
    """Return the word n-grams of text as a list of strings.

    The text is split on whitespace. For every size from NGRAMMIN up to and
    including NGRAMMAX all sequences of that many consecutive words are
    collected, joined by single spaces: first all n-grams of the smallest
    size in text order, then those of the next size, and so on.
    """
    words = text.split()
    collected = []
    for size in range(NGRAMMIN, NGRAMMAX+1):
        # A size below 1 yields the single words, exactly like size 1.
        width = max(size, 1)
        nbrOfStarts = min(len(words), len(words)-size+1)
        collected.extend(" ".join(words[start:start+width])
                         for start in range(nbrOfStarts))
    return collected

# Count for every n-gram in how many texts of each group it occurs. The
# groups are defined by testFunction(): texts for which it is false are
# counted as "completed", all others as "stopped".
textCompleted = {}       # n-gram -> number of completed texts containing it
textStopped = {}         # n-gram -> number of stopped texts containing it
totalCompleted = 0       # sum of all counts in textCompleted
totalStopped = 0         # sum of all counts in textStopped
nbrOfMailsCompleted = 0  # number of texts in the completed group
nbrOfMailsStopped = 0    # number of texts in the stopped group
for i in range(0,len(newSpace)):
    # dict.fromkeys() removes repeated n-grams while keeping the order of
    # first occurrence, so every text counts an n-gram at most once and the
    # count dictionaries are filled in a reproducible order.
    distinctTokens = dict.fromkeys(getTokens(mails[i]))
    # A selection on the PCA coordinates (newSpace[i][0] > THRESHOLDX and
    # newSpace[i][1] < THRESHOLDY) was used here before; it is disabled.
    if not testFunction(ids[i]):
        nbrOfMailsCompleted += 1
        totalCompleted += len(distinctTokens)
        for ngram in distinctTokens:
            textCompleted[ngram] = textCompleted.get(ngram, 0) + 1
    else:
        nbrOfMailsStopped += 1
        totalStopped += len(distinctTokens)
        for ngram in distinctTokens:
            textStopped[ngram] = textStopped.get(ngram, 0) + 1

Next we determine which words appear more often in the top-left group and which words appear more often in the bottom-right group. For this purpose we use the t-score, a statistical measure for comparing frequencies in two data sets.

In [None]:
import math

def _tscore(countCompleted, countStopped):
    """Return the t-score of one n-gram from its counts in the two groups.

    A count of None means that the n-gram does not occur in that group; it
    is then replaced by 0.5. Counts are turned into relative frequencies
    with totalCompleted and totalStopped, and frequency divided by total is
    used as squared standard error of each group.
    """
    freqCompleted = (0.5 if countCompleted is None else countCompleted)/totalCompleted
    freqStopped = (0.5 if countStopped is None else countStopped)/totalStopped
    seCompleted = freqCompleted/totalCompleted
    seStopped = freqStopped/totalStopped
    return (freqCompleted-freqStopped)/math.sqrt(seCompleted+seStopped)

# Positive scores mark n-grams that are relatively more frequent in the
# completed group, negative scores n-grams that are relatively more
# frequent in the stopped group.
tscores = {}
if totalCompleted == 0 or totalStopped == 0:
    # Without tokens in one of the groups the frequencies cannot be
    # computed (division by zero), so no scores are produced.
    if textCompleted or textStopped:
        print("cannot compute t-scores: one of the two groups contains no tokens")
else:
    for token in textCompleted:
        tscores[token] = _tscore(textCompleted[token], textStopped.get(token))
    for token in textStopped:
        if token not in textCompleted:
            tscores[token] = _tscore(None, textStopped[token])

Finally, we display the top N (currently 20) most specific words of each group.

In [None]:
import operator
import re

N = 20
sortedTscores = sorted(tscores.items(), key=operator.itemgetter(1), reverse=True)

def _printTopNgrams(rankedItems, forCompleted):
    """Print the first N purely alphabetic n-grams of rankedItems.

    rankedItems yields (n-gram, t-score) pairs in the order in which they
    should be shown. Each printed line contains the rank and the n-gram,
    followed by four numbers:
    - for the completed group (forCompleted is True): the number of stopped
      texts without the n-gram plus the number of completed texts with it;
      for the stopped group the roles of the two groups are swapped
      (presumably the number of texts the n-gram would assign to the right
      group -- confirm with the author),
    - the number of stopped texts containing the n-gram,
    - the number of completed texts containing the n-gram,
    - the t-score.
    """
    shown = 0
    for ngram, score in rankedItems:
        if not re.match(r"^[a-zA-Z ]+$",ngram): continue
        # .get() leaves the count dictionaries untouched; adding zero
        # entries to them would change the scores when the t-score cell is
        # run again.
        stopped = textStopped.get(ngram, 0)
        completed = textCompleted.get(ngram, 0)
        if forCompleted: firstColumn = (nbrOfMailsStopped-stopped)+completed
        else: firstColumn = (nbrOfMailsCompleted-completed)+stopped
        print(str(shown+1)+". "+ngram,firstColumn,stopped,completed,score)
        shown += 1
        if shown >= N: break

# Highest t-scores first: n-grams typical for the second group.
print(testFunction("GROUP2")+" ("+str(nbrOfMailsCompleted)+")")
_printTopNgrams(sortedTscores, True)

# Lowest t-scores first: n-grams typical for the first group. reversed()
# covers the complete list, including its first element.
print("\n"+testFunction("GROUP1")+" ("+str(nbrOfMailsStopped)+")")
_printTopNgrams(reversed(sortedTscores), False)

Next:

* explore linking mails of the same session (+)
* explore topic classification with nmf (-)
* solve click in graph bug
* check mails of stopped patients