# Visualization of OVK data

First, read the data from various anonymized files and store them in six lists: 

1. mails: text of the emails
2. senders: sender of a mail: counselor or client
3. nbrOfChars: length of a mail in characters
4. counselor: id of the counselor
5. ids: id of the client
6. treatments: treatment type, either AS (autobiographical writing) or ES (expressive writing)

For comparison, we include two additional data sets besides the two OVK data sets (AS and ES): biblical text from 1888 and newspaper text from 1985 (NRC). Each article is treated as an email (list mails). Dummy values have been used for the other five data lists.

In [None]:
import csv
import os
import re
import sys
import xml.etree.ElementTree as ET

# input files: one csv file per data set
ASFILE = "../usb/ovk/data/eriktks/AS/text/AS-mails.csv"
ESFILE = "../usb/ovk/data/eriktks/ES/text/ES-mails.csv"
# the Bible path has its own name: BIBLE is the sender label defined below,
# binding both to the same name silently dropped the path
BIBLEFILE = "../usb/ovk/data/eriktks/othertexts/bible.csv"
NEWSGAC = "../usb/ovk/data/eriktks/othertexts/newsgac.csv"
TACTUS = "../usb/output/emails-all.csv"
# files that are read; alternatives: [ ASFILE,ESFILE,BIBLEFILE,NEWSGAC ] or [ TACTUS ]
INFILENAMES = [ ASFILE,ESFILE ]
# sender labels
CLIENT = "CLIENT"
BIBLE = "BIBLE"
NEWSPAPER = "NEWSPAPER"
# names of the columns in the csv files
COUNSELOR = "counselor"
ID = "client-id"
NBROFCHARS = "nbrOfCharsInWords"
NBROFSENTS = "nbrOfSents"
NBROFWORDS = "nbrOfWords"
SENDER = "sender"
SEPARATOR = ","
TEXT = "text"
# mails need to contain more than MINWORDS words to be included
MINWORDS = 0

# read the mails: every list gets one value per accepted csv row
# (for TACTUS rows nothing is added to nbrOfChars and counselors)
(counselors,ids,mails,nbrOfChars,senders,treatments) = ([],[],[],[],[],[])
for inFileName in INFILENAMES:
    try: inFile = open(inFileName,"r")
    except OSError as e: sys.exit("cannot read file "+inFileName+": "+str(e))
    # the with statement closes the file, also when a row is rejected
    with inFile:
        for row in csv.DictReader(inFile,delimiter=SEPARATOR):
            try:
                # the TACTUS file is not filtered on word count
                if inFileName != TACTUS and int(row[NBROFWORDS]) <= MINWORDS: continue
                mails.append(row[TEXT])
                senders.append(row[SENDER])
                if inFileName != TACTUS:
                    nbrOfChars.append(int(row[NBROFCHARS]))
                    counselors.append(row[COUNSELOR])
                ids.append(row[ID])
                # every file other than the AS file is labeled ES
                treatments.append("AS" if inFileName == ASFILE else "ES")
            # KeyError: missing column; ValueError/TypeError: count is not a number or is absent
            except (KeyError,TypeError,ValueError):
                sys.exit("unexpected row in file "+inFileName+": "+str(row))

Next, count the tokens in the mails. We use a standard Python library for this, TfidfVectorizer, which normalizes the counts with respect to the lengths of the mails and prefers tokens that appear in a few mails over tokens that appear in every mail.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# smallest and largest n-gram size used as token (1 and 1: single words only)
NGRAMMIN = 1
NGRAMMAX = 1

# min_df and max_df are fractions of the mails: tokens that occur in fewer than 20%
# or in more than 80% of the mails are left out of the vocabulary
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(NGRAMMIN,NGRAMMAX),
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)
# one row per mail, one column per token that was kept
tfidf_matrix = tfidf_vectorizer.fit_transform(mails)

The AS and ES data sets contain 2,000 mails and 25,000 different tokens. Therefore the previous analysis resulted in a table with 2,000 rows and 25,000 columns. We use principal component analysis to summarize this table to 2,000 rows and 4 columns.

In [None]:
from sklearn.decomposition import PCA

# PCA is given a dense array: convert the sparse tf-idf matrix once and reuse the
# result for both fit and transform instead of building the dense copy twice
denseMatrix = tfidf_matrix.toarray()
pca = PCA(n_components=4)
pca.fit(denseMatrix)
# newSpace: one row per mail with its 4 principal component values
newSpace = pca.transform(denseMatrix)

Next the mails can be shown in a graph. Each row in the table corresponds with a mail. The values in the columns can be used as x-coordinates and y-coordinates to position the mails in the graph. We use the first two column values because they are expected to contain the most important information for creating interesting groups of mails in the graph.

In [None]:
# draw graph of pca data: red: from client; blue: from counselor

%matplotlib notebook

import matplotlib.pyplot as plt
import random
import re

EXPERIMENT = "Emails"   # plot title
DOTSIZE = 5             # size of the dots in the plot
THRESHOLD = 5
RANDOMFACTOR=0.00       # scale of the random offset added to each coordinate (0: none)

# per sender type: the mail ids and the x- and y-coordinates of the mails
idsCli,idsCou,idsBib,idsNew = [],[],[],[]
xCli,xCou,xBib,xNew = [],[],[],[]
yCli,yCou,yBib,yNew = [],[],[],[]

# target lists per sender label; any other sender is treated as counselor
# note: top left counselors: C16 C88 C77 C65
groupsBySender = {
    "CLIENT": (xCli,yCli,idsCli),
    "BIBLE": (xBib,yBib,idsBib),
    "NEWSPAPER": (xNew,yNew,idsNew),
}
counselorGroup = (xCou,yCou,idsCou)

random.seed()
for mailNbr,point in enumerate(newSpace):
    xValues,yValues,groupIds = groupsBySender.get(senders[mailNbr],counselorGroup)
    # the first two PCA columns are the position of the mail in the graph
    xValues.append(point[0]+random.random()*RANDOMFACTOR)
    yValues.append(point[1]+random.random()*RANDOMFACTOR)
    groupIds.append(ids[mailNbr])

def makePlot(colorCli,colorCou,colorBib,colorNew,outFileName="image.png"):
    """Draw the mails as a scatter plot with one colored group per sender type.

    Reads the module-level coordinate and id lists (xCli, yCli, idsCli, ...).
    Picking dots in the plot puts the ids of the picked mails in the title.

    Parameters:
        colorCli, colorCou, colorBib, colorNew: dot colors for the mails from
            clients, counselors, the Bible and the newspaper
        outFileName: file the plot is saved to (default: "image.png")
    """
    def drawGroup(xValues,yValues,groupIds,color,name):
        # only non-empty groups get a legend entry showing the number of mails
        if len(xValues) > 0:
            return(plt.scatter(xValues,yValues,s=DOTSIZE,color=color,picker=2,
                               label="from "+name+" ("+str(len(groupIds))+")"))
        return(plt.scatter(xValues,yValues,s=DOTSIZE,color=color,picker=DOTSIZE))

    def pickScatter(event):
        # find the id list of the picked group; counselors are the fallback
        if event.artist == cliDots: groupIds = idsCli
        elif event.artist == bibDots: groupIds = idsBib
        elif event.artist == newDots: groupIds = idsNew
        else: groupIds = idsCou
        dataIds = [groupIds[index] for index in event.ind]
        plt.gca().set_title(EXPERIMENT+str(dataIds),fontsize=12)

    plt.figure(figsize=(9,5))
    plt.gca().set_title(EXPERIMENT)
    # drawing order: counselors first, then clients, Bible and newspaper
    drawGroup(xCou,yCou,idsCou,colorCou,"counselor")
    cliDots = drawGroup(xCli,yCli,idsCli,colorCli,"client")
    bibDots = drawGroup(xBib,yBib,idsBib,colorBib,"Bible")
    newDots = drawGroup(xNew,yNew,idsNew,colorNew,"newspaper")
    plt.legend(fontsize=8)
    plt.gcf().canvas.mpl_connect("pick_event",pickScatter)
    plt.savefig(outFileName)

# colors in parameter order: client, counselor, Bible, newspaper
makePlot("red","blue","green","black")


The counselor cloud is divided in two parts: top-left and bottom-right. We examine the vocabulary of the two parts to find out what the differences between them are. We draw an imaginary vertical line in the graph at x = 0.1 and count the words of the counselor mails to the left and to the right of the line in two dictionaries: textSmallerThan01 and textLargerThan01. The position of the line is set by THRESHOLD in the code below.

In [None]:
# plot distribution of article lengths in words
# add ot, newspaper
# collect counselor texts of pca1 > 0.1, pca1 < 0.1; compute t-scores

# boundary on the first PCA coordinate (x) used below to split the mails in two groups;
# note that this rebinds the THRESHOLD = 5 that was set in the plotting cell
THRESHOLD = 0.0

def getTokens(text,ngramMin=None,ngramMax=None):
    """Return the n-grams of a text, split on white space.

    Parameters:
        text: string that is split in tokens on white space
        ngramMin: smallest n-gram size (default: NGRAMMIN)
        ngramMax: largest n-gram size (default: NGRAMMAX)

    Returns:
        list with all n-grams of the smallest size in text order, followed by
        those of the next size and so on; the tokens in an n-gram are joined
        by a single space
    """
    if ngramMin is None: ngramMin = NGRAMMIN
    if ngramMax is None: ngramMax = NGRAMMAX
    unigrams = text.split()
    ngrams = []
    # n-grams contain at least one token: sizes below 1 are skipped
    for ngramSize in range(max(ngramMin,1),ngramMax+1):
        # the range is empty when the text has fewer than ngramSize tokens
        for start in range(0,len(unigrams)-ngramSize+1):
            ngrams.append(" ".join(unigrams[start:start+ngramSize]))
    return(ngrams)

# token counts, total number of tokens and number of mails for the mails
# to the right (Larger) and to the left (Smaller) of the line x = THRESHOLD
textLargerThan01 = {}
textSmallerThan01 = {}
totalLarger = 0
totalSmaller = 0
nbrOfMailsLarger = 0
nbrOfMailsSmaller = 0
for mailNbr in range(0,len(newSpace)):
    # compared here: counselors vs clients (threshold 0.0); other senders are skipped
    # alternative: counselors only, top-left vs bottom-right (threshold 0.15)
    # alternative: tactus counselors vs clients, split on x-y > THRESHOLD (threshold -0.2)
    if senders[mailNbr] != "COUNSELOR" and senders[mailNbr] != "CLIENT": continue
    x = newSpace[mailNbr][0]
    tokens = getTokens(mails[mailNbr])
    if x > THRESHOLD:
        nbrOfMailsLarger += 1
        totalLarger += len(tokens)
        for ngram in tokens:
            textLargerThan01[ngram] = textLargerThan01.get(ngram,0)+1
    else:
        nbrOfMailsSmaller += 1
        totalSmaller += len(tokens)
        for ngram in tokens:
            textSmallerThan01[ngram] = textSmallerThan01.get(ngram,0)+1

Next we determine which words appear more often in the top-left group and which words appear more often in the bottom-right group. For this purpose we use the t-score, a statistical measure for comparing frequencies in two data sets.

In [None]:
import math

# t-score per token: difference of the relative frequencies in the two groups
# divided by the square root of the sum of (relative frequency / group size)
tscores = {}
for token,countLarger in textLargerThan01.items():
    countSmaller = textSmallerThan01.get(token)
    # only tokens that occur in both groups get a score
    if countSmaller is None: continue
    relFreqLarger = countLarger/totalLarger
    relFreqSmaller = countSmaller/totalSmaller
    errorLarger = relFreqLarger/totalLarger
    errorSmaller = relFreqSmaller/totalSmaller
    difference = relFreqLarger-relFreqSmaller
    tscores[token] = difference/math.sqrt(errorLarger+errorSmaller)

Finally, we display the top ten most specific words of each group:

In [None]:
import operator
import re

N = 10
# tokens ordered from lowest to highest t-score
sortedTscores = sorted(tscores.items(), key=operator.itemgetter(1))
# only tokens that consist of letters and spaces are shown
wordPattern = re.compile(r"^[a-zA-Z ]+$")

def showTopTokens(scoredTokens):
    """Print a numbered list of the first N word-like tokens of scoredTokens."""
    shown = 0
    for token,_ in scoredTokens:
        if wordPattern.match(token):
            shown += 1
            print(str(shown)+". "+token)
        if shown >= N: break

# lowest t-scores: tokens that are relatively more frequent left of the line
print("GROUP LEFT OF LINE ("+str(nbrOfMailsSmaller)+")")
showTopTokens(sortedTscores)

# highest t-scores: tokens that are relatively more frequent right of the line
print("\nGROUP RIGHT OF LINE ("+str(nbrOfMailsLarger)+")")
showTopTokens(reversed(sortedTscores))

The word frequency lists display an important style difference between the two mail groups: mails in the top-left group use a more formal style (*u*, *uw* and *U*) while the mails in the bottom-right group are written in a less formal style (*je*, *Je*, *jou*, *jezelf*, *jij* and *jouw*).

Next:

* explore linking mails of the same session
* explore topic classification with nmf