In [None]:
# read mails from files and store in array mails

import csv
import os
import re
import sys
import xml.etree.ElementTree as ET


# csv files that are read, in this order, into the mail lists
INFILENAMES = [
    "../usb/ovk/data/eriktks/AS/text/AS-mails.csv",
    "../usb/ovk/data/eriktks/ES/text/ES-mails.csv",
    "../usb/ovk/data/eriktks/othertexts/genesis.csv",
]
# field delimiter handed to the csv reader
SEPARATOR = ","

# values of the sender column that the plotting code compares against
CLIENT = "CLIENT"
GENESIS = "GENESIS"

# names of the csv columns that are read from each row
ID = "client-id"
COUNSELOR = "counselor"
SENDER = "sender"
TEXT = "text"
NBROFWORDS = "nbrOfWords"
NBROFCHARS = "nbrOfCharsInWords"
NBROFSENTS = "nbrOfSents"

# Parallel lists with one entry per csv row, in file order:
#   mails      - text of the mail
#   senders    - value of the sender column
#   nbrOfChars - character count column converted to int
#   counselors - value of the counselor column
#   ids        - comma-joined summary: id,counselor,words,chars,sents,sender
(counselors,ids,mails,nbrOfChars,senders) = ([],[],[],[],[])
for inFileName in INFILENAMES:
    # newline="" lets the csv module do its own newline handling
    try: inFile = open(inFileName,"r",newline="")
    except OSError: sys.exit("cannot read file "+inFileName)
    # the with block closes the file even when a bad row aborts the loop
    with inFile:
        csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)
        for row in csvReader:
            # extract and convert every field first, so that a bad row
            # cannot leave the parallel lists with different lengths
            try:
                text = row[TEXT]
                sender = row[SENDER]
                counselor = row[COUNSELOR]
                charCount = int(row[NBROFCHARS])
                rowId = SEPARATOR.join([row[ID],counselor,row[NBROFWORDS],row[NBROFCHARS],row[NBROFSENTS],sender])
            # KeyError: missing column; TypeError: short row (field is None);
            # ValueError: character count is not an integer
            except (KeyError,TypeError,ValueError):
                sys.exit("unexpected row in file "+inFileName+": "+str(row))
            mails.append(text)
            senders.append(sender)
            nbrOfChars.append(charCount)
            counselors.append(counselor)
            ids.append(rowId)

In [None]:
# create tfidf vectors for each mail text

from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary limits: max_df/min_df are given as fractions of the documents
# (per the scikit-learn documentation), max_features caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    min_df=0.2,
    max_features=200000,
    use_idf=True,
)
# learn the vocabulary from all mails and turn each mail into a tf-idf row
tfidf_matrix = tfidf_vectorizer.fit_transform(mails)

In [None]:
# convert tfidf vectors to principal components (n=4; only the first two are plotted)

from sklearn.decomposition import PCA

# PCA is fed a dense array here; convert the tf-idf matrix once and reuse
# the result instead of building the dense copy twice.
tfidf_dense = tfidf_matrix.toarray()

# four components are computed; newSpace holds one row per mail
pca = PCA(n_components=4)
pca.fit(tfidf_dense)
newSpace = pca.transform(tfidf_dense)

In [None]:
# draw graph of pca data: red: from client; blue: from counselor; green: from Genesis

%matplotlib notebook
import matplotlib.pyplot as plt
import random
import re

# Plot settings. RANDOMFACTOR scales the random jitter added to every
# coordinate; with 0.00 the points are drawn exactly at their PCA position.
EXPERIMENT = "Emails"
THRESHOLD = 5  # kept for compatibility; not used by the code in this cell
RANDOMFACTOR=0.00

# Per sender group: the document ids and the x/y coordinates (first and
# second principal component) of the mails in that group.
(idsCli,idsCouns,idsGen,xCli,xCouns,xGen,yCli,yCouns,yGen) = ([],[],[],[],[],[],[],[],[])
random.seed()
for i in range(0,len(newSpace)):
    # choose the target lists once, then append in one place;
    # every sender that is neither client nor Genesis counts as counselor
    if senders[i] == CLIENT: (xGroup,yGroup,idsGroup) = (xCli,yCli,idsCli)
    elif senders[i] == GENESIS: (xGroup,yGroup,idsGroup) = (xGen,yGen,idsGen)
    else: (xGroup,yGroup,idsGroup) = (xCouns,yCouns,idsCouns)
    xGroup.append(newSpace[i][0]+random.random()*RANDOMFACTOR)
    yGroup.append(newSpace[i][1]+random.random()*RANDOMFACTOR)
    idsGroup.append(ids[i])

fig = plt.figure(figsize=(9,5))
plt.gca().set_title(EXPERIMENT+": Click in the graph to see the relevant document ids")
# One scatter per group so that a pick event can tell the groups apart.
# The labels show the group size; the former trailing .format(THRESHOLD)
# only applied to the ")" literal and changed nothing, so it was dropped.
counsDots = plt.scatter(xCouns,yCouns,s=1,color="blue",picker=2,label="from counselor ("+str(len(idsCouns))+")")
cliDots = plt.scatter(xCli,yCli,s=1,color="red",picker=2,label="from client ("+str(len(idsCli))+")")
genDots = plt.scatter(xGen,yGen,s=1,color="green",picker=2,label="from Genesis ("+str(len(idsGen))+")")
unknownDots = -1  # placeholder; no scatter is created for unknown senders here
plt.legend(fontsize=8)

def pickScatter(event):
    """Show the ids of the picked documents in the title of the clicked plot.

    event: matplotlib pick event; event.artist is the scatter that was hit
    and event.ind holds the positions of the picked points in that scatter.
    An artist that is neither the client nor the Genesis scatter is treated
    as the counselor scatter.
    """
    # the picked artist decides which id list the indices refer to
    if event.artist == cliDots: groupIds = idsCli
    elif event.artist == genDots: groupIds = idsGen
    else: groupIds = idsCouns
    dataIds = [groupIds[index] for index in event.ind]
    # use the axes that own the picked artist rather than the "current" axes,
    # which may belong to another figure by the time the user clicks
    event.artist.axes.set_title(EXPERIMENT+str(dataIds),fontsize=12)
    # ask the canvas to repaint so that the new title becomes visible
    event.canvas.draw_idle()
# register pickScatter as the handler for pick events on the current figure
pickCanvas = plt.gcf().canvas
pickCanvas.mpl_connect("pick_event",pickScatter)