# Visualizing English Tactus Data

The first code block defines the functions that read the data.

In [None]:
import csv

# Value of the Sender column that marks a mail as written by the client.
CLIENT = "CLIENT"
# Names of the CSV columns the code refers to.
DATE = "DATE"                # compared with the list of client mail dates in readData
NBROFSENTS = "NBROFSENTS"    # removed from every row by removeMetaData
NBROFTOKENS = "NBROFTOKENS"  # total number of tokens of a mail; used as denominator
SENDER = "Sender"            # note: mixed case, unlike the other column names

def removeMetaData(row):
    """Drop the metadata columns from a row and return the same row.

    The DATE, NBROFSENTS and Sender entries are removed in place when they
    are present; all other entries are left untouched.

    Args:
        row: dict mapping column name to value.

    Returns:
        The row object that was passed in, after the removal.
    """
    for metaColumn in (DATE, NBROFSENTS, SENDER):
        # pop with a default is a no-op for columns the row does not have
        row.pop(metaColumn, None)
    return row

def readData(inFileName):
    """Read the rows of a CSV file that belong to mails written by the client.

    A row is kept when its DATE value occurs in the module-level list
    clientDatesList1, or when its Sender value equals CLIENT. The metadata
    columns of every kept row are stripped by removeMetaData.

    NOTE(review): clientDatesList1 is a global that is not defined in this
    cell; it has to exist before the first call for any file that has a DATE
    column, otherwise the lookup raises NameError.

    Args:
        inFileName: path of a comma-separated file whose first line holds
            the column names.

    Returns:
        A list with one dict per kept row, mapping column name to the string
        read from the file.
    """
    data = []
    # The with-block closes the file even when processing a row raises.
    # newline="" leaves line-ending handling to the csv module, which is how
    # the csv documentation asks files to be opened.
    with open(inFileName, "r", newline="") as inFile:
        for row in csv.DictReader(inFile, delimiter=","):
            if (DATE in row and row[DATE] in clientDatesList1) or \
               (SENDER in row and row[SENDER] == CLIENT):
                data.append(removeMetaData(row))
    return data

The second code block holds the function that selects the data from the fields we want to visualize. Each field data item is a list of numbers: how often each type of word was seen in each of the mails. Since one mail can be longer than another, we use relative frequencies in the data visualization: we divide each number by the total number of words of the mail (NBROFTOKENS).

In [None]:
import sys

# Name of the column with the total number of tokens of a mail. Repeats the
# definition from the first code cell, presumably so that this cell can be
# read on its own.
NBROFTOKENS = "NBROFTOKENS"

def selectData(data,fieldNameList):
    """Extract the relative frequencies of the requested fields.

    Args:
        data: list of row dicts, one per mail; every value must be
            convertible with float().
        fieldNameList: names of the fields to extract.

    Returns:
        A list with one list per requested field, in the order of
        fieldNameList; each inner list holds, per mail, the field value
        divided by the NBROFTOKENS value of that mail.

    Raises:
        SystemExit: when a field name does not occur in the first row.
    """
    selected = []
    for name in fieldNameList:
        # only the first row is consulted to validate the field name
        if name not in data[0]:
            sys.exit("unknown field name: "+name)
        fractions = [float(row[name])/float(row[NBROFTOKENS]) for row in data]
        selected.append(fractions)
    return selected

The data will be visualized as a stacked bar plot by the three functions in the third code block. The y-values shown in the plot are fractions: 0.01 corresponds to 1%. The data visualization is automatically saved in the file tactus.png. You can use this image file for presentations.

In [None]:
import matplotlib.pyplot as plt

# Figure size handed to plt.figure as figsize=(PLOTWIDTH,PLOTHEIGHT).
PLOTWIDTH = 15
PLOTHEIGHT = 4
# Width of a single bar, handed to plt.bar; the bars are drawn at x positions 0, 1, 2, ...
BARWIDTH = 1.0
# File the plot is saved to by makePlot.
IMAGEFILE = "tactus.png"

def makeBottomValues(fieldDataList,index):
    """Compute where the bars of series number index start in a stacked plot.

    Args:
        fieldDataList: list of number lists, one list per series.
        index: position of the series the bottom values are computed for.

    Returns:
        A list as long as the longest series in fieldDataList. Position j
        holds the sum of the j-th values of all series before index;
        positions to which no series contributed remain 0.
    """
    bottoms = []
    for seriesNr, series in enumerate(fieldDataList):
        # grow the result so that it covers every position of this series
        missing = len(series) - len(bottoms)
        if missing > 0:
            bottoms.extend([0] * missing)
        # only the series stacked below the requested one contribute
        if seriesNr < index:
            for position, value in enumerate(series):
                bottoms[position] += value
    return bottoms

def makePlot(fieldDataList,fieldNames):
    """Draw the series as a stacked bar plot, save it and show it.

    Args:
        fieldDataList: list of number lists, one per series. The x positions
            are derived from the length of the first series, so the series
            are presumably expected to be equally long.
        fieldNames: legend labels, one per series, in the same order.
    """
    plt.figure(figsize=(PLOTWIDTH,PLOTHEIGHT))
    positions = range(len(fieldDataList[0]))
    containers = []
    for seriesNr, series in enumerate(fieldDataList):
        # every series starts where the sum of the previous series ends
        stackBase = makeBottomValues(fieldDataList,seriesNr)
        containers.append(plt.bar(positions,series,width=BARWIDTH,bottom=stackBase))
    # the first bar of each series serves as its legend handle
    legendHandles = tuple([container[0] for container in containers])
    plt.legend(legendHandles,tuple(fieldNames))
    # label the bars 1, 2, 3, ... instead of 0, 1, 2, ...
    plt.xticks(positions,[position+1 for position in positions])
    # the figure is written to IMAGEFILE before it is displayed
    plt.savefig(IMAGEFILE)
    plt.show()
    
def visualize(inFileName,fieldNameList):
    """Read a data file and plot the requested fields as stacked bars.

    Args:
        inFileName: path of the CSV file, passed to readData.
        fieldNameList: names of the fields to plot; also used as the legend
            labels.
    """
    clientRows = readData(inFileName)
    makePlot(selectData(clientRows,fieldNameList),fieldNameList)

The function summarize presents a list of feature names together with their frequencies, so that we can see which features are interesting in a certain file. With summarizeMail, we obtain the frequencies of the features for a single mail. And summarizeFeature provides the frequencies of a single feature per mail.

In [None]:
import operator

# Kinds of summary that printSummary can be asked to print via its type argument.
DATA = "DATA"        # whole file: absolute counts plus a percentage of all tokens
FEATURE = "FEATURE"  # one feature per mail: values are printed as percentages
MAIL = "MAIL"        # one mail: values are printed as percentages

def summarizeDataFeature(data,featureName):
    """Return the relative frequency of one feature for every mail.

    Args:
        data: list of row dicts, one per mail.
        featureName: name of the feature to look up.

    Returns:
        A dict mapping the 1-based position of a mail in data to the feature
        value divided by the NBROFTOKENS value of that mail. Mails whose row
        lacks the feature are left out.
    """
    fractions = {}
    for mailNr, row in enumerate(data, start=1):
        if featureName in row:
            fractions[mailNr] = float(row[featureName])/float(row[NBROFTOKENS])
    return fractions

def summarizeDataMail(data,mailId):
    """Return the relative frequency of every numeric feature of one mail.

    Args:
        data: list of row dicts, one per mail.
        mailId: zero-based position of the mail in data.

    Returns:
        A dict mapping feature name to the feature value divided by the
        NBROFTOKENS value of the mail. Only fields whose value is a string
        of digits are included. The dict is empty when mailId is outside
        the range of data.
    """
    if not 0 <= mailId < len(data):
        return {}
    row = data[mailId]
    summary = {}
    for featureName, value in row.items():
        # csv.DictReader stores None for cells missing from a short row and
        # a list for surplus cells; such values are skipped rather than
        # crashing on .isdigit()
        if isinstance(value, str) and value.isdigit():
            # the keys of a single row are unique, so plain assignment suffices
            summary[featureName] = float(value)/float(row[NBROFTOKENS])
    return summary

def summarizeData(data):
    """Add up every numeric feature over all mails.

    Args:
        data: list of row dicts, one per mail.

    Returns:
        A dict mapping feature name to the integer sum of its values over
        all rows. Only values that are strings of digits are counted, so
        decimal, negative and non-numeric values are ignored.
    """
    summary = {}
    for row in data:
        for featureName, value in row.items():
            # csv.DictReader stores None for cells missing from a short row
            # and a list for surplus cells; such values are skipped rather
            # than crashing on .isdigit()
            if isinstance(value, str) and value.isdigit():
                summary[featureName] = summary.get(featureName, 0) + int(value)
    return summary

def printSummary(summary,type=DATA):
    """Print a summary, largest value first.

    The NBROFTOKENS entry itself is never printed.

    Args:
        summary: dict mapping a name to a number.
        type: DATA prints each value as an absolute count followed by its
            percentage of summary[NBROFTOKENS]; any other value prints each
            value as a percentage (the value multiplied by 100).
    """
    ranked = sorted(summary.items(), key=operator.itemgetter(1),reverse=True)
    for featureName, frequency in ranked:
        if featureName == NBROFTOKENS:
            continue
        if type == DATA:
            share = 100.0*float(frequency)/float(summary[NBROFTOKENS])
            print("%5d %s (%0.2f%%)" % (frequency,featureName,share))
        else:
            print("%5.2f%% %s" % (100.0*frequency,featureName))
def summarizeFeature(inFileName,featureName):
    """Print, for every mail in a file, the relative frequency of one feature.

    Args:
        inFileName: path of the CSV file, passed to readData.
        featureName: name of the feature to report.
    """
    clientRows = readData(inFileName)
    printSummary(summarizeDataFeature(clientRows,featureName),FEATURE)

def summarizeMail(inFileName,mailId):
    """Print the relative frequencies of the features of a single mail.

    Args:
        inFileName: path of the CSV file, passed to readData.
        mailId: zero-based position of the mail among the rows returned by
            readData. Note that summarizeDataFeature numbers the mails from
            1, so mail number k in that output is mailId k-1 here.
    """
    clientRows = readData(inFileName)
    printSummary(summarizeDataMail(clientRows,mailId),MAIL)

def summarize(inFileName):
    """Print the total count of every numeric feature in a file.

    Args:
        inFileName: path of the CSV file, passed to readData.
    """
    clientRows = readData(inFileName)
    printSummary(summarizeData(clientRows))

Next, the name of the input file is defined. Furthermore, the dates and times of mails written by clients are specified in a list. Then we call the visualization function with as arguments the name of the input file and a list of fields that we want to visualize.

In [None]:
# First input file.
INFILENAME1 = "1.txt"
# DATE values of the mails written by the client. readData reads this list as
# a global, so it has to be defined before any of the functions is called.
clientDatesList1 = ["2010-05-06T12:52:39","2010-05-10T18:21:37","2010-05-31T23:58:56"]

# Stacked bar plot of five fields for the client mails of the first file.
visualize(INFILENAME1,["relativ","bio","social","cogproc","family"])

In [None]:
# Second input file; the same five fields are plotted.
# NOTE(review): readData still matches DATE values against clientDatesList1,
# which holds the dates of the first file; presumably the client rows of this
# file are recognised through the Sender column instead - confirm.
INFILENAME2 = "AdB0016.txt"
visualize(INFILENAME2,["relativ","bio","social","cogproc","family"])

In [None]:
# Total counts of all numeric features in the first file.
summarize(INFILENAME1)

In [None]:
# Feature percentages of one mail of the second file. The mail id is a
# zero-based index, so 30 selects the 31st client mail.
summarizeMail(INFILENAME2,30)

In [None]:
# Percentage of the feature "function" in each client mail of the first file.
summarizeFeature(INFILENAME1,"function")