# Correlations between LIWC variables and demographics

First, read the average LIWC scores for the first four emails of each client.

In [1]:
import math
import pandas as pd
import random

In [2]:
# Location of the CSV file with the average LIWC scores.
# NOTE(review): absolute, user-specific path; adjust it when running elsewhere.
LIWCDATADIR = "/home/erikt/projects/e-mental-health/enron/orange-hackathon/orangehackathon/jupyter/"
LIWCDATAFILE = "liwc-average-first4.csv"
# Column of the LIWC file that is used as the row index of liwcData.
FILE = "file"
# Name of the dropout column in the LIWC data.
DROPOUT = "dropout"
# Column names of the table that summarises the sampled correlations.
AVERAGE = "average"
STD = "std"
# Name of the column that is added for the alcohol consumption values.
CONSUMPTION = "consumption"

In [3]:
# Load the LIWC scores; the FILE column becomes the row index.
liwcData = pd.read_csv(LIWCDATADIR+LIWCDATAFILE,index_col=FILE)

The dropout column contains string values. The correlation function can only process numeric values so we convert these to numbers and nans.

In [4]:
# Convert the dropout column to numbers in one vectorised step: "?" becomes
# NaN, every other value is parsed as a number (a non-numeric value raises).
# This avoids integer-position lookups such as liwcData[DROPOUT][i] on a
# label-indexed Series, and the cell can be re-run safely after the column
# has already been converted.
dropoutColumn = pd.to_numeric(
    liwcData[DROPOUT].mask(liwcData[DROPOUT] == "?")
).tolist()
liwcData[DROPOUT] = dropoutColumn

In [5]:
# Display the pairwise correlation matrix of the liwcData columns.
liwcData.corr()

Unnamed: 0,nbr of mails,Number of matches,number count,1 function,2 pronoun,3 ppron,4 i,5 we,6 you,7 shehe,...,114 relig,115 death,120 informal,121 swear,122 netspeak,123 assent,124 nonflu,125 filler,counselor,dropout
nbr of mails,1.000000,-0.038908,-0.077817,-0.039687,-0.065775,-0.045838,-0.079301,0.065388,-0.017152,0.137740,...,0.014353,0.043794,-0.077138,0.046553,-0.070551,-0.006874,-0.016229,-0.027966,0.037037,0.634160
Number of matches,-0.038908,1.000000,0.155722,0.157823,0.069183,0.007560,0.014414,0.021191,-0.066942,0.073696,...,-0.010168,0.099984,-0.065410,0.038559,-0.056007,-0.011529,-0.004032,-0.041084,-0.001356,-0.070590
number count,-0.077817,0.155722,1.000000,-0.084171,-0.094449,-0.107165,-0.086465,-0.012167,-0.067222,-0.007501,...,-0.009737,0.014656,0.030830,-0.008760,-0.005779,-0.021025,0.293477,0.022314,-0.016404,-0.128907
1 function,-0.039687,0.157823,-0.084171,1.000000,0.725999,0.583265,0.547843,-0.016452,0.133523,0.117791,...,0.052824,0.049706,0.012146,0.045610,0.051114,0.007199,-0.046685,-0.032291,0.037468,0.028048
2 pronoun,-0.065775,0.069183,-0.094449,0.725999,1.000000,0.808211,0.733666,0.006047,0.243020,0.140903,...,0.003923,0.057102,0.080025,0.016904,0.037001,0.020211,-0.103014,0.047967,0.006480,-0.005466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123 assent,-0.006874,-0.011529,-0.021025,0.007199,0.020211,-0.098302,-0.104944,-0.028745,-0.002761,0.021246,...,-0.008882,-0.016428,0.287656,0.005872,0.112932,1.000000,0.001600,0.304247,0.017209,0.038974
124 nonflu,-0.016229,-0.004032,0.293477,-0.046685,-0.103014,-0.085515,-0.070686,-0.013980,-0.031779,-0.018855,...,-0.014029,-0.004042,0.205179,-0.004271,0.090520,0.001600,1.000000,0.086578,0.066770,-0.007480
125 filler,-0.027966,-0.041084,0.022314,-0.032291,0.047967,-0.037118,-0.045261,-0.011709,0.028713,-0.025394,...,0.028821,-0.009632,0.880129,-0.002898,0.037359,0.304247,0.086578,1.000000,0.050486,-0.010867
counselor,0.037037,-0.001356,-0.016404,0.037468,0.006480,-0.011131,-0.006704,-0.036912,0.006171,0.000279,...,-0.026593,-0.020976,0.002027,0.022372,-0.057242,0.017209,0.066770,0.050486,1.000000,-0.030545


## Read age and gender

Next, we extract the gender and age values of the client from the intake forms in the Tactus xml data.

In [6]:
import gzip
import xml.etree.ElementTree as ET
from IPython.display import clear_output

In [7]:
# ID values of the intake answers holding age and gender (Dutch field names);
# the same strings are used as column names when the values are added to liwcData.
AGE = "leeftijd"
GENDER = "geslacht"
# Name of the XML attribute that identifies an answer; also used as the index
# column name of the CSV files read and written below.
ID = "ID"
# Directory with the gzipped client XML files.
# NOTE(review): absolute, user-specific path; adjust it when running elsewhere.
METADATADIR = "/home/erikt/projects/e-mental-health/usb/tmp/20190917/"
# ElementTree path selecting the answer elements of the intake questionnaire.
QUESTION = "./Intake/Questionnaire/Content/question/answer"
# Numeric coding of the gender answers so they can be used in correlations.
GENDERDICT = {"Vrouw":0,"Man":1}

In [8]:
# Collect gender and age per client from the gzipped intake XML files.
# metaData maps a client id (file name without extensions, e.g. "AdB0001")
# to a dict that holds the GENDER and/or AGE keys, depending on which of
# the two answers are present in the intake form.
metaData = {}
for client in range(1,1988):
    # Progress indicator: only the number of the current client stays visible.
    clear_output(wait=True)
    print(client)
    inFileName = "AdB"+str(client).zfill(4)+".xml.gz"
    try:
        # The context manager closes the file even when reading fails.
        with gzip.open(METADATADIR+inFileName,"rb") as inFile:
            text = inFile.read()
    except (OSError, EOFError):
        # Missing or unreadable file for this client number: skip it.
        # Only I/O failures are caught, so a keyboard interrupt or a
        # programming error is no longer silently swallowed.
        continue
    clientId = inFileName.split(".")[0]
    root = ET.fromstring(text)
    for answer in root.findall(QUESTION):
        questionId = answer.attrib.get(ID)
        if questionId == GENDER:
            # If there are several child elements the last one wins.
            for value in answer:
                metaData.setdefault(clientId, {})[GENDER] = GENDERDICT[value.text.strip()]
        elif questionId == AGE:
            for value in answer:
                # Only the first whitespace-separated token is parsed as the age.
                metaData.setdefault(clientId, {})[AGE] = int(value.text.strip().split()[0])

1987


The age and gender numbers are added to the LIWC data structure in order to compute the correlation scores with a single command.

In [9]:
# Build age and gender lists aligned with the rows of liwcData. A client
# without intake metadata, or whose intake contains only one of the two
# answers, gets NaN for the missing value instead of raising a KeyError.
age = []
gender = []
for clientId in liwcData.index:
    clientMeta = metaData.get(clientId, {})
    age.append(clientMeta.get(AGE, math.nan))
    gender.append(clientMeta.get(GENDER, math.nan))

In [10]:
# Add the age and gender lists (one entry per row of liwcData) as new columns.
liwcData[AGE] = age
liwcData[GENDER] = gender

## Read weekly alcohol consumption

In [11]:
# NOTE(review): relative path, resolved against the notebook's working directory.
CONSUMPTIONFILE = "consumption-week.csv"

# Load the consumption table; the ID column becomes the row index.
consumption = pd.read_csv(CONSUMPTIONFILE,index_col=ID)

In [12]:
# One value per row of liwcData: the maximum over the first seven entries of
# the client's row in the consumption table, or NaN when the client has no
# row there. (Presumably each client id occurs once in consumption -- verify.)
consumptionColumn = [
    consumption.loc[clientId][0:7].max() if clientId in consumption.index else math.nan
    for clientId in liwcData.index
]

In [13]:
# Add the consumption values (one entry per row of liwcData) as a new column.
liwcData[CONSUMPTION] = consumptionColumn

In [14]:
# Optional filter, currently disabled: uncomment the next two lines to drop
# the clients whose consumption value is 0 before computing the correlations.
#teetotalers = [liwcData.index[i] for i in range(0,len(liwcData)) if liwcData.iloc[i][CONSUMPTION] == 0]
#liwcData = liwcData.drop(teetotalers)
# Number of clients (rows) in the data set.
len(liwcData)

1125

## Save data

In [15]:
# Store the combined table in the working directory; the index column is labelled ID.
liwcData.to_csv("liwc-average-first4-age-gender-consumption.csv",index_label=ID)

## Compute correlations

We are interested in the correlation scores of the average LIWC features with three other columns in the data: AGE, GENDER and DROPOUT. We inspect the 10 highest scores and the 10 lowest scores. We compute the correlation scores for 100 random samples of the raw data (each drawn with replacement and of the same size as the data) in order to be able to compute averages and standard deviations of the scores.

In [16]:
# Column whose correlations with all other columns are examined.
FIELD = DROPOUT
# Number of bootstrap replicates.
REPEAT = 100
# Base seed: replicate i uses SEED+i, so the replicates differ from each
# other while a re-run of the notebook reproduces exactly the same numbers.
SEED = 42

samples = []
for i in range(0,REPEAT):
    # Resample the rows with replacement, keeping the original sample size.
    resampled = liwcData.sample(frac=1.0,replace=True,random_state=SEED+i)
    # Only the FIELD column of the correlation matrix is needed; sorting it
    # here is unnecessary because the summary table is sorted afterwards.
    samples.append(resampled.corr()[FIELD])
# One row per replicate, one column per feature.
samples = pd.DataFrame(samples)

In [17]:
# Summarise the replicates: one row per feature with the mean and the
# standard deviation of its correlation scores over all samples.
features = pd.DataFrame({AVERAGE: samples.mean(), STD: samples.std()})

In [18]:
# The ten features with the highest average correlation.
features.sort_values(by=AVERAGE, ascending=False).head(10)

Unnamed: 0,average,std
dropout,1.0,0.0
nbr of mails,0.634435,0.024254
60 percept,0.137006,0.033145
leeftijd,0.136508,0.038411
63 feel,0.102977,0.03465
44 male,0.088402,0.038978
7 shehe,0.079719,0.034501
10 article,0.074236,0.03613
115 death,0.067783,0.036722
71 body,0.065173,0.036181


In [19]:
# The ten features with the lowest average correlation.
features.sort_values(by=AVERAGE, ascending=False).tail(10)

Unnamed: 0,average,std
103 time,-0.045043,0.03574
52 cause,-0.045627,0.030735
91 focuspresent,-0.045765,0.040359
42 friend,-0.048354,0.026022
101 motion,-0.05505,0.031644
consumption,-0.059266,0.037824
Number of matches,-0.066078,0.027527
110 work,-0.078168,0.032665
geslacht,-0.107854,0.037879
number count,-0.130549,0.029869


In [20]:
# For comparison: the ten highest correlations with FIELD in the full data set.
liwcData.corr()[FIELD].sort_values(ascending=False).head(10)

dropout         1.000000
nbr of mails    0.634160
leeftijd        0.138881
60 percept      0.133184
63 feel         0.098362
44 male         0.088933
7 shehe         0.077820
10 article      0.073364
115 death       0.069565
90 focuspast    0.065517
Name: dropout, dtype: float64

In [21]:
# For comparison: the ten lowest correlations with FIELD in the full data set.
liwcData.corr()[FIELD].sort_values(ascending=False).tail(10)

91 focuspresent     -0.045395
92 focusfuture      -0.045575
35 sad              -0.046286
42 friend           -0.048868
consumption         -0.055530
101 motion          -0.055952
Number of matches   -0.070590
110 work            -0.079063
geslacht            -0.113842
number count        -0.128907
Name: dropout, dtype: float64

We find that only a handful of LIWC features have a correlation larger than 0.10 with one of these three other features. The correlation of only one LIWC feature (1 function) is higher than 0.20 (0.22 with younger ages).