# OVK-LIWC correlation

Test the correlation of the columns in the file ovk-liwc-wide.csv

In [None]:
import csv
import re
import sys

# Name of the CSV file that readFile() loads into the global table below
INFILE = "ovk-liwc-wide.csv"

def readFile(inFileName):
    """Read a comma-separated file with a header line into a list of rows.

    Args:
        inFileName: path of the CSV file to read.

    Returns:
        A list with one dictionary per data line, mapping each column
        name from the header line to the cell value (a string).

    Raises:
        SystemExit: if the file cannot be opened or parsed; the exit
            message contains the file name and the original error.
    """
    try:
        # The with-statement closes the file even when parsing fails;
        # newline="" lets the csv module handle line endings itself.
        with open(inFileName,"r",newline="") as inFile:
            return(list(csv.DictReader(inFile,delimiter=",")))
    except Exception as e:
        sys.exit("error processing file "+inFileName+": "+str(e))

# Global list of rows (one dictionary per CSV line) used by all cells below
table = readFile(INFILE)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

def plot(x,y,title="",fileName="ovk-liwc-correlation.png"):
    """Show a scatter plot of y against x and save it as an image file.

    Args:
        x: sequence of horizontal coordinates.
        y: sequence of vertical coordinates, same length as x.
        title: text shown above the plot.
        fileName: image file to write; the default is overwritten by
            every call that does not pass its own name.
    """
    fig = plt.figure()
    plt.scatter(x,y)
    plt.title(title)
    # Save before show(): depending on the backend the figure can be
    # released by show(), and a savefig() afterwards writes an empty image.
    fig.savefig(fileName)
    plt.show()

First, determine the correlation between the change in CESD score and the change in MHC score (T1 minus T0). The expectation is that this would be negative, since CESD measures depression (the higher, the more depression signals) and MHC measures mental health (the higher, the better). The expectation proves to be correct. The correlation factor is -0.34 and the negative correlation is identifiable from the plot.

In [None]:
import numpy as np

# Column names: each measure has a score at T0 and a score at T1
CESD = "cesd"
MHC = "mhc"
CESDT0, CESDT1 = CESD+"T0", CESD+"T1"
MHCT0, MHCT1 = MHC+"T0", MHC+"T1"

# Score change (T1 minus T0) per client; clients without a T1 score for
# either measure are left out of both lists so the lists stay aligned
cesd = [float(row[CESDT1])-float(row[CESDT0])
        for row in table if "NA" not in (row[CESDT1],row[MHCT1])]
mhc = [float(row[MHCT1])-float(row[MHCT0])
       for row in table if "NA" not in (row[CESDT1],row[MHCT1])]

# Off-diagonal element of the 2x2 correlation matrix
np.corrcoef(cesd,mhc)[0,1]

In [None]:
# Scatter plot of the CESD change (x axis) against the MHC change (y axis)
plot(cesd,mhc,"change in CESD (x) vs MHC (y); "+str(len(cesd))+" clients")

Next, we determine the correlation between the changes in the CESD and MHC scores and each of the other features. We discard the features from T1 because we are interested in determining characteristics of patients from T0 data. We find that three features are weakly correlated with both CESD change and MHC change: avgWordLenCliT0, sexualT0 and assentT0. The absolute correlation factors of these features are about 0.2.

In [None]:
# Smallest absolute correlation that computeCorrelations() reports
MINCOR = 0.15

def numeric(table,key):
    """Tell whether every non-missing value of a column is a number.

    Args:
        table: list of row dictionaries.
        key: name of the column to check.

    Returns:
        True if float() accepts every value of the column other than
        "NA" (also when the table is empty), False otherwise.
    """
    for row in table:
        if row[key] != "NA":
            # Catch only conversion failures, so that an interrupt or an
            # exit request is not mistaken for a non-numeric value
            try: float(row[key])
            except (TypeError,ValueError): return(False)
    return(True)

def computeCorrelations(targetName):
    """Print the columns that correlate with the change of a measure.

    The change is the value of column targetName+"T1" minus the value of
    column targetName+"T0". It is correlated with every numeric column
    of the global table and with every non-numeric column that holds
    exactly two distinct values. Columns with an absolute correlation of
    at least MINCOR are printed, highest correlation first; columns whose
    name ends in "T1" are not printed.

    Side effect: the two-valued non-numeric columns are overwritten in
    the global table with the numbers 0 and 1.

    Args:
        targetName: measure name without the "T0"/"T1" suffix.
    """
    targetT0 = targetName+"T0"
    targetT1 = targetName+"T1"
    correlations = {}

    def correlate(key):
        # Use only the rows with a T1 score and a value for this column
        rows = [row for row in table if row[targetT1] != "NA" and row[key] != "NA"]
        target = [float(row[targetT1])-float(row[targetT0]) for row in rows]
        y = [float(row[key]) for row in rows]
        # The correlation is undefined for a constant (or empty) column
        if len(set(y)) > 1: correlations[key] = np.corrcoef(target,y)[0][1]

    for key in table[0]:
        if numeric(table,key):
            correlate(key)
        else:
            # Sort the values so that the 0/1 assignment, and with it the
            # sign of the correlation, is the same in every run; the
            # iteration order of a set of strings is not.
            keyValues = sorted(set(row[key] for row in table),key=str)
            if len(keyValues) == 2:
                # NOTE(review): when "NA" is one of the two values it is
                # encoded as a category as well - confirm this is intended
                encoding = {keyValues[0]:0,keyValues[1]:1}
                for row in table: row[key] = encoding[row[key]]
                correlate(key)
    for key in sorted(correlations,key=correlations.__getitem__,reverse=True):
        if abs(correlations[key]) >= MINCOR and not re.search("T1$",key):
            print(correlations[key],key)
            
# Report the correlating columns for the CESD change and for the MHC change,
# with a separator line between the two lists
computeCorrelations("cesd")
print("\n***\n")
computeCorrelations("mhc")

In [None]:
def plotFeature(table,measureName,featureName,title="",sb=1):
    """Plot the change of a measure against a feature.

    The change is column measureName+"T1" minus column measureName+"T0".
    Rows with "NA" in the T1 column or in the feature column are left
    out. The plot title is built from the two names and the correlation
    rounded to two decimals.

    Args:
        table: list of row dictionaries.
        measureName: measure name without the "T0"/"T1" suffix.
        featureName: name of the feature column.
        title: accepted but not used.
        sb: accepted but not used.
    """
    before = measureName+"T0"
    after = measureName+"T1"
    usable = [row for row in table
              if not (row[after] == "NA" or row[featureName] == "NA")]
    measure = [float(row[after])-float(row[before]) for row in usable]
    feature = [float(row[featureName]) for row in usable]
    corr = np.corrcoef(measure,feature)[0,1]
    label = measureName+" vs "+featureName+" ("+str(round(corr,2))+")"
    plot(measure,feature,label)

# Plot the MHC change against the three features found above.
# NOTE(review): the title arguments say "CESD" while the measure passed is
# "mhc"; plotFeature() builds its own title from the measure and feature
# names, so the mismatch is not visible in the plots - confirm which of
# the two was intended.
plotFeature(table,"mhc","assentT0","CESD vs assentT0",1)
plotFeature(table,"mhc","sexualT0","CESD vs sexualT0",2)
plotFeature(table,"mhc","avgWordLenCliT0","CESD vs avgWordLenCliT0",3)

Next, we check the relation between counselors and the therapy result. For this purpose, we need to convert the counselor ids to numbers. There are several ways to do this (different orders). We do not know what works best so we check all possible orders and select the best result.

In [None]:
from itertools import permutations

COUNSELOR = "counselor"
NA = "NA"

counselorList = [x[COUNSELOR] for x in table]
counselors = sorted(set(counselorList))
# Score changes of the clients that have a T1 score for both measures
cesd = [float(x[CESDT1])-float(x[CESDT0]) for x in table if x[CESDT1] != NA and x[MHCT1] != NA ]
mhc = [float(x[MHCT1])-float(x[MHCT0]) for x in table if x[CESDT1] != NA and x[MHCT1] != NA ]
target = mhc

# Try every assignment of the numbers 0..n-1 to the n counselors and keep
# the one whose numbers correlate most strongly (in absolute value) with
# the target. The number of assignments is n factorial.
bestShuffle = []
bestCorr = 0
for shuffle in permutations(range(len(counselors))):
    shuffled = dict(zip(counselors,shuffle))
    counselorsShuffled = [shuffled[x[COUNSELOR]] for x in table if x[CESDT1] != NA and x[MHCT1] != NA ]
    corr = np.corrcoef(target,counselorsShuffled)[0][1]
    # Compare absolute values on both sides: bestCorr keeps its sign, and
    # comparing against a stored negative value would let every later
    # permutation replace the best one.
    if abs(corr) > abs(bestCorr):
        bestCorr = corr
        bestShuffle = shuffled
print(bestCorr,bestShuffle)

In [None]:
# Recompute the counselor numbers with the best assignment found in the
# previous cell, plot them against the target and display the correlation
counselorsShuffled = [bestShuffle[x[COUNSELOR]] for x in table if  x[CESDT1] != NA and x[MHCT1] != NA]
corr = np.corrcoef(target,counselorsShuffled)[0][1]
plot(target,counselorsShuffled)
corr

Even the most strongly correlated features are only weakly correlated (0.2). The small data size could be a cause of this. We could explore three directions for improving this:

1. Increase the size of the current dataset
2. Combine features
3. Look for other features

We compute the correlation factor of the two variables delta CESD and delta MHC with the set of features COUNSELOR, ASSENTT0, SEXUALT0 and AVGWORDLENCLIT0. For this purpose we use [multiple correlation](https://en.wikipedia.org/wiki/Multiple_correlation#Computation). The results are:

1. CESD: COU:11%, ASS:-26%, SEX:-25%, AVG:-20%, COMBI:43%
2. MHC: COU:29%, ASS:22%, SEX:20%, AVG:18%, COMBI:44%

In [None]:
import math
import pandas as pd

# Row key under which the number assigned to the counselor is stored
COUNSELORSHUFFLED = COUNSELOR+"shuffled"
# Column names of the three features combined with the counselor below
ASSENTT0 = "assentT0"
SEXUALT0 = "sexualT0"
AVGWORDLENCLIT0 = "avgWordLenCliT0"

def makeSubsetNonNA(table,columns):
    """Select the rows that have a value for every requested column.

    A row is dropped when one of the columns is absent from it, is None
    or equals NA.

    Args:
        table: list of row dictionaries.
        columns: names of the columns to keep.

    Returns:
        A list of new dictionaries that hold only the requested columns.
        All values are converted to float, except the COUNSELOR column,
        which is copied unchanged.
    """
    subset = []
    for row in table:
        # get() gives None for an absent column as well as for a None value
        values = [row.get(column) for column in columns]
        if any(value is None or value == NA for value in values):
            continue
        newRow = {x:float(row[x]) for x in columns if x != COUNSELOR}
        if COUNSELOR in columns: newRow[COUNSELOR] = row[COUNSELOR]
        subset.append(newRow)
    return(subset)

#len(shuffled)
# Keep only the clients that have a value for every score and feature used below
subsetNonNADict = makeSubsetNonNA(table,\
                  [CESDT0,CESDT1,MHCT0,MHCT1,ASSENTT0,SEXUALT0,AVGWORDLENCLIT0,COUNSELOR])
# measures: one [CESD change, MHC change] pair per client
# features: one [counselor number, assent, sexual, average word length] list per client
measures = []
features = []
for row in subsetNonNADict:
    # Score change between T0 and T1
    row[CESD] = row[CESDT1]-row[CESDT0]
    row[MHC] = row[MHCT1]-row[MHCT0]
    # Number assigned to the counselor by the best permutation found earlier
    row[COUNSELORSHUFFLED] = bestShuffle[row[COUNSELOR]]
    measures.append([row[CESD],row[MHC]])
    features.append([row[COUNSELORSHUFFLED],row[ASSENTT0],row[SEXUALT0],row[AVGWORDLENCLIT0]])

# https://en.wikipedia.org/wiki/Multiple_correlation#Computation
# Rxx: correlation matrix of the features among each other
Rxx = pd.DataFrame(np.array(features)).corr()
# c: correlation of each feature with the selected measure
c = []
# Positions of the two measures inside each entry of measures
TARGETCESD = 0
TARGETMHC = 1
# The MHC change is analysed here; use TARGETCESD to analyse the CESD change
selectedMeasure = [measures[i][TARGETMHC] for i in range(0,len(measures))]
for f in range(0,len(features[0])):
    c.append(np.corrcoef(selectedMeasure,[features[i][f] for i in range(0,len(features))])[0][1])
c = np.array(c)
# Multiple correlation: square root of c' * inverse(Rxx) * c
r = math.sqrt(np.transpose(c).dot(np.linalg.inv(Rxx).dot(c)))
# Display the combined correlation and the per-feature correlations
(r,c)