In [1]:
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.preprocessing import normalize
import numpy as np
import re

In [2]:
# Load the word vectors stored in GoogleNews-vectors-negative300.bin (expected in the
# working directory). binary=True tells gensim the file is in word2vec's C binary format.
wv_from_bin = KeyedVectors.load_word2vec_format(fname="GoogleNews-vectors-negative300.bin", binary=True)  # C binary format

# LIWC

In [3]:
# Read the LIWC table and transpose it so that every category becomes a column;
# reset_index() then moves the former column labels into an ordinary first column.
liwc = pd.read_csv('data/LIWC - full.csv', index_col=0).T.reset_index()
liwc.index.name = ''
# Relabel only that first column; every other column keeps its category name.
liwc.columns = ['Function', *liwc.columns[1:]]
liwc.head()

Unnamed: 0,Function,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Money,Relig,Death,Informal,Swear,Netspeak,Assent,Nonflu,Filler,Cognitive
,,,,,,,,,,,,,,,,,,,,,
0.0,a,another,he,i'd,let's,ily*,he,their*,another,a,...,account*,afterlife*,alive,(:,af,(:,absolutely,ah,anyway*,abnormal*
1.0,about,anybod*,he'd,i'd've,lets,thee,he'd,them,anybod*,an,...,accrue*,agnost*,assass*,(;,arse,(;,agree,ahh*,blah,could've
2.0,above,anymore,he's,i'll,our,thine,he's,themself,anymore,the,...,affordable,alla,autops*,):,arsehole*,):,ah,er,dunno,hazy
3.0,absolutely,anyone*,her,i'm,ours,thou,her,themselves,anyone*,,...,atm,allah*,behead*,/:,arses,/:,aight,hm*,idk,need'nt
4.0,abt,anything,hers,i've,ourselves,thoust,hers,they,anything,,...,atms,altar*,bereave*,4ev*,asf,4ev*,alright*,huh,idontknow,secretively


This cleaning procedure should generally help, but it will also introduce a little noise. For example, `i'd` becomes `id`, which is a different word.

In [4]:
def try_clean(cell):
    """Strip every non-alphabetic character from a dictionary cell.

    Parameters
    ----------
    cell : object
        A single cell of the LIWC table. Strings are cleaned; anything else
        (for example the NaN padding of short columns) is passed through.

    Returns
    -------
    object
        The cleaned string (possibly empty), ``np.nan`` for the
        ``'Unnamed: N'`` placeholder labels, or ``cell`` itself when it is
        not a string.
    """
    # Explicit type check instead of a bare ``except``: real errors are no
    # longer swallowed, and non-string cells are still returned untouched.
    if not isinstance(cell, str):
        return cell
    word = ''.join(char for char in cell if char.isalpha())
    # need this because Unnamed: 0 is in the Function column after read_csv
    if word == 'Unnamed':
        # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return word

# Clean every cell of the table, one column at a time (element-wise, like applymap).
liwc = liwc.apply(lambda column: column.map(try_clean))
liwc.head()

Unnamed: 0,Function,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Money,Relig,Death,Informal,Swear,Netspeak,Assent,Nonflu,Filler,Cognitive
,,,,,,,,,,,,,,,,,,,,,
0.0,a,another,he,id,lets,ily,he,their,another,a,...,account,afterlife,alive,,af,,absolutely,ah,anyway,abnormal
1.0,about,anybod,hed,idve,lets,thee,hed,them,anybod,an,...,accrue,agnost,assass,,arse,,agree,ahh,blah,couldve
2.0,above,anymore,hes,ill,our,thine,hes,themself,anymore,the,...,affordable,alla,autops,,arsehole,,ah,er,dunno,hazy
3.0,absolutely,anyone,her,im,ours,thou,her,themselves,anyone,,...,atm,allah,behead,,arses,,aight,hm,idk,neednt
4.0,abt,anything,hers,ive,ourselves,thoust,hers,they,anything,,...,atms,altar,bereave,ev,asf,ev,alright,huh,idontknow,secretively


In [5]:
# Look at the last rows of the cleaned table as a sanity check.
liwc.tail()

Unnamed: 0,Function,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Money,Relig,Death,Informal,Swear,Netspeak,Assent,Nonflu,Filler,Cognitive
,,,,,,,,,,,,,,,,,,,,,
1411.0,,,,,,,,,,,...,,,,,,,,,,
1412.0,,,,,,,,,,,...,,,,,,,,,,
1413.0,,,,,,,,,,,...,,,,,,,,,,
1414.0,,,,,,,,,,,...,,,,,,,,,,
1415.0,,,,,,,,,,,...,,,,,,,,,,


In [6]:
def _vector_or_nan(token):
    """Return the embedding stored for `token`, or NaN when the model has none."""
    if token in wv_from_bin:
        return wv_from_bin[token]
    return np.nan


# Swap every word in the table for its vector (element-wise, column by column).
vectors_in_cells = liwc.apply(lambda column: column.map(_vector_or_nan))
vectors_in_cells.head()

Unnamed: 0,Function,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Money,Relig,Death,Informal,Swear,Netspeak,Assent,Nonflu,Filler,Cognitive
,,,,,,,,,,,,,,,,,,,,,
0.0,,"[0.19433594, -0.01965332, 0.091796875, 0.10449...","[0.19238281, 0.12792969, -0.019165039, -0.0292...","[0.29492188, -0.12109375, 0.0006828308, 0.2207...","[0.19824219, -0.0026245117, 0.123046875, 0.075...","[0.012939453, -0.018310547, -0.026000977, 0.12...","[0.19238281, 0.12792969, -0.019165039, -0.0292...","[0.07324219, 0.19628906, 0.0005187988, 0.14355...","[0.19433594, -0.01965332, 0.091796875, 0.10449...",,...,"[-0.025634766, -0.0046081543, 0.030395508, -0....","[0.33398438, -0.13769531, -0.24414062, 0.19824...","[0.051757812, 0.018432617, 0.1484375, -0.03637...",,"[-0.107421875, -0.02709961, 0.107421875, 0.091...",,"[-0.043701172, -0.27929688, 0.09277344, -0.073...","[-0.1640625, 0.19726562, 0.19921875, 0.1884765...","[0.13085938, -0.07910156, 0.026000977, 0.22949...","[-0.17578125, 0.14746094, -0.015625, -0.018798..."
1.0,"[0.20214844, -0.08105469, 0.18359375, -0.13671...",,"[-0.13476562, 0.15820312, -0.28710938, -0.0756...",,"[0.19824219, -0.0026245117, 0.123046875, 0.075...","[0.07128906, 0.04638672, -0.11230469, 0.126953...","[-0.13476562, 0.15820312, -0.28710938, -0.0756...","[0.03491211, 0.08496094, 0.0625, 0.140625, -0....",,"[0.12597656, 0.19042969, 0.06982422, 0.0722656...",...,"[0.12402344, 0.07714844, -0.17773438, 0.324218...",,,,,,"[-0.03857422, 0.026245117, 0.19042969, 0.12109...","[0.071777344, 0.14160156, 0.15820312, 0.300781...","[-0.03491211, 0.20019531, 0.106933594, 0.14160...","[-0.052001953, -0.017944336, -0.010559082, 0.1..."
2.0,"[-0.12402344, -0.20605469, -0.08886719, 0.2656...","[0.15820312, -0.13183594, -0.088378906, 0.2373...","[0.0390625, 0.078125, -0.10644531, 0.15820312,...","[-0.0030975342, 0.07324219, -0.26757812, 0.119...","[-0.19824219, 0.17285156, 0.08544922, 0.371093...","[0.068847656, 0.034179688, -0.15332031, 0.3222...","[0.0390625, 0.078125, -0.10644531, 0.15820312,...","[0.28515625, -0.14355469, 0.14160156, 0.144531...","[0.15820312, -0.13183594, -0.088378906, 0.2373...","[0.080078125, 0.10498047, 0.049804688, 0.05346...",...,"[-0.041503906, -0.13183594, -0.19824219, 0.013...","[0.037109375, 0.06933594, 0.14453125, 0.209960...",,,"[0.25585938, -0.3359375, 0.18261719, -0.005645...",,"[-0.1640625, 0.19726562, 0.19921875, 0.1884765...","[0.044433594, 0.1875, -0.05493164, 0.07763672,...","[0.06933594, -0.15039062, 0.14941406, 0.332031...","[0.2734375, -0.042236328, 0.16503906, 0.074707..."
3.0,"[-0.043701172, -0.27929688, 0.09277344, -0.073...","[0.16796875, -0.26171875, 0.010620117, 0.14550...","[0.14648438, -0.013793945, -0.019165039, -0.04...","[-0.036621094, 0.014526367, 0.03515625, 0.2304...","[-0.14257812, 0.06298828, 0.07910156, 0.431640...","[0.22949219, 0.33984375, 0.23730469, 0.0751953...","[0.14648438, -0.013793945, -0.019165039, -0.04...","[0.16699219, 0.15917969, 0.05078125, 0.0498046...","[0.16796875, -0.26171875, 0.010620117, 0.14550...",,...,"[-0.31054688, 0.15429688, -0.25195312, 0.04809...","[0.22265625, 0.114746094, 0.2578125, 0.2832031...","[-0.16113281, -0.22363281, 0.39648438, 0.12695...",,,,"[-0.12988281, -0.055908203, 0.036621094, 0.205...","[-0.017211914, 0.01965332, 0.19042969, 0.42578...","[-0.23046875, -0.060546875, 0.05102539, 0.3828...",
4.0,"[-0.18261719, 0.1796875, 0.14160156, 0.1137695...","[0.07080078, -0.03491211, 0.06542969, 0.058837...","[-0.022460938, -0.22167969, 0.046875, 0.214843...","[-0.41210938, 0.18847656, -0.234375, 0.296875,...","[0.07910156, 0.24316406, 0.24121094, 0.3164062...",,"[-0.022460938, -0.22167969, 0.046875, 0.214843...","[0.064453125, 0.036132812, 0.03857422, 0.09472...","[0.07080078, -0.03491211, 0.06542969, 0.058837...",,...,,"[0.115234375, 0.2578125, -0.17675781, 0.240234...","[0.072265625, 0.040527344, -0.041503906, 0.102...","[-0.23242188, 0.08886719, 0.10839844, 0.095703...","[0.11279297, -0.08691406, 0.012634277, 0.11474...","[-0.23242188, 0.08886719, 0.10839844, 0.095703...","[0.140625, -0.026733398, 0.08105469, 0.125, -0...","[0.2734375, 0.14160156, 0.057373047, 0.4277343...",,"[-0.035888672, -0.17480469, -0.083984375, -0.0..."


In [7]:
# Collapse each category column into a single vector by averaging the word vectors in it.
# np.mean is applied once per column; the displayed result has one row per vector
# dimension and one column per LIWC category.
# NOTE(review): this depends on pandas skipping the NaN cells (words without a vector)
# when it averages an object-dtype column of arrays — confirm on the pandas version in use.
liwc_avg_vecs = vectors_in_cells.apply(np.mean)
liwc_avg_vecs

Unnamed: 0,Function,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Money,Relig,Death,Informal,Swear,Netspeak,Assent,Nonflu,Filler,Cognitive
0,0.039440,0.051351,0.048744,-0.014726,-0.046326,0.082492,0.124512,0.054244,0.053011,0.103027,...,0.064965,0.165077,0.127482,-0.005333,0.049465,-0.052683,0.024125,0.039159,-0.007782,0.043229
1,-0.006557,0.026285,0.038816,0.009051,0.037608,0.070349,0.017023,0.035245,0.004571,0.147705,...,0.011034,0.014858,0.153382,-0.007544,-0.075328,0.026749,0.013838,0.033910,-0.020386,0.025630
2,0.027752,0.021411,0.018129,0.015066,-0.022882,0.074504,-0.055748,0.032673,0.025068,0.059814,...,-0.044619,0.099840,0.137861,0.069204,0.099116,0.054139,0.060871,0.104145,0.057617,0.032536
3,0.148285,0.190776,0.205819,0.220933,0.210083,0.280735,0.009247,0.272283,0.159922,0.062866,...,0.167710,0.172983,0.155117,0.183788,0.199616,0.174362,0.181702,0.219945,0.214966,0.088567
4,-0.066668,-0.070047,-0.057547,-0.082427,-0.046621,-0.056229,0.013390,-0.133301,-0.090539,-0.143066,...,0.048566,-0.029042,-0.029057,-0.110732,-0.145052,-0.093709,-0.095788,-0.103413,-0.072884,-0.088977
5,0.027370,0.051375,0.064774,0.112362,-0.077609,0.147366,0.010986,0.024481,0.026549,-0.046509,...,0.009038,0.046002,0.045171,0.059582,0.096124,0.050120,0.009909,-0.006804,-0.038635,0.034571
6,0.015041,-0.024560,-0.056900,-0.056887,0.029751,-0.095488,-0.059723,-0.065385,0.035267,0.044556,...,0.073200,0.026835,0.112012,-0.044036,-0.027698,-0.057057,-0.022801,-0.051482,0.031688,0.117915
7,-0.085990,-0.048134,-0.042973,-0.092443,-0.084195,-0.023809,-0.043327,0.045288,-0.055158,-0.097656,...,-0.063477,-0.045319,-0.147449,-0.072398,-0.028119,-0.101448,-0.074823,-0.067153,-0.109741,-0.076949
8,0.064208,-0.028507,-0.071937,-0.014936,-0.126305,-0.092043,-0.056402,-0.063277,0.045333,0.043457,...,0.143938,0.057761,0.201386,0.066029,0.122877,0.034333,0.037186,0.055388,0.036346,0.058504
9,0.062922,0.015395,0.012003,0.029074,0.035990,0.036082,-0.098915,0.055779,0.018879,-0.049866,...,0.073052,0.130870,0.123869,0.088001,0.113512,0.068111,0.076765,0.169355,0.085632,0.048767


In [8]:
# liwc_avg_vecs.to_csv('data/liwc_avg_vecs.csv', index=False)

# Harvard General Inquirer dictionary

In [9]:
# Load the General Inquirer spreadsheet, using its first column (the dictionary
# entries) as the row index.
inq = pd.read_excel('data/inquirerbasic.xls', index_col=0)
inq.head()

Unnamed: 0_level_0,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,Weak,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,H4Lvd,,,,,,,,,,...,,,,,,,,,DET ART,| article: Indefinite singular article--some o...
ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,Weak,...,,,,,,,,,SUPV,|
ABANDONMENT,H4,,Negativ,,,,,,,Weak,...,,,,,,,,,Noun,|
ABATE,H4Lvd,,Negativ,,,,,,,,...,,,,,,,,,SUPV,|
ABATEMENT,Lvd,,,,,,,,,,...,,,,,,,,,Noun,


In [10]:
# Keep only the category-membership columns; these three columns are not categories.
inq = inq.drop(columns=['Source', 'Othtags', 'Defined'])
inq.head()

Unnamed: 0_level_0,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,Weak,Submit,...,PtLw,Nation,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,,,,,,,,,,,...,,,,,,,,,,
ABANDON,,Negativ,,,Ngtv,,,,Weak,,...,,,,,,,,,,
ABANDONMENT,,Negativ,,,,,,,Weak,,...,,,,,,,,,,
ABATE,,Negativ,,,,,,,,,...,,,,,,,,,,
ABATEMENT,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# One index label is the boolean False rather than a string; show the start of that row.
inq.loc[False].head()

Positiv        NaN
Negativ    Negativ
Pstv           NaN
Affil          NaN
Ngtv          Ngtv
dtype: object

That looks like a data-entry or encoding error: the entry for the word "false" ended up in the index as the boolean `False`. We need to replace that index label with the *string* `'False'`.

In [12]:
# Replace boolean index labels with their string form ('False' / 'True') so that every
# dictionary entry is a str.
# A type check is used instead of list.index(False) because:
#   * False == 0 in Python, so list.index(False) would also match a numeric 0 label;
#   * list.index raises ValueError once the label is fixed, so the cell could not be re-run;
#   * any other boolean label would be left behind and break later string cleaning.
as_list = [
    str(entry) if isinstance(entry, (bool, np.bool_)) else entry
    for entry in inq.index
]
inq.index = as_list

In [13]:
# Guard: the boolean label must no longer be present in the index.
assert False not in inq.index

In [14]:
# The same row is now reachable through the string label.
inq.loc['False'].head()

Positiv        NaN
Negativ    Negativ
Pstv           NaN
Affil          NaN
Ngtv          Ngtv
Name: False, dtype: object

In [15]:
# Reshape to long form: one (entry, category) pair for every non-empty cell.
# dropna() makes the removal of empty cells explicit. The legacy stack() implementation
# drops them implicitly, but the new implementation (future_stack) keeps them, which
# would pair every entry with every category. On the legacy path dropna() is a no-op.
stacked_inq = inq.stack().dropna()
stacked_inq.head()

ABANDON  Negativ    Negativ
         Ngtv          Ngtv
         Weak          Weak
         Fail          Fail
         IAV            IAV
dtype: object

In [16]:
# Turn the two index levels (entry, category) into ordinary columns and discard the
# value column, which merely repeats the category name.
stacked_inq = stacked_inq.reset_index().drop(columns=0)
stacked_inq.columns = ['word', 'category']
stacked_inq.head()

Unnamed: 0,word,category
0,ABANDON,Negativ
1,ABANDON,Ngtv
2,ABANDON,Weak
3,ABANDON,Fail
4,ABANDON,IAV


In [17]:
# Pattern that matches any single character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')
# Demo of sub(replacement, input): every match is replaced by the empty string,
# so only the letters of the input remain.
regex.sub('', 'ab3d*E')

'abdE'

In [18]:
def clean_words(word):
    """Lower-case `word` and remove every character matched by `regex` (non-letters)."""
    lowered = word.lower()
    return regex.sub('', lowered)

# Normalise every entry in the 'word' column with clean_words.
stacked_inq['word'] = stacked_inq['word'].map(clean_words)
stacked_inq.head()

Unnamed: 0,word,category
0,abandon,Negativ
1,abandon,Ngtv
2,abandon,Weak
3,abandon,Fail
4,abandon,IAV


In [19]:
def average_vectors(words, vectors=None):
    """Average the embeddings of the words in one group.

    Parameters
    ----------
    words : pandas.DataFrame
        Frame exposing a ``word`` column, e.g. one group produced by
        ``stacked_inq.groupby('category')``.
    vectors : mapping-like, optional
        Object supporting ``vectors[word]`` that raises ``KeyError`` for an
        unknown word. Defaults to the notebook-level ``wv_from_bin``, so
        existing one-argument calls behave as before.

    Returns
    -------
    pandas.Series
        Element-wise mean of the vectors that were found. Words without a
        vector are skipped; the Series is empty when no word has a vector.
    """
    if vectors is None:
        # Resolved at call time so the model only has to exist when it is used.
        vectors = wv_from_bin
    found = []
    for word in words.word:
        try:
            found.append(vectors[word])
        except KeyError:
            # Out-of-vocabulary word: leave it out of the average.
            continue
    return pd.DataFrame(found).mean(axis=0)

In [20]:
# One averaged vector per category, then transpose so that categories are columns
# and vector dimensions are rows.
inq_avg_vectors = (
    stacked_inq
    .groupby('category')
    .apply(average_vectors)
    .transpose()
)
inq_avg_vectors.head()

category,ABS,ANI,Abs@,Academ,Active,AffGain,AffLoss,AffOth,AffPt,AffTot,...,WlbPsyc,WlbPt,WlbTot,WltOth,WltPt,WltTot,WltTran,Work,Yes,You
0,0.047323,0.029427,0.090752,0.05392,0.03745,0.059068,0.082395,0.098802,0.074279,0.083579,...,0.154763,0.034029,0.069767,0.058066,0.066566,0.06139,0.073435,0.015147,0.057469,0.205105
1,0.051974,0.104235,0.033096,0.004647,0.060734,-0.021589,0.043723,0.022277,-0.074794,-0.012202,...,0.049124,0.018446,0.07853,0.032591,-0.027608,0.023562,0.027835,0.093043,-0.002935,0.098524
2,0.015816,-0.191244,0.02397,0.069611,0.004073,0.030237,-0.246097,-0.0001,0.005405,-0.007716,...,0.026844,-0.046716,-0.005397,-0.059825,-0.103164,-0.068362,-0.07483,-0.015966,0.045119,0.113444
3,0.100709,0.090906,0.11798,0.141519,0.048712,0.124644,0.161871,0.124055,0.079389,0.113175,...,0.049746,0.119494,0.078296,0.098404,0.069943,0.09644,0.106396,0.033535,0.161345,0.294054
4,-0.044055,-0.02044,-0.063909,0.066524,-0.071443,-0.092868,-0.148593,-0.094935,0.045352,-0.059646,...,-0.110401,-0.109217,-0.106026,0.037566,0.030673,0.034472,0.026721,-0.065664,-0.091167,-0.106852


# Combine dictionaries' average vector dataframes

In [21]:
# Put the two dictionaries' category vectors side by side, then transpose so that each
# row is one category and each column one vector dimension.
# NOTE(review): both dictionaries have a category named 'You', so the combined index
# contains that label twice — confirm that downstream code copes with duplicate labels.
all_dictionaries_avgs = pd.concat(
    [inq_avg_vectors, liwc_avg_vecs],
    axis='columns',
).transpose()
all_dictionaries_avgs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
ABS,0.047323,0.051974,0.015816,0.100709,-0.044055,-0.013068,0.089823,-0.116081,0.100341,0.082088,...,-0.092534,0.014969,-0.094077,-0.065393,-0.036495,-0.021092,-0.003979,-0.027713,-0.005736,0.00713
ANI,0.029427,0.104235,-0.191244,0.090906,-0.02044,-0.028862,0.012906,0.018099,0.057349,0.04114,...,-0.140927,0.056619,-0.091317,0.034088,0.065352,0.075121,0.043407,0.132303,-0.028511,0.094875
Abs@,0.090752,0.033096,0.02397,0.11798,-0.063909,0.02613,0.112878,-0.121309,0.098919,0.064893,...,-0.133848,0.017571,-0.071422,-0.089449,-0.056986,0.018606,0.062237,-0.039359,-0.015719,0.028384
Academ,0.05392,0.004647,0.069611,0.141519,0.066524,0.062273,0.12889,-0.092114,0.028991,-0.076046,...,-0.120987,-0.000318,-0.150794,0.013228,-0.014078,-0.00292,0.00143,-0.032383,0.080872,0.049225
Active,0.03745,0.060734,0.004073,0.048712,-0.071443,0.014378,0.060926,-0.042278,0.078379,0.068554,...,-0.011435,0.076758,-0.077064,0.024897,-0.072333,-0.042771,-0.006919,-0.072431,0.024453,-0.001164


In [23]:
# Rescale the category vectors with scikit-learn's normalize() (default settings) and
# wrap the returned array in a DataFrame again, re-attaching the category labels as index.
all_dictionaries_avgs = pd.DataFrame(normalize(all_dictionaries_avgs), index=all_dictionaries_avgs.index)

In [24]:
# Persist the normalised category vectors; the category labels are written as the first column.
all_dictionaries_avgs.to_csv('data/all_dictionaries_avg_vectors.csv')