In [None]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import json
import scipy.spatial.distance
import seaborn as sns
# English Snowball stemmer; applied further down to reduce NRC lexicon entries to stems.
sno = nltk.stem.SnowballStemmer('english')

In [7]:
# Read the shared project configuration. The context manager guarantees the
# file handle is closed as soon as the JSON has been parsed.
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

# Directory prefixes used throughout the notebook. They are concatenated
# directly with file names, so each value is expected to end with a path
# separator.
DATA_DIR = config['DATA_DIR']    # GloVe vectors, event list, generated lexicon
TWEET_DIR = config['TWEET_DIR']  # per-event affect feature files
NRC_DIR = config['NRC_DIR']      # NRC emotion table

# Build our own affect lexicon based on the NRC emotion lexicon

In [5]:
# Read the tab-separated GloVe table (first column becomes the index) and
# collect its index labels into a set for fast membership tests.
glove_path = DATA_DIR + 'glove.50d.csv'
glove = pd.read_csv(
    glove_path,
    sep='\t',
    index_col=0,
)
vocab = set(glove.index)

In [9]:
# Read the NRC emotion table: rows are lexicon entries, columns are affect categories.
nrc_path = NRC_DIR + 'nrc-emotion.csv'
nrc_lexicon = pd.read_csv(nrc_path, sep='\t', index_col=0)

# Re-index the table by the lower-cased Snowball stem of every entry.
new_indices = []
for entry in nrc_lexicon.index:
    new_indices.append(sno.stem(str(entry).lower()))
nrc_lexicon.index = new_indices

# For each affect category, gather the set of stems flagged with 1 in that column.
nrc_dict = {
    cat: set(nrc_lexicon[nrc_lexicon[cat] == 1].index)
    for cat in nrc_lexicon.columns
}

In [10]:
nrc_lexicon.columns  # the affect categories: one column of the NRC table per category

Index(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
       'positive', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [14]:
# Restrict every category to the stems that also appear in the GloVe
# vocabulary (the joint vocab), so each kept stem has an embedding.
new_nrc = {
    category: [stem for stem in stems if stem in vocab]
    for category, stems in nrc_dict.items()
}

In [44]:
# Hand-picked seed stems from our vocabulary, grouped by the affect category
# they are meant to represent. Only the categories listed here are processed
# when the filtered lexicon is built.
key_words = {
    'negative': ['hatr', 'hate', 'griev', 'grief', 'wrong'],
    'sadness': [
        'mourn', 'sadden', 'griev', 'grief', 'sad', 'suffer',
        'affect', 'broken', 'senseless', 'loss', 'heartbroken',
    ],
    'positive': [
        'love', 'donat', 'heart', 'thought', 'strength', 'bless', 'solidar',
    ],
    'anger': [
        'deserv', 'lynch', 'gang', 'threat', 'mad', 'sicken',
        'harm', 'enforc', 'firearm', 'ridicul', 'assault',
    ],
    'fear': ['risk', 'hide', 'danger', 'warn', 'fear'],
    'trust': ['secur', 'coach', 'safe', 'hero', 'nation'],
    'disgust': [
        'disgust', 'disgrac', 'shame', 'slaughter', 'sicken',
        'sick', 'ill', 'lunat', 'coward',
    ],
}

In [45]:
def cosine(u, v):
    """Return the cosine *distance* between the 1-D vectors ``u`` and ``v``.

    Delegates to ``scipy.spatial.distance.cosine``; smaller values mean the
    two vectors point in more similar directions.
    """
    cosine_distance = scipy.spatial.distance.cosine
    return cosine_distance(u, v)

In [46]:
# Number of stems to keep per affect category (those closest to the seeds).
N_CLOSEST = 30

# For every seeded category, rank its candidate stems by their mean cosine
# distance to the hand-picked seed keywords and keep the N_CLOSEST nearest.
filtered_nrc = {}
for k, v in key_words.items():
    print(k)
    words = new_nrc[k]
    # The seed embeddings do not depend on the candidate stem, so look them
    # up once per category instead of once per (candidate, seed) pair.
    seed_vectors = [glove.loc[word] for word in v]
    dists = []
    for w in words:
        candidate_vector = glove.loc[w]
        dists.append(np.mean([cosine(candidate_vector, seed) for seed in seed_vectors]))
    # argsort is ascending, so the first indices are the smallest distances.
    filtered_nrc[k] = [words[i] for i in np.array(dists).argsort()[:N_CLOSEST]]

negative
sadness
positive
anger
fear
trust
disgust


In [50]:
# Show the selected stems of each category on one line, followed by a rule.
for category, stems in filtered_nrc.items():
    print('{}: {}'.format(category, ', '.join(stems)))
    print('----')

negative: hate, hatr, of, violenc, word, sad, evil, tragedi, feel, will, attack, kill, shoot, anger, victim, murder, need, sick, want, massacr, loss, tragic, crime, griev, hard, lost, pain, grief, senseless, ignor
----
sadness: loss, senseless, tragedi, love, sad, lost, devast, griev, terribl, broken, horrif, condol, victim, mourn, pain, hurt, violenc, feel, horrifi, grief, will, sick, suffer, ach, kill, aw, evil, sicken, lose, hate
----
positive: love, friend, thought, pray, affect, communiti, pleas, god, stand, condol, bless, hope, thank, help, strong, will, time, effect, support, word, work, strength, comfort, give, rest, feel, peac, power, continu, better
----
anger: gun, murder, kill, will, violenc, shoot, wrong, shot, attack, death, bad, idiot, feel, arm, crime, crazi, action, crimin, terrorist, mad, ridicul, dead, die, hate, hell, evil, word, damn, shit, insan
----
fear: danger, fear, still, hide, arm, threat, shooter, attack, gun, murder, kill, shoot, shot, polic, feel, risk, f

In [51]:
# Persist the filtered lexicon as JSON in the data directory.
lexicon_path = DATA_DIR + 'affect_lexicon.json'
with open(lexicon_path, 'w') as f:
    json.dump(filtered_nrc, f)

# Plot the partisanship of affect categories

In [None]:
# Event identifiers, one per line. Context managers make sure both files are
# closed after reading instead of leaking the handles.
with open(DATA_DIR + 'event_names.txt', 'r') as events_file:
    events = events_file.read().splitlines()

# Lookup table indexed by event identifier (used when the plotting frame is built).
with open(DATA_DIR + "shooters_race.json", "r") as race_file:
    shooter_race = json.load(race_file)

In [None]:
def get_odds(features):
    """Compute the Republican-vs-Democrat log odds ratio of every feature.

    For a feature with counts ``rep`` and ``dem`` the result is::

        log( (rep / (rep_sum - rep)) / (dem / (dem_sum - dem)) )

    where ``rep_sum`` and ``dem_sum`` are the totals over all features.
    Positive values lean Republican, negative values lean Democrat.

    Parameters
    ----------
    features : dict
        Maps a feature key to a mapping with numeric ``'dem'`` and ``'rep'``
        entries.

    Returns
    -------
    dict
        Feature key -> log odds ratio (``numpy.float64``). Degenerate counts
        no longer raise ``ZeroDivisionError``; they follow IEEE-754 rules
        instead: ``-inf`` when the Republican odds are zero, ``+inf`` when
        the Democrat odds are zero, and ``nan`` when the ratio is
        indeterminate (e.g. both counts are zero). An empty ``features``
        mapping yields an empty dict.
    """
    dem_sum = sum(counts['dem'] for counts in features.values())
    rep_sum = sum(counts['rep'] for counts in features.values())
    odds = {}
    # numpy floats give inf/nan on division by zero where plain Python numbers
    # would raise; the errstate block silences the matching runtime warnings
    # because the non-finite values themselves already flag the degenerate case.
    with np.errstate(divide='ignore', invalid='ignore'):
        for k, counts in features.items():
            dem = counts['dem']
            rep = counts['rep']
            rep_odds = np.float64(rep) / (rep_sum - rep)
            dem_odds = np.float64(dem) / (dem_sum - dem)
            odds[k] = np.log(rep_odds / dem_odds)
    return odds

In [None]:
# Build a long-format frame with one row per (event, affect category):
# the category's log odds ratio plus the shooter-race label of the event.
event_list = []
cat_list = []
odds_list = []
race = []
for e in events:
    features_path = TWEET_DIR + e + '/' + e + '_affect_features.json'
    # Close each per-event file right after parsing; the previous bare
    # open() leaked one handle per event.
    with open(features_path, 'r') as features_file:
        features = json.load(features_file)
    odds = get_odds(features)
    for k, v in odds.items():
        event_list.append(e)
        cat_list.append(k)
        odds_list.append(v)
        race.append(shooter_race[e])

df = pd.DataFrame.from_dict({'event':event_list, 'odds':odds_list, 'category':cat_list, 'race':race})

In [None]:
f = plt.figure(dpi=400, figsize=(5, 2.5))

# Categories ordered from highest to lowest median log odds. The median is
# taken on the 'odds' column only: aggregating the whole frame would also hit
# the string columns ('event', 'race'), which recent pandas versions reject.
# A concrete list is passed because a `reversed` iterator can be consumed only once.
category_order = list(
    df.groupby('category')['odds'].median().sort_values().index[::-1]
)

# Split violins: one half per shooter-race label. The boxplot-only options
# `fliersize` and `notch` were dropped because violinplot has no such parameters.
ax = sns.violinplot(
    data=df,
    x="odds",
    y="category",
    hue='race',
    split=True,
    order=category_order,
    orient="h",
    palette=sns.color_palette("YlGnBu", 2),
)

# Reference line at a log odds ratio of 0 (no partisan lean); axvline spans
# the full axis height regardless of how many categories are shown.
ax.axvline(0, linewidth=1, color='grey')
ax.set_ylabel('')
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_linewidth(0.4)
ax.set_xlabel('Log odds ratio\n(Democrat < 0 < Republican)')
legend = ax.legend(loc='lower right', title="Shooter's race", fontsize=8)
plt.setp(legend.get_title(), fontsize='small');