In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
from __future__ import division
import numpy as np
import pandas as pd
import gensim.corpora as corpora
import json
import random
import sys
import copy
import nltk
sys.path.append('..')
from helpers.glove_neighbors import *
from gensim.utils import SaveLoad

In [2]:
config = json.load(open('../config.json', 'r'))
MALLET = config['MALLET']
INPUT_DIR = config['INPUT_DIR']
OUTPUT_DIR = config['OUTPUT_DIR']
TWEET_DIR = config['TWEET_DIR']
RNG = random.Random()
RNG.seed(config['SEED'])
events = open(INPUT_DIR + 'event_names.txt', 'r').read().splitlines()
K_VALS = range(6, 11)
sno = nltk.stem.SnowballStemmer('english')

In [3]:
NO_SAMPLES = 400

In [4]:
# use the same set of topic ids across the two models and tasks
SAMPLE_TOPIC_IDS = {num_topics: RNG.choices(range(num_topics), k=NO_SAMPLES) for num_topics in K_VALS}

In [5]:
def get_furthest_other_closest(furthest, closest):
    # keep words that are in `furthest` but also in `closest` for another cluster
    for i, items in furthest.items():
        closest_other = set()
        for j, items_other in closest.items():
            if j == i:
                continue
            closest_other |= items_other

        new_list = []
        for k in items:
            if k in closest_other:
                new_list.append(k)
        furthest[i] = new_list
    return furthest

# Word intrusion

In [6]:
LOWEST_PERCENTILE = 5

In [7]:
def generate_samples(k, closest_10, furthest, dicts):
    for i in range(NO_SAMPLES):
        d = {}
        topic_id = SAMPLE_TOPIC_IDS[k][i]  # topic_id for sample
        keywords = RNG.sample(closest_10[topic_id], 5).copy()
        sampled_word = RNG.choice(furthest[topic_id])
        
        # construct sample
        keywords.append(sampled_word)
        shuffled_idx = list(range(len(keywords)))
        RNG.shuffle(shuffled_idx)
        odd_idx = shuffled_idx.index(5)
        d['odd_idx'] = odd_idx
        for j, idx in enumerate(shuffled_idx):
            d['word_' + str(j)] = keywords[idx]
        d['topic'] = topic_id
        d['no_topics'] = k
        d['sample_idx'] = i
        dicts.append(d)
    return dicts

## kmeans model

In [8]:
glove = pd.read_csv(OUTPUT_DIR+'glove.50d.csv', sep='\t', index_col=0)

In [10]:
stopwords = set([sno.stem(w) for w in open(INPUT_DIR + 'stopwords.txt', 'r').read().splitlines()])
glove = glove[~glove.index.isin(stopwords)]

In [11]:
V = len(glove)
V_lowest = int(V / 100 * LOWEST_PERCENTILE)
V_lowest

83

In [12]:
dicts = []
for k in K_VALS:
    print(k, 'clusters')
    means = np.load(OUTPUT_DIR + 'cluster_'+str(k)+'_means.npy')
    
    # get closest words to each cluster
    closest_10 = {i: list(neighbors_vector(m, glove).head(10).index) for i, m in enumerate(means)} # this is a list
    closest = {i: set(neighbors_vector(m, glove).head(V_lowest).index) for i, m in enumerate(means)}
    
    # get furthest words to each cluster within LOWEST_PERCENTILE percentile
    furthest = {i: list(neighbors_vector(m, glove).tail(V_lowest).index) for i, m in enumerate(means)}
    furthest = get_furthest_other_closest(furthest, closest)
    
    # generate samples
    dicts = generate_samples(k, closest_10, furthest, dicts)
kmeans_df = pd.DataFrame(dicts, index = range(len(dicts)))

6 clusters
7 clusters
8 clusters
9 clusters
10 clusters


## Mallet

In [13]:
topic_words = SaveLoad.load(OUTPUT_DIR + 'ldamallet_model_6.pickle').get_topics()
V = topic_words.shape[1]
V_lowest = int(V / 100 * LOWEST_PERCENTILE)
V_lowest

83

In [14]:
dicts = []
for k in K_VALS:
    print(k, 'clusters')
    ldamallet = SaveLoad.load(OUTPUT_DIR + 'ldamallet_model_' + str(k) + '.pickle')
    topic_words = ldamallet.get_topics()
    id2word = ldamallet.id2word
    
    # get closest words to each cluster
    closest_10 = {i: [id2word.id2token[idx] for idx in row.argsort()[::-1][:10]] for i, row in enumerate(topic_words)} # list
    closest = {i: set([id2word.id2token[idx] for idx in row.argsort()[::-1][:V_lowest]]) for i, row in enumerate(topic_words)}  # set
    
    # get furthest words to each cluster within LOWEST_PERCENTILE percentile
    furthest = {i: [id2word.id2token[idx] for idx in row.argsort()[:V_lowest]] for i, row in enumerate(topic_words)}
    furthest = get_furthest_other_closest(furthest, closest)
    
    # generate samples
    dicts = generate_samples(k, closest_10, furthest, dicts)
mallet_df = pd.DataFrame(dicts, index = range(len(dicts)))

6 clusters
7 clusters
8 clusters
9 clusters
10 clusters


In [14]:
mallet_df

Unnamed: 0,no_topics,odd_idx,sample_idx,topic,word_0,word_1,word_2,word_3,word_4,word_5
0,6,5,0,3,bernardino,orlando,obama,terror,terrorist,fear
1,6,4,1,0,student,texa,shoot,trump,pattern,year
2,6,3,2,1,airport,shot,offic,protest,dead,suspect
3,6,3,3,1,offic,gunman,polic,jesus,shoot,dead
4,6,2,4,4,time,stop,mall,church,law,make
5,6,1,5,4,make,condemn,violenc,good,church,talk
6,6,5,6,5,today,shoot,mass,famili,victim,cop
7,6,4,7,0,trump,student,school,day,citizen,year
8,6,4,8,2,peopl,cop,guy,black,parent,kill
9,6,5,9,0,shoot,trump,high,school,parkland,cover


In [15]:
kmeans_df

Unnamed: 0,no_topics,odd_idx,sample_idx,topic,word_0,word_1,word_2,word_3,word_4,word_5
0,6,0,0,3,photo,problem,fix,#guncontolnow,bad,yeah
1,6,0,1,0,impact,#blacklivesmatt,thug,blm,racist,#blm
2,6,4,2,1,detain,fatal,multipl,#breakingnew,honest,updat
3,6,3,3,1,updat,unconfirm,fatal,agre,#updat,#break
4,6,2,4,4,southern,david,effect,veteran,calif,california
5,6,3,5,4,gunman,calif,identifi,#blacklivesmatt,david,bar
6,6,3,6,5,observ,honor,vigil,knife,honour,candlelight
7,6,5,7,0,#blacklivesmatt,label,#blm,racism,radic,heal
8,6,3,8,2,#prayer,affect,sadden,reuter,faculti,prayer
9,6,4,9,0,blm,racist,label,supremacist,los,thug


In [15]:
mallet_df['model'] = 'mallet'
kmeans_df['model'] = 'kmeans'
concat = pd.concat([mallet_df, kmeans_df], ignore_index=True)

In [17]:
concat

Unnamed: 0,no_topics,odd_idx,sample_idx,topic,word_0,word_1,word_2,word_3,word_4,word_5,model
0,6,5,0,3,bernardino,orlando,obama,terror,terrorist,fear,mallet
1,6,4,1,0,student,texa,shoot,trump,pattern,year,mallet
2,6,3,2,1,airport,shot,offic,protest,dead,suspect,mallet
3,6,3,3,1,offic,gunman,polic,jesus,shoot,dead,mallet
4,6,2,4,4,time,stop,mall,church,law,make,mallet
5,6,1,5,4,make,condemn,violenc,good,church,talk,mallet
6,6,5,6,5,today,shoot,mass,famili,victim,cop,mallet
7,6,4,7,0,trump,student,school,day,citizen,year,mallet
8,6,4,8,2,peopl,cop,guy,black,parent,kill,mallet
9,6,5,9,0,shoot,trump,high,school,parkland,cover,mallet


In [16]:
concat.to_csv(OUTPUT_DIR + 'word_intrusion_data.csv', index=False)

# Tweet intrusion

In [6]:
SAMPLE_PER_EVENT = 3000
NUM_CLOSE = 3
LOWEST_PERCENTILE = 1

In [7]:
def topic_tweet_neighbors(data, topic, count, k, close=True):
    if close:
        data = data[data['topic_0'] == topic]  # `topic` is closest topic
    else:
        data = data[data['topic_' + str(k-1)] == topic]  # `topic` is furthest topic
    data.sort_values('ratio', inplace=True)
    return data['indices_in_original'].head(count)

In [8]:
def generate_samples(k, closest, furthest, dicts):
    for i in range(NO_SAMPLES):
        d = {}
        topic_id = SAMPLE_TOPIC_IDS[k][i]  # topic_id for sample
        close_tweets = RNG.sample(closest[topic_id], NUM_CLOSE).copy()  
        far_tweet = RNG.choice(furthest[topic_id])
        
        # construct sample
        close_tweets.append(far_tweet)
        shuffled_idx = list(range(len(close_tweets)))
        RNG.shuffle(shuffled_idx)
        odd_idx = shuffled_idx.index(NUM_CLOSE)
        d['odd_idx'] = odd_idx
        for j, idx in enumerate(shuffled_idx):
            d['index_' + str(j)] = close_tweets[idx]
        d['topic'] = topic_id
        d['no_topics'] = k
        d['sample_idx'] = i
        dicts.append(d)
    return dicts

## kmeans

In [9]:
thresholds = json.load(open(OUTPUT_DIR + 'cosine_thresholds.json', 'r'))
dicts = []
for k in K_VALS:
    print(k, 'clusters')
    # get large tweet sample where all events are equally represented
    dfs = []
    for e in events:
        df = pd.read_csv(TWEET_DIR + e + '/' + e + '_kmeans_topics_' + str(k) + '.csv')
        # for cosine distance ratio, we want the ratio to be small
        df['ratio']  = df['cosine_0'] / df['cosine_1']
        df = df[df['ratio'] < thresholds[str(k)]].sample(SAMPLE_PER_EVENT, random_state=config['SEED'])
        df['indices_in_original'] = df['indices_in_original'].astype(str) + '_' + e
        dfs.append(df)
    concat = pd.concat(dfs, ignore_index=True)
    percentile_count = int(len(concat) / 100 * LOWEST_PERCENTILE)
    print(percentile_count)
    
    # get closest tweets
    closest = {i: set(topic_tweet_neighbors(concat, i, percentile_count, k)) for i in range(k)}
    
    # get furthest tweets to each cluster within LOWEST_PERCENTILE percentile
    furthest = {i: set(topic_tweet_neighbors(concat, i, percentile_count, k, close=False)) for i in range(k)}
    furthest = get_furthest_other_closest(furthest, closest)
    
    # generate samples
    dicts = generate_samples(k, closest, furthest, dicts)
kmeans_df = pd.DataFrame(dicts, index = range(len(dicts)))

6 clusters
630


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


7 clusters
630
8 clusters
630
9 clusters
630
10 clusters
630


In [10]:
thresholds = json.load(open(OUTPUT_DIR + 'prob_thresholds.json', 'r'))
dicts = []
for k in K_VALS:
    print(k, 'clusters')
    # get large tweet sample where all events are equally represented
    dfs = []
    for e in events:
        df = pd.read_csv(TWEET_DIR + e + '/' + e + '_mallet_topics_' + str(k) + '.csv')
        df['ratio']  = (1 - df['prob_0']) / (1 - df['prob_1'])
        df = df[df['ratio'] < thresholds[str(k)]].sample(SAMPLE_PER_EVENT, random_state=config['SEED'])
        df['indices_in_original'] = df['indices_in_original'].astype(str) + '_' + e
        dfs.append(df)
    concat = pd.concat(dfs, ignore_index=True)
    percentile_count = int(len(concat) / 100 * LOWEST_PERCENTILE)
    print(percentile_count)
    
    # get closest tweets
    closest = {i: set(topic_tweet_neighbors(concat, i, percentile_count, k)) for i in range(k)}
    
    # get furthest tweets to each cluster within LOWEST_PERCENTILE percentile
    furthest = {i: set(topic_tweet_neighbors(concat, i, percentile_count, k, close=False)) for i in range(k)}
    furthest = get_furthest_other_closest(furthest, closest)
    
    # generate samples
    dicts = generate_samples(k, closest, furthest, dicts)
mallet_df = pd.DataFrame(dicts, index = range(len(dicts)))

6 clusters
630


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


7 clusters
630
8 clusters
630
9 clusters
630
10 clusters
630


In [11]:
mallet_df['model'] = 'mallet'
kmeans_df['model'] = 'kmeans'
concat = pd.concat([kmeans_df, mallet_df], ignore_index=True)

In [12]:
def get_tweets(df):
    for e in events:
        tweets = pd.read_csv(TWEET_DIR + e + '/' + e + '.csv', sep='\t', lineterminator='\n', usecols=['text'])
        for i in range(NUM_CLOSE + 1):
            split = df['index_' + str(i)].str.split('_', n=1)
            for row_idx, (idx, event) in enumerate(split):
                if event == e:
                    df.set_value(row_idx, 'tweet_' + str(i), tweets.iloc[int(idx)].text)
    return df

In [13]:
concat = get_tweets(concat)

  


In [14]:
concat

Unnamed: 0,index_0,index_1,index_2,index_3,no_topics,odd_idx,sample_idx,topic,model,tweet_0,tweet_1,tweet_2,tweet_3
0,101873_dallas,286903_sutherland_springs,2774_burlington,14002_santa_fe,6,0,0,3,kmeans,One of the suspects in Dallas shooting has bee...,"We need more good guys with guns, taking a gun...",Seriously. Gun laws need updated. But for some...,Again?? Another #shooting?? #Killing more #Ame...
1,24410_colorado_springs,42801_colorado_springs,40942_colorado_springs,3220_chattanooga,6,3,1,0,kmeans,The terrorist attack of a Planned Parenthood h...,@Oathkeepers Where were you when planned paren...,Also why are white religious radicals not labe...,"Tragic, tragic events today, my prayers go out..."
2,2744_thornton,70537_fort_lauderdale,650_kalamazoo,28830_colorado_springs,6,3,2,1,kmeans,SHOOTING AT WALMART IN COLORADO MULTIPLE DOWN,Breaking: At least 9 people injured & multiple...,Police: Multiple people dead after several sho...,Just no words. #ColoradoSpringsShooting Prayer...
3,1591_annapolis,1844886_orlando,5397_thornton,26601_fort_lauderdale,6,1,3,1,kmeans,At least 4 injured in shooting at Annapolis ne...,Stop blaming Orlando shooting on everything bu...,"""Multiple people down"" in shooting at Colorado...",At least 9 people have been injured at Fort La...
4,4282_thousand_oaks,79329_thousand_oaks,6867_thousand_oaks,128895_santa_fe,6,3,4,4,kmeans,WATCH LIVE: Update on mass shooting at #Border...,Many Who Fled California Attack Were Survivors...,#CaliforniashootingBreaking: 13 dead including...,"So, mikey. What is your plan for white high sc..."
5,12828_thousand_oaks,72111_thousand_oaks,18860_baton_rouge,91416_thousand_oaks,6,2,5,4,kmeans,Thousand Oaks shooting: gunman kills 12 at Cal...,Telemachus Orfanos survived the Las Vegas shoo...,My heart goes out to the families of the offic...,Veteran who survived Las Vegas massacre among ...
6,10020_kalamazoo,49066_fort_lauderdale,9006_kalamazoo,17775_roseburg,6,1,6,5,kmeans,Kalamazoo City Commission cancels tonight's me...,Police: Multiple people shot dead at Fort Laud...,The flag at the Township Hall will remain lowe...,Please join us as we observe a moment of silen...
7,217486_dallas,13659_fresno,3809_roseburg,16251_fresno,6,2,7,0,kmeans,If Dallas shooter were white. Hated targeted &...,#IslamicTerror in #Fresno is being suppressed ...,Thoughts and prayers with #umpquacommunity col...,"the Fresno shooting was not terror, though, an..."
8,39122_pittsburgh,15612_fresno,49734_colorado_springs,9515_burlington,6,2,8,2,kmeans,Terrible attack at the synagogue in Pittsburgh...,"The Fresno shooter said he prays to 7 ""gods."" ...",@DRUDGE_REPORT The same Jihad terrorism happen...,@cnnbrk my heart and prayers go out to the vic...
9,89727_san_bernardino,28681_annapolis,11682_fresno,11517_roseburg,6,3,9,0,kmeans,@washingtonpost The radical ideology & terrori...,I hate this even needs to be said- but whether...,Media: We don't know Fresno Shooters Motives S...,#prayingforroseburg Our prayers go to victims ...


In [15]:
concat.to_csv(TWEET_DIR + 'tweet_intrusion_data.csv', index=False)