In [1]:
from __future__ import division
import numpy as np
import json
import pandas as pd
import nltk

In [2]:
# English Snowball stemmer; used below to stem the stopword list.
sno = nltk.stem.SnowballStemmer('english')

# Load directory settings. The context manager closes the config file as soon
# as it has been parsed instead of leaving the handle to the garbage collector.
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

INPUT_DIR = config['INPUT_DIR']
OUTPUT_DIR = config['OUTPUT_DIR']
TWEET_DIR = config['TWEET_DIR']
BTM_DIR = config['BTM']
BTM_INPUT_DIR = BTM_DIR + 'myData/'
BTM_OUTPUT_DIR = BTM_DIR + 'output/model/'

# One event name per line; every per-event loop below iterates over this list.
with open(INPUT_DIR + 'event_names.txt', 'r') as events_file:
    events = events_file.read().splitlines()

In [3]:
# Stemmed stopword set (one stopword per line in the input file).
# NOTE(review): stemming here presumably mirrors stemming already applied to the
# cleaned tweets and the joint vocabulary -- confirm against the cleaning step.
with open(INPUT_DIR + 'stopwords.txt', 'r') as stopwords_file:
    stopwords = {sno.stem(w) for w in stopwords_file.read().splitlines()}

In [4]:
def remove_stopwords(tweet):
    """Return `tweet` with every token that appears in `stopwords` dropped.

    The tweet is split on whitespace and the surviving tokens are re-joined
    with single spaces.
    """
    kept_tokens = []
    for token in tweet.split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

## filter stopwords from joint vocab

In [41]:
# Map every non-stopword of the joint vocabulary to a consecutive index,
# in file order. The file is closed as soon as it has been read.
with open(OUTPUT_DIR + 'joint_vocab.txt', 'r') as vocab_file:
    kept_words = [w for w in vocab_file.read().splitlines() if w not in stopwords]
word2idx = {word: i for i, word in enumerate(kept_words)}

In [42]:
# Persist the stopword-free vocabulary (word -> index) as JSON.
with open(OUTPUT_DIR + 'joint_vocab_nostop.json', 'w') as vocab_out:
    json.dump(word2idx, vocab_out)

In [43]:
# Number of entries in the stopword-filtered vocabulary.
len(word2idx)

1669

## get tweets for inducing topics

In [14]:
def get_samples():
    """Collect, for every event, the tweet sample used to induce BTM topics.

    For each event in `events` this reads the cleaned tweet text, narrows it to
    the tweets listed in `<event>_partisan_indices_among_cleaned_indices.npy`
    and then to the positions (within that subset) listed in
    `<event>_indices_among_embeddings_for_getting_topics.npy`.

    Returns:
        pandas.DataFrame with columns
        - 'tweet': the sampled tweet with stopwords removed,
        - 'event': the event the tweet belongs to,
        - 'index_in_clean_text': the tweet's line index in the event's
          `_cleaned_text.txt` file.
        Rows whose stopword-free text contains no space (fewer than two
        remaining tokens) are dropped.
    """
    all_tweets = []
    all_events = []
    all_indices = []
    for event in events:
        prefix = TWEET_DIR + event + '/' + event
        with open(prefix + '_cleaned_text.txt', 'r') as text_file:
            cleaned = text_file.read().splitlines()
        print(event, len(cleaned))

        # indices (into the cleaned text) of the tweets with embeddings
        idx1 = np.load(prefix + '_partisan_indices_among_cleaned_indices.npy')

        # positions, within that subset, of the sample for determining topics
        idx2 = np.load(prefix + '_indices_among_embeddings_for_getting_topics.npy')

        # Resolve each sampled position to its cleaned-text index once, and use
        # that same index both to fetch the text and to record provenance, so
        # the two can never disagree (regardless of how idx1 is ordered).
        sample_indices = [idx1[j] for j in idx2]
        tweets = [cleaned[i] for i in sample_indices]

        all_indices.extend(sample_indices)
        all_events.extend([event] * len(tweets))
        all_tweets.extend(tweets)

    df = pd.DataFrame({'tweet': all_tweets, 'event': all_events, 'index_in_clean_text': all_indices})
    df['tweet'] = df['tweet'].apply(remove_stopwords)
    # keep only tweets that still have at least two tokens
    df = df[df['tweet'].str.contains(' ')]
    return df

In [15]:
# Build the topic-induction sample, write its text (one tweet per line) as the
# BTM training input, and record which event / cleaned-text row each line is.
df = get_samples()
with open(BTM_INPUT_DIR + 'tweets_train.txt', 'w') as train_file:
    train_text = '\n'.join(df['tweet'])
    train_file.write(train_text)
df.to_csv(BTM_INPUT_DIR + 'indices_train.csv', columns=['event', 'index_in_clean_text'], index=False)

chattanooga 29550
roseburg 18042
colorado_springs 55792
san_bernardino 70452
kalamazoo 10950
orlando 1825759
dallas 259736
baton_rouge 46036
burlington 8148
fort_lauderdale 12514
fresno 8850
san_francisco 10484
vegas 1284855
thornton 14296
sutherland_springs 153889
parkland 272213
nashville 38638
santa_fe 73563
annapolis 27673
pittsburgh 59869
thousand_oaks 117580


## get all tweets

In [19]:
def combine_all_tweets():
    """Gather every tweet that has an embedding, across all events.

    For each event in `events` this reads the cleaned tweet text, keeps the
    tweets listed in `<event>_partisan_indices_among_cleaned_indices.npy`, and
    pairs them with the indices loaded from
    `<event>_cleaned_and_partisan_indices.npy` (the tweets' positions in the
    original, unfiltered data).

    Returns:
        pandas.DataFrame with columns 'tweet' (stopwords removed),
        'indices_in_original' and 'event'. Rows whose stopword-free text
        contains no space (fewer than two remaining tokens) are dropped.

    Raises:
        ValueError: if, for some event, the number of original-data indices
            differs from the number of selected tweets.
    """
    all_tweets = []
    all_indices = []
    all_events = []
    for event in events:
        print(event)
        prefix = TWEET_DIR + event + '/' + event

        # get tweets for which we have embeddings
        with open(prefix + '_cleaned_text.txt', 'r') as text_file:
            cleaned = text_file.read().splitlines()
        idx1 = np.load(prefix + '_partisan_indices_among_cleaned_indices.npy')
        tweets = [cleaned[i] for i in idx1]

        # get indices of these tweets in the **original data** (with no filtering)
        original_indices = np.load(prefix + '_cleaned_and_partisan_indices.npy')

        # Check alignment per event: a mismatch in one event that is offset by
        # another would otherwise pass the DataFrame length check and silently
        # attach the wrong original index to every following tweet.
        if len(original_indices) != len(tweets):
            raise ValueError(
                'event %s: %d original indices but %d selected tweets'
                % (event, len(original_indices), len(tweets))
            )

        all_tweets.extend(tweets)
        all_indices.extend(original_indices)
        all_events.extend([event] * len(tweets))

    df = pd.DataFrame({'tweet': all_tweets, 'indices_in_original': all_indices, 'event': all_events})
    df['tweet'] = df['tweet'].apply(remove_stopwords)
    # keep only tweets that still have at least two tokens
    df = df[df['tweet'].str.contains(' ')]
    return df

In [20]:
# Build the full tweet table, record each row's original-data index and event,
# and write the stopword-free text (one tweet per line) for BTM inference.
df = combine_all_tweets()
df.to_csv(BTM_INPUT_DIR + 'indices_all.csv', columns=['indices_in_original', 'event'], index=False)
with open(BTM_INPUT_DIR + 'tweets_all.txt', 'w') as all_file:
    all_text = '\n'.join(df['tweet'])
    all_file.write(all_text)

chattanooga
roseburg
colorado_springs
san_bernardino
kalamazoo
orlando
dallas
baton_rouge
burlington
fort_lauderdale
fresno
san_francisco
vegas
thornton
sutherland_springs
parkland
nashville
santa_fe
annapolis
pittsburgh
thousand_oaks


## next step is to run BTM script: `sh myBTMexample.sh`

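Set the number of topics `K` in the script's parameters (the conversion cells below expect outputs for `K` = 6 through 10) and run the script in a Python 3 environment. Also set the vocabulary size `W` to the number of entries in `joint_vocab_nostop.json` (the `len(word2idx)` value shown above): the topic–word conversion below assumes every row of the `.pw_z` output has exactly that many columns.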

## convert outputs

### get topic probabilities

In [5]:
def save_topic_probs(k):
    """Convert the BTM `.pz` output for `k` topics into a saved NumPy array.

    Reads the first line of `BTM_OUTPUT_DIR/k<k>.pz`, parses its
    whitespace-separated values as floats and stores them in
    `OUTPUT_DIR/btm_<k>_topic_probs.npy`.
    """
    source_path = BTM_OUTPUT_DIR + 'k' + str(k) + '.pz'
    target_path = OUTPUT_DIR + 'btm_' + str(k) + '_topic_probs.npy'
    with open(source_path, 'r') as pz_file:
        first_line = pz_file.readline()
        probs = np.array(list(map(float, first_line.split())))
        np.save(target_path, probs)

In [6]:
# Convert the topic-probability file for every topic count from 6 to 10;
# the corresponding k<k>.pz files must already exist in BTM_OUTPUT_DIR.
for k in range(6, 11):
    save_topic_probs(k)

### get topic word matrix

In [7]:
# Reload the stopword-free vocabulary (word -> index) written earlier, closing
# the file as soon as it has been parsed.
with open(OUTPUT_DIR + 'joint_vocab_nostop.json', 'r') as vocab_file:
    word2idx = json.load(vocab_file)

In [12]:
# Inverse lookup: vocabulary index -> word.
idx2word = dict((index, word) for word, index in word2idx.items())

In [20]:
def save_topic_word_matrix(k):
    """Convert the BTM `.pw_z` output for `k` topics into a saved matrix.

    Each line of `BTM_OUTPUT_DIR/k<k>.pw_z` is parsed as one row of
    whitespace-separated floats and must contain `len(word2idx)` values. For
    every row the ten highest-valued words (looked up through `idx2word`) are
    printed, and the resulting `(k, len(word2idx))` matrix is saved to
    `OUTPUT_DIR/btm_<k>_topic_words.npy`.

    Raises:
        ValueError: if the file holds fewer than `k` rows (which would
            otherwise leave all-zero topic rows in the saved matrix), or if a
            row does not have `len(word2idx)` values.
        IndexError: if the file holds more than `k` rows.
    """
    mat = np.zeros((k, len(word2idx)))
    rows_read = 0
    with open(BTM_OUTPUT_DIR + 'k' + str(k) + '.pw_z', 'r') as pw_z_file:
        for i, row in enumerate(pw_z_file):
            mat[i, :] = [float(v) for v in row.split()]
            # indices of the 10 largest entries in this row, largest first
            top_word_ids = mat[i, :].argsort()[::-1][:10]
            print(i, ', '.join([idx2word[w] for w in top_word_ids]))
            rows_read = i + 1
    if rows_read != k:
        raise ValueError('expected %d topic rows in .pw_z file, found %d' % (k, rows_read))
    print('-----')
    np.save(OUTPUT_DIR + 'btm_' + str(k) + '_topic_words.npy', mat)

In [21]:
# Convert (and print the top words of) the topic-word file for every topic
# count from 6 to 10; the k<k>.pw_z files must already exist in BTM_OUTPUT_DIR.
for k in range(6, 11):
    save_topic_word_matrix(k)

0 shoot, shooter, attack, trump, san, vega, gun, bernardino, orlando, victim
1 kill, peopl, shoot, white, shooter, shot, polic, gun, dalla, black
2 gun, school, shoot, high, law, peopl, control, church, parkland, student
3 shoot, victim, famili, prayer, thought, kill, today, mass, communiti, heart
4 shoot, dead, polic, shooter, peopl, report, shot, kill, suspect, offic
5 hous, shoot, shooter, white, gun, kill, man, hero, peopl, guy
-----
0 shoot, dead, polic, shooter, peopl, report, shot, suspect, kill, offic
1 kill, hous, peopl, white, shoot, shooter, black, man, shot, polic
2 shoot, shooter, attack, trump, terrorist, terror, call, media, san, plan
3 gun, shoot, peopl, control, kill, stop, law, church, violenc, mass
4 shoot, victim, famili, prayer, thought, today, kill, communiti, school, heart
5 school, gun, shoot, high, shooter, parkland, student, texa, kid, law
6 shoot, vega, mass, las, thousand, victim, kill, california, san, bar
-----
0 gun, shoot, law, control, peopl, shooter, c

### get topic assignments

In [50]:
def get_topic_assignments(ks=range(6, 11)):
    """Attach ranked BTM topic assignments to every tweet and save them per event.

    Reads `BTM_INPUT_DIR/indices_all.csv` (one row per tweet). For every topic
    count `k` in `ks` it reads `BTM_OUTPUT_DIR/k<k>.pz_d` (one row of
    whitespace-separated topic probabilities per tweet), ranks the topics of
    each tweet by probability and adds the columns `topic_<r>` / `prob_<r>`,
    where rank `r = 0` is the most probable topic and `r = k - 1` the least
    probable. The rows of each event are then written to
    `TWEET_DIR/<event>/<event>_btm_topics_<k>.csv`.

    Args:
        ks: iterable of topic counts to process; defaults to 6 through 10.

    Raises:
        ValueError: if a `.pz_d` file does not have exactly one row per tweet
            in `indices_all.csv`.
    """
    all_tweets = pd.read_csv(BTM_INPUT_DIR + 'indices_all.csv')
    print(len(all_tweets))

    for k in ks:
        with open(BTM_OUTPUT_DIR + 'k' + str(k) + '.pz_d', 'r') as pz_d_file:
            pz_d = pz_d_file.read().splitlines()
        topic_probs = np.array([[float(v) for v in row.split()] for row in pz_d])
        print(topic_probs.shape)

        if len(topic_probs) != len(all_tweets):
            raise ValueError(
                'k=%d: %d rows in .pz_d but %d tweets in indices_all.csv'
                % (k, len(topic_probs), len(all_tweets))
            )

        # Column i of the argsort holds, per tweet, the topic with the i-th
        # smallest probability, so the last column is the most probable topic.
        topic_order = topic_probs.argsort()
        row_ids = np.arange(len(topic_probs))

        # Work on a fresh copy for each k so columns written for one topic
        # count never carry over into (or reorder) the output of another.
        ranked = all_tweets.copy()
        for i in range(k):
            rank = k - 1 - i
            ranked['topic_' + str(rank)] = topic_order[:, i]
            # pick, for every row, the probability of the topic chosen above
            ranked['prob_' + str(rank)] = topic_probs[row_ids, topic_order[:, i]]

        for event in events:
            print(event)
            event_rows = ranked[ranked['event'] == event]
            event_rows.to_csv(TWEET_DIR + event + '/' + event + '_btm_topics_' + str(k) + '.csv', index=False)

In [None]:
# Writes one <event>_btm_topics_<k>.csv per event and topic count under TWEET_DIR.
get_topic_assignments()