In [1]:
from __future__ import division
import random
import gc
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import nltk
import string
import re

In [2]:
# Dedicated, seeded generator so the tweet sample used for word weights is reproducible.
RNG = random.Random(400)

# Directory holding the monthly '<month>_all_tweets.csv' files.
INPUT = "/home/twalton_umass_edu/Political Polarization Project/tmls/month_tmls/"
# Directory that receives the cleaned tweets and the sentence embeddings.
OUTPUT = "/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/sent_embed/"

# Month prefixes of the input files, in processing order.
months = [
    "jun", "jul", "aug", "sep",
    "oct", "nov", "dec", "jan",
]

In [3]:
def get_word_weights(files, a=1e-3):
    """
    Compute a smooth-inverse-frequency weight for every word in a corpus.

    The weight of a word is ``a / (a + count / N)``, where ``count`` is the
    number of times the word occurs across all documents and ``N`` is the
    total number of counted tokens.

    :param files: iterable of strings (documents) used to estimate the word frequencies
    :param a: smoothing constant; smaller values down-weight frequent words more strongly
    :return: dict mapping each word in the fitted vocabulary to its weight (a Python float)
    """
    vectorizer = CountVectorizer(decode_error='ignore')
    #get word frequencies (documents x vocabulary sparse matrix)
    counts = vectorizer.fit_transform(files)
    #total # of times each word was used; summing a sparse matrix yields a
    #1 x V np.matrix, so flatten it to a plain 1-D float array. Iterating the
    #matrix directly would hand out 1x1 matrices instead of scalars.
    total_freq = np.asarray(counts.sum(axis=0), dtype='float64').ravel()
    #number of words in the files
    N = total_freq.sum()
    #get the weighted frequency for each word
    weighted_freq = a / (a + total_freq / N)
    #get_feature_names() is not available in newer scikit-learn releases,
    #get_feature_names_out() is not available in older ones
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    #garbage collection
    gc.collect()
    # dict with words and their weights
    return dict(zip(feature_names, weighted_freq.tolist()))

In [4]:
def sentences2idx(sentences, words2index, words2weight):
    """
    Convert sentences into padded arrays of word indices, a position mask and word weights.

    Each sentence is split on whitespace. Tokens missing from ``words2index`` are
    skipped; tokens present there but missing from ``words2weight`` receive a
    fallback weight of 0.000001. Rows are right-padded with zeros and truncated
    to ``maxlen`` columns, where ``maxlen`` is the largest whitespace-token count
    over all sentences, capped at 150. An empty ``sentences`` input yields three
    arrays of shape (0, 0).

    :param sentences: a sized iterable of strings (e.g. a list or a pandas Series)
    :param words2index: a dictionary, words2index['str'] is the index of the word 'str'
    :param words2weight: a dictionary, words2weight['str'] is the weight of the word 'str'
    :return: x, x_mask, w. x[i, :] are the word indices in sentence i (int32),
        x_mask[i, :] is the mask for sentence i (1 = a word is present, 0 = padding),
        w[i, :] are the weights of the words in sentence i (float32)
    """
    #token count of every sentence; used to size the arrays
    lengths = [len(s.split()) for s in sentences]
    #longest sentence, capped at 150; 0 when there are no sentences at all
    maxlen = min(max(lengths), 150) if lengths else 0
    #print the length of longest tweet
    print('maxlen', maxlen)
    #get the number of sentences/tweets
    n_samples = len(sentences)
    #print the number of sentences
    print('samples', n_samples)
    #each row is a sentence and each column contains a word index
    x = np.zeros((n_samples, maxlen), dtype='int32')
    #same layout as above but holds the weight of the word at each position
    w = np.zeros((n_samples, maxlen), dtype='float32')
    #indicator of whether the sentence has a word in that position
    x_mask = np.zeros((n_samples, maxlen), dtype='float32')
    #loop through sentences, idx = index of sentence, s = the sentence
    for idx, s in enumerate(sentences):
        #print for every 100000 sentences that have been indexed
        if idx % 100000 == 0:
            print(idx)
        #word indices and matching weights of the in-vocabulary tokens
        indices = []
        weightlist = []
        for word in s.split():
            #tokens without a word embedding are dropped entirely
            if word not in words2index:
                continue
            indices.append(words2index[word])
            #words that were not in the sample used to compute the weights get a
            #tiny fallback weight.
            #NOTE(review): a / (a + p) tends to 1 as p -> 0, so 0.000001 effectively
            #mutes unseen words instead of treating them as rare - confirm this is intended.
            weightlist.append(words2weight[word] if word in words2weight else 0.000001)
        #number of cells to fill in this row of the three arrays
        length = min(len(indices), maxlen)
        x[idx, :length] = indices[:length]
        w[idx, :length] = weightlist[:length]
        #mark the filled positions
        x_mask[idx, :length] = 1.
    #clean the memory
    gc.collect()
    #return the three arrays
    return x, x_mask, w

In [5]:
def get_weighted_average(We, x, m, w, dim):
    """
    Build one vector per sentence from the weighted word vectors.

    For sentence i the word vectors are multiplied by their weights, summed,
    and divided by the number of words in the sentence (the sum of the mask
    row). Sentences without any word keep an all-zero vector.

    Word vectors are looked up as ``We.wv[index2word[j]]``; ``index2word`` is
    read from module scope and must be defined before this function is called.

    :param We: model whose ``wv`` attribute maps a word to its vector
    :param x: x[i, :] are the indices of the words in sentence i
    :param m: m[i, :] is the mask of sentence i (1 where a word is present)
    :param w: w[i, :] are the weights for the words in sentence i
    :param dim: length of a word vector
    :return: emb[i, :] is the weighted average vector for sentence i (float32)
    """
    print('Getting weighted average...')
    n_samples = x.shape[0]
    print(n_samples, dim)
    #one row per sentence, one column per embedding dimension
    emb = np.zeros((n_samples, dim)).astype('float32')

    for i in range(n_samples):
        #progress marker every 100000 sentences
        if i % 100000 == 0:
            print(i)
        #one row per position: the word's vector where the mask is 1, zeros for padding
        rows = [
            We.wv[index2word[j]] if m[i, pos] == 1 else np.zeros(dim)
            for pos, j in enumerate(x[i, :])
        ]
        word_vectors = np.stack(rows)
        #number of words actually present in the sentence
        n_words = np.sum(m[i, :])
        #weights . vectors gives the weighted sum per dimension
        weighted_sum = w[i, :].dot(word_vectors)
        #divide by the word count; rows without words fall back to the zero-filled `out`
        emb[i, :] = np.divide(weighted_sum, n_words, out=np.zeros(dim), where=n_words != 0)
    #drop the local references and clear memory
    del x
    del w
    gc.collect()
    return emb

In [6]:
#compute the principal component of the sentence embeddings
def compute_pc(X,npc=1):
    """
    Fit a truncated SVD on X and return its leading components.

    X is passed to the SVD as-is: do NOT make the data zero mean beforehand.

    :param X: X[i,:] is a data point
    :param npc: number of components to compute
    :return: array whose i-th row is the i-th component
    """
    #fixed random_state keeps the decomposition reproducible between runs
    decomposition = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    print('Computing principal components...')
    fitted = decomposition.fit(X)
    return fitted.components_

In [7]:
#remove the principal component of the sentence embeddings
def remove_pc(X, npc=1):
    """
    Subtract from every data point its projection on the leading components.

    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: array of the same shape as X; row i is X[i, :] minus its projection
    """
    print('Removing principal component...')
    components = compute_pc(X, npc)
    #coefficient of every data point along every component
    coefficients = X.dot(components.transpose())
    if npc == 1:
        #single component: broadcast the (n, 1) coefficients against the (1, dim) component
        projection = coefficients * components
    else:
        projection = coefficients.dot(components)
    return X - projection

In [8]:
#create the sentence embedding array
def SIF_embedding(We, x, m, w, rmpc, dim):
    """
    Compute sentence embeddings: weighted average of the word vectors, optionally
    followed by removal of the projection on the leading principal component(s).

    :param We: model passed through to get_weighted_average for word-vector lookup
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param m: m[i, :] is the mask of the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param rmpc: if >0, number of principal components whose projection is removed
    :param dim: length of a word vector
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    averaged = get_weighted_average(We, x, m, w, dim)
    #replace NaN/inf entries with finite numbers before the SVD sees them
    emb = np.nan_to_num(averaged)
    if not rmpc > 0:
        return emb
    return remove_pc(emb, rmpc)

In [9]:
def generate_embeddings(docs, all_data, model, words2idx, dim, rmpc=1):
    """
    Run the full pipeline: word weights -> index/mask/weight arrays -> sentence embeddings.

    :param docs: list of strings (i.e. docs) from which the word weights are estimated
    :param all_data: dataframe column / list of strings (the tweets to embed)
    :param model: pretrained word vectors
    :param words2idx: a dictionary, words2idx['str'] is the index of the word 'str'
    :param dim: dimension of embeddings
    :param rmpc: number of principal components to remove
    :return: array whose i-th row is the embedding of the i-th entry of all_data
    """
    print(dim)

    print('Getting word weights...')
    weight_by_word = get_word_weights(docs)

    print('Loading sentences...')
    #index_matrix holds word indices, mask flags positions that contain a word,
    #weight_matrix holds the weight of the word at each position
    index_matrix, mask, weight_matrix = sentences2idx(all_data, words2idx, weight_by_word)

    print('Creating embeddings...')
    sentence_vectors = SIF_embedding(model, index_matrix, mask, weight_matrix, rmpc, dim)
    return sentence_vectors

In [10]:
#get the samples for computing the word weights
def get_samples_for_computing_word_weights(p, sample_size):
    """
    Draw a random sample of cleaned tweets, save it to disk and return it.

    Reads '<p>_clean/cleaned_text.txt', picks up to ``sample_size`` lines with the
    module-level ``RNG`` (kept in their original file order), writes them to
    '<p>_tweets_for_weights.txt' and returns them.

    :param p: file-name prefix selecting the corpus (called with 'rep' in this notebook)
    :param sample_size: maximum number of lines to sample
    :return: list of the sampled lines
    """
    embeddings_dir = '/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/'
    with open(embeddings_dir + p + '_clean/cleaned_text.txt', 'r') as source:
        lines = source.read().splitlines()
        #never ask for more lines than the file has
        n_draw = min(sample_size, len(lines))
        #sorting the drawn positions keeps the sample in file order
        chosen = sorted(RNG.sample(range(len(lines)), n_draw))
        tweets = [lines[position] for position in chosen]
    with open(embeddings_dir + p +  '_tweets_for_weights.txt', 'w') as sink:
        sink.write('\n'.join(tweets))
    return tweets

In [11]:
#function for cleaning text
#set the stemmer
sno = nltk.stem.SnowballStemmer('english')

#punctuation to strip: ASCII punctuation plus typographic quotes/dashes/ellipsis,
#minus '#', which is kept; sorted so the resulting character class is deterministic
punct_chars = sorted(
    (set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'})
    - {'#'}
)
#all punctuation characters as one string
punctuation = ''.join(punct_chars)
#regex matching any single punctuation character listed above
replace = re.compile('[%s]' % re.escape(punctuation))

##########################################
######function for cleaning text##########
##########################################
def clean_text(text, event=None, stem=True):
    """
    Normalise a raw tweet and split it into tokens.

    Steps, in order: drop '<U+...>' markers, turn '&amp;' into 'and', lower-case,
    blank out URL-like tokens and @mentions, replace punctuation (except '#')
    with spaces, collapse whitespace, split on whitespace and optionally stem.

    Relies on the module-level ``replace`` regex and ``sno`` stemmer.

    :param text: the raw tweet text
    :param event: unused; kept so existing callers keep working
    :param stem: if True, every token is stemmed with the Snowball stemmer
    :return: list of cleaned tokens (possibly empty)
    """
    #remove emojis, which show up as <U+....> markers
    #(raw string: '\+' is not a valid escape sequence in a normal string literal)
    text = re.sub(r'<U\+[^>]+>', '', text)
    #replace &amp; with and
    text = re.sub('&amp;', 'and', text)
    # lower case
    text = text.lower()
    # eliminate urls: tokens starting with http, or containing '.com' or 'www'
    text = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', text)
    #eliminate @mentions
    #NOTE(review): the pattern requires whitespace before '@', so a mention at the
    #very start of the text is not removed (only its '@' is stripped as punctuation
    #below). Left unchanged so the output stays identical - confirm whether intended.
    text = re.sub(r'\s@\S+', ' ', text)
    # substitute all other punctuation with whitespace
    text = replace.sub(' ', text)
    # replace all whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # strip off spaces on either end
    text = text.strip()
    # stem words
    words = text.split()
    if stem:
        words = [sno.stem(w) for w in words]
    return words

In [12]:
#function for cleaning tweet and keeping words in vocab set
def clean_tweet(text):
    """
    Clean a tweet and keep only the tokens that are in the module-level ``vocab``.

    :param text: the raw tweet text
    :return: the kept tokens joined by single spaces ('' when none is kept)
    """
    kept = []
    for token in clean_text(text):
        if token in vocab:
            kept.append(token)
    return ' '.join(kept)

In [13]:
#sentence embeddings are created for the reps first
#dimensionality of the word vectors (and therefore of the sentence embeddings)
d = 100

#load the trained word2vec model
vectors = Word2Vec.load('/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/rep_word2vec_100.model')

#position -> word, following the iteration order of the model's vocabulary
index2word = dict(enumerate(vectors.wv.vocab))

#word -> position, the inverse of the mapping above
words2index = {w: i for i, w in index2word.items()}

#sample of tweets for generating word weights, 30k * 8 months = 240K sample
tweets_for_weights = get_samples_for_computing_word_weights(p = 'rep', sample_size = 240000)

In [14]:
#vocabulary of the loaded word2vec model; clean_tweet keeps only tokens found in it
vocab = vectors.wv.vocab
#display the number of words in the vocabulary
len(vocab)

12612

In [15]:
#load each month's tweets (every column read as a string) and keep only the rows
#with rep_follows == '1' and dem_follows == '0'
list_dfs = []

for m in months:
    df = pd.read_csv(INPUT + m + '_all_tweets.csv', encoding = 'UTF-8', dtype = 'str')
    follows_rep_only = (df['rep_follows'] == '1') & (df['dem_follows'] == '0')
    list_dfs.append(df[follows_rep_only])

#print number of tweets in each dataframe
for df in list_dfs:
    print(len(df))

2655243
2536232
2801577
2724289
3073973
3040480
2274517
1369799


In [16]:
#keep only english tweets and print the proportion of tweets removed for each month
for i, df in enumerate(list_dfs):
    #True for rows whose language tag is "en"
    is_english = df['lang'] == "en"
    #share of the month's tweets that is not english
    print(len(df[~is_english]) / len(df))
    #replace the month's frame with its english-only rows
    list_dfs[i] = df[is_english]

0.08084081193322043
0.08213956767361977
0.08585914290415719
0.08255658632399132
0.09135311208003453
0.09761715255485975
0.09812676713341778
0.08842903228867885


In [17]:
#loop through dfs, clean the text, drop tweets left with fewer than two tokens, save
for i, df in enumerate(list_dfs):
    #each frame is a filtered slice of the raw monthly data; .assign builds a new
    #frame instead of writing the cleaned column into that slice
    df = df.assign(text=df['text'].astype(str).apply(clean_tweet))
    print('before cleaning: ' + str(len(df)))
    #keep only tweets whose cleaned text contains a space; clean_tweet joins the
    #kept tokens with single spaces, so this drops empty and single-token tweets
    list_dfs[i] = df[df['text'].str.contains(' ')]
    print('after cleaning: ' + str(len(list_dfs[i])))

    #save as csv
    list_dfs[i].to_csv(OUTPUT + 'cleaned/rep_clean_' + str(i) + '.csv')

before cleaning: 2440591
after cleaning: 2406714
before cleaning: 2327907
after cleaning: 2294805
before cleaning: 2561036
after cleaning: 2527290
before cleaning: 2499381
after cleaning: 2464728
before cleaning: 2793156
after cleaning: 2750488
before cleaning: 2743677
after cleaning: 2694886
before cleaning: 2051326
after cleaning: 2015834
before cleaning: 1248669
after cleaning: 1230169


In [42]:
#load the Trump tweets; float_precision='round_trip' selects pandas' round-trip
#converter for floating-point columns
df = pd.read_csv("/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/trump_tweets.csv", float_precision='round_trip')

In [44]:
#store the row position as an explicit id before rows are filtered out, so the
#embeddings can later be indexed by it
df['new_id'] = df.index

In [46]:
#clean the tweet text and keep only in-vocabulary tokens (overwrites the raw text column)
df['text'] = df['text'].astype(str).apply(clean_tweet)

In [47]:
#keep only tweets whose cleaned text contains a space, i.e. at least two kept tokens
df = df[df['text'].str.contains(' ')]

In [48]:
#number of tweets left after the filter on cleaned text
len(df)

3141

In [49]:
#sentence embeddings for the cleaned tweets in df; word weights are estimated
#from tweets_for_weights and word vectors come from the loaded word2vec model
embedding = generate_embeddings(tweets_for_weights, df['text'], vectors, words2index, d)

100
Getting word weights...
Loading sentences...
maxlen 60
samples 3141
0
Creating embeddings...
Getting weighted average...
3141 100
0
Removing principal component...
Computing principal components...


In [50]:
#wrap the embedding array in a DataFrame indexed by new_id (one row per tweet)
embedding = pd.DataFrame(embedding, index = df['new_id'])

In [53]:
#number of embedded tweets
len(embedding)

3141

In [51]:
#preview the first 10 embedding rows
embedding.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
new_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.073564,0.081949,-0.005785,-0.202244,0.246758,0.21848,0.007749,-0.02867,-0.133112,-0.132207,...,0.108177,0.024074,0.186911,-0.24394,-0.149253,0.043281,-0.117061,0.126005,0.273804,-0.10519
1,-0.033136,-0.045747,0.216323,0.122513,0.103753,0.171067,0.217599,-0.144914,0.141763,0.038024,...,-0.180825,0.189626,0.030355,-0.510649,-0.180899,-0.03922,-0.027464,0.541595,-0.141667,-0.150322
2,-0.002573,-0.049544,0.649211,-0.144131,-0.586543,0.233466,0.154909,0.247567,-0.401007,0.441537,...,0.701509,0.043682,0.2435,0.072113,-0.056746,0.419754,-0.09161,0.596091,1.080192,0.29777
3,0.056326,0.229281,-0.04527,-0.141668,0.082001,-0.028151,0.310029,0.332903,0.059105,-0.033075,...,-0.332025,-0.080705,-0.216858,0.250835,0.250046,-0.351637,-0.07848,-0.005941,-0.074247,-0.082971
4,0.382766,-0.081024,-0.189134,0.277608,0.182576,-0.232103,-0.53676,0.041914,0.477109,-0.56418,...,-0.242628,-0.78015,-0.382058,0.541565,-0.071992,-0.424914,0.306443,-0.364825,-0.53707,-0.018623
5,-0.092318,-0.610982,0.317876,-0.012196,0.071507,-0.467429,-0.463095,-0.283753,-0.115868,-0.113337,...,-0.157606,-0.041729,0.274131,-0.263535,-0.322494,-0.169191,0.265779,-0.052536,-0.094045,-0.211846
6,-0.394193,0.080165,0.034296,0.134974,-0.558802,-0.53812,-0.121854,-0.496141,0.017788,-0.017248,...,-0.458408,0.29796,0.822449,0.270187,-0.636212,0.401391,-0.156196,-0.409086,-0.679627,0.111035
7,-0.2618,0.117579,-0.177094,-0.079597,-0.190123,0.061346,-0.299552,-0.36649,-0.119471,0.165477,...,0.046086,0.133063,-0.1821,0.045974,0.066244,-0.367505,-0.302448,-0.17139,-0.10382,0.406921
8,-0.464115,-0.137322,-0.259617,0.003323,0.045364,0.068199,-0.183191,-0.049691,0.096815,0.001571,...,-0.054176,-0.094137,0.402045,-0.153957,-0.339999,0.266362,-0.133496,0.238312,-0.633142,0.110635
9,-1.102499,0.186191,-1.182891,-0.113912,-0.78661,0.751293,0.619332,0.322484,-0.945972,-0.139472,...,0.593449,-1.151752,0.34251,0.258641,0.402657,0.225254,-0.215405,1.653555,0.699425,-0.061285


In [52]:
#save the embeddings; the new_id index is written as the first column
embedding.to_csv("/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/trump_tweets_embeds.csv")

In [18]:
#drop the reference to the in-memory monthly frames; the cleaning loop already
#wrote each of them to OUTPUT + 'cleaned/'
del list_dfs

In [None]:
#embed every cleaned monthly file; the cleaning loop wrote one file per entry of
#`months`, so iterate over that many files instead of a hard-coded count
for i in range(len(months)):
    df = pd.read_csv(OUTPUT + 'cleaned/rep_clean_' + str(i) + '.csv')
    embedding = generate_embeddings(tweets_for_weights, df['text'], vectors, words2index, d)
    #df is read without index_col, so df.index is the row position within the cleaned file
    embedding = pd.DataFrame(embedding, index = df.index)
    embedding.to_csv(OUTPUT + 'trained/rep_trained_' + str(i) + '.csv')

100
Getting word weights...
Loading sentences...
maxlen 90
samples 2406714
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
Creating embeddings...
Getting weighted average...
2406714 100
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
Removing principal component...
Computing principal components...
100
Getting word weights...
Loading sentences...
maxlen 119
samples 2294805
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
Creating embeddings...
Getting weighted average...
2294805 100
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
16000