In [1]:
from __future__ import division
import random
import gc
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import nltk
import string
import re

In [2]:
# Dedicated random generator with a fixed seed so the tweet sample drawn for
# the word weights is the same on every fresh run of the notebook.
RNG = random.Random(400)

# INPUT: directory the monthly tweet CSVs are read from.
INPUT = "/home/twalton_umass_edu/Political Polarization Project/tmls/month_tmls/"
# OUTPUT: directory the cleaned tweets and sentence embeddings are written to.
OUTPUT = "/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/sent_embed/"

# Month labels used to build the input file names; the position in this list
# is the numeric suffix used for the output files.
months = [
    "jun", "jul", "aug", "sep",
    "oct", "nov", "dec", "jan",
]

In [3]:
def get_word_weights(files, a=1e-3):
    """
    Compute a smoothed inverse-frequency weight for every word in a corpus.

    The weight of a word is a / (a + p(word)), where p(word) is the word's
    share of all tokens counted by the vectorizer.

    :param files: iterable of text documents, passed unchanged to CountVectorizer.fit_transform
    :param a: smoothing constant of the weighting formula
    :return: dict mapping each vocabulary word to its weight as a plain Python float
    """
    vectorizer = CountVectorizer(decode_error='ignore')
    #get word frequencies (documents x vocabulary, sparse)
    counts = vectorizer.fit_transform(files)
    #total number of times each word was used; summing a sparse matrix gives a
    #2-D np.matrix, so flatten it to 1-D -- otherwise every dict value below
    #would be a 1x1 matrix instead of a scalar
    total_freq = np.asarray(counts.sum(axis=0)).ravel()
    #number of counted tokens in the files
    N = total_freq.sum()
    #get the weighted frequency for each word
    weighted_freq = a / (a + total_freq / N)
    #newer scikit-learn exposes get_feature_names_out(); older releases only
    #have get_feature_names(), so use whichever is available
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    #garbage collection
    gc.collect()
    # dict with words and their weights
    return dict(zip(feature_names, weighted_freq.tolist()))

In [4]:
def sentences2idx(sentences, words2index, words2weight, max_len=150, default_weight=0.000001):
    """
    Convert whitespace-tokenised sentences into padded index, mask and weight arrays.

    Words missing from ``words2index`` are skipped. Each row is filled from the
    left and truncated to the array width; unused cells stay 0.

    :param sentences: a sized iterable of sentence strings
    :param words2index: mapping word -> integer index of the word
    :param words2weight: mapping word -> weight of the word
    :param max_len: upper bound on the array width (number of word slots per sentence)
    :param default_weight: weight used for a word that is in ``words2index`` but not in ``words2weight``
    :return: x, x_mask, w. x[i, :] are the word indices of sentence i (int32),
             x_mask[i, :] is 1 where a word is present and 0 elsewhere (float32),
             w[i, :] are the matching word weights (float32)
    """
    #get the number of sentences/tweets
    n_samples = len(sentences)
    #token count of the longest sentence; default=0 keeps an empty batch from raising
    longest = max((len(s.split()) for s in sentences), default=0)
    #array width: the longest sentence, capped at max_len
    maxlen = min(longest, max_len)
    print('maxlen', maxlen)
    print('samples', n_samples)
    #each row is a sentence and each column a word slot
    x = np.zeros((n_samples, maxlen), dtype='int32')
    w = np.zeros((n_samples, maxlen), dtype='float32')
    x_mask = np.zeros((n_samples, maxlen), dtype='float32')
    for idx, s in enumerate(sentences):
        #progress marker every 100000 sentences
        if idx % 100000 == 0:
            print(idx)
        indices = []
        weightlist = []
        for word in s.split():
            #only words that have an index are kept
            if word not in words2index:
                continue
            indices.append(words2index[word])
            #words without a weight (not seen in the weighting sample) get default_weight;
            #float() also accepts size-1 numpy values so the row assignment below stays 1-D
            weight = words2weight[word] if word in words2weight else default_weight
            weightlist.append(float(weight))
        #number of cells to fill in the three arrays for this sentence
        length = min(len(indices), maxlen)
        x[idx, :length] = indices[:length]
        w[idx, :length] = weightlist[:length]
        x_mask[idx, :length] = 1.
    #clean the memory
    gc.collect()
    return x, x_mask, w

In [5]:
def get_weighted_average(We, x, m, w, dim, index2word_map=None):
    """
    Compute the weighted average word vector of every sentence.

    For sentence i the result is sum_k w[i, k] * vector(word at x[i, k]) over
    the positions where m[i, k] == 1, divided by the sum of m[i, :]. A sentence
    whose mask sums to 0 gets an all-zero embedding.

    :param We: object whose ``wv`` attribute returns a word's vector via ``We.wv[word]``
    :param x: x[i, :] are the indices of the words in sentence i
    :param m: m[i, :] is the mask of sentence i (1 where a word is present)
    :param w: w[i, :] are the weights for the words in sentence i
    :param dim: length of each word vector and of each sentence embedding
    :param index2word_map: mapping index -> word; when None the notebook-level
                           ``index2word`` global is used, as before
    :return: emb[i, :] is the weighted average vector for sentence i (float32)
    """
    if index2word_map is None:
        #fall back to the module-level mapping so existing callers keep working
        index2word_map = index2word
    print('Getting weighted average...')
    n_samples = x.shape[0]
    print(n_samples, dim)
    #one row per sentence, one column per embedding dimension
    emb = np.zeros((n_samples, dim)).astype('float32')

    for i in range(n_samples):
        if i % 100000 == 0:
            print(i)
        #the divisor is the mask total
        n_words = np.sum(m[i, :])
        #only positions flagged with exactly 1 contribute a word vector
        valid = m[i, :] == 1
        if n_words == 0 or not valid.any():
            #no words to average: leave the row at zero
            continue
        #look up vectors for the real words only; padded slots would contribute
        #nothing to the dot product, so they are skipped instead of stacking zero rows
        word_vecs = np.stack([We.wv[index2word_map[j]] for j in x[i, valid]]).astype('float64')
        emb[i, :] = w[i, valid].dot(word_vecs) / n_words
    #clear memory
    gc.collect()
    return emb

In [6]:
#compute the principal component of the sentence embeddings
def compute_pc(X,npc=1):
    """
    Fit a truncated SVD on X and return its components.

    The data is passed to the SVD as given, i.e. it is not centred here.

    :param X: X[i,:] is a data point
    :param npc: number of components to compute
    :return: the fitted ``components_`` array; row i is the i-th component
    """
    decomposer = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    print('Computing principal components...')
    # fit() returns the fitted estimator, so the components can be read off directly
    return decomposer.fit(X).components_

In [7]:
#remove the principal component of the sentence embeddings
def remove_pc(X, npc=1):
    """
    Subtract from every row of X its projection onto the leading components.

    :param X: X[i,:] is a data point
    :param npc: number of components (from compute_pc) to project out
    :return: array of the same shape as X with the projections removed
    """
    print('Removing principal component...')
    components = compute_pc(X, npc)
    # coefficient of every data point along every component
    coefficients = X.dot(components.transpose())
    if npc == 1:
        # single component: rebuild the projection by broadcasting
        projection = coefficients * components
    else:
        projection = coefficients.dot(components)
    return X - projection

In [8]:
#create the sentence embedding array
def SIF_embedding(We, x, m, w, rmpc, dim):
    """
    Build sentence embeddings: weighted average of word vectors, optionally
    followed by removal of the leading component projections.

    :param We: word-vector model handed to get_weighted_average
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param m: m[i, :] is the mask for the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param rmpc: if > 0, number of components whose projection is removed
    :param dim: dimension of the embeddings
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    averaged = get_weighted_average(We, x, m, w, dim)
    # replace any NaN / inf produced by the averaging with finite numbers
    emb = np.nan_to_num(averaged)
    if not rmpc > 0:
        return emb
    return remove_pc(emb, rmpc)

In [9]:
def generate_embeddings(docs, all_data, model, words2idx, dim, rmpc=1):
    """
    Run the full pipeline: word weights -> index arrays -> sentence embeddings.

    :param docs: list of strings used to estimate the word weights (see get_word_weights)
    :param all_data: dataframe column / list of strings (all tweets) to embed
    :param model: pretrained word vectors
    :param words2idx: a dictionary, words2idx['str'] is the index of the word 'str'
    :param dim: dimension of embeddings
    :param rmpc: number of principal components to remove
    :return: array whose row i is the embedding for sentence i
    """
    print(dim)

    # step 1: weight of every word seen in docs
    print('Getting word weights...')
    weights_by_word = get_word_weights(docs)

    # step 2: padded arrays of word indices, presence mask and word weights
    print('Loading sentences...')
    index_arr, mask_arr, weight_arr = sentences2idx(all_data, words2idx, weights_by_word)

    # step 3: weighted averages with the leading component(s) removed
    print('Creating embeddings...')
    sentence_emb = SIF_embedding(model, index_arr, mask_arr, weight_arr, rmpc, dim)
    return sentence_emb

In [10]:
#get the samples for computing the word weights
def get_samples_for_computing_word_weights(p, sample_size):
    """
    Draw a random sample of cleaned tweets for group `p`, save it and return it.

    :param p: group prefix used in the input and output file names
    :param sample_size: maximum number of lines to draw; capped at the number of lines in the file
    :return: list of sampled lines, kept in their original file order
    """
    base_dir = '/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/'
    with open(base_dir + p + '_clean/cleaned_text.txt', 'r') as f:
        lines = f.read().splitlines()
    # sample line positions with the shared seeded generator, then restore file order
    n_draw = min(sample_size, len(lines))
    positions = sorted(RNG.sample(range(len(lines)), n_draw))
    tweets = []
    for pos in positions:
        tweets.append(lines[pos])
    # keep a copy of the sample on disk
    with open(base_dir + p +  '_tweets_for_weights.txt', 'w') as f:
        f.write('\n'.join(tweets))
    return tweets

In [11]:
#function for cleaning text
#set the stemmer
#English Snowball stemmer shared by clean_text
sno = nltk.stem.SnowballStemmer('english')

#characters to strip: ASCII punctuation plus a few typographic symbols,
#minus '#' which is kept; sorted() returns them as an ordered list
punct_chars = sorted(
    set(string.punctuation).union({'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}) - {'#'}
)
#single string holding all of those characters
punctuation = ''.join(punct_chars)
#character-class regex matching any one of them
replace = re.compile('[%s]' % re.escape(punctuation))

##########################################
######function for cleaning text##########
##########################################
def clean_text(text, event=None, stem=True):
    """
    Normalise a raw tweet and split it into tokens.

    :param text: raw tweet string
    :param event: not used by this function; kept so the signature stays unchanged
    :param stem: when True every token is passed through the module-level stemmer `sno`
    :return: list of tokens after lower-casing, URL/mention/punctuation removal and optional stemming
    """
    #remove <U+XXXX> tags (raw string: '\+' is not a valid escape in a normal string literal)
    text = re.sub(r'<U\+[^>]+>', '', text)
    #replace &amp; with and
    text = re.sub('&amp;', 'and', text)
    # lower case
    text = text.lower()
    # eliminate urls
    text = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', text)
    #eliminate @mentions that follow whitespace
    #NOTE(review): a mention at the very start of the text is not matched by this
    #pattern; left as is so the output stays consistent with previously cleaned data
    text = re.sub(r'\s@\S+', ' ', text)
    # substitute all other punctuation with whitespace
    text = replace.sub(' ', text)
    # replace all whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # strip off spaces on either end
    text = text.strip()
    # stem words
    words = text.split()
    if stem:
        words = [sno.stem(w) for w in words]
    return words

In [12]:
#function for cleaning tweet and keeping words in vocab set
def clean_tweet(text):
    """
    Clean a tweet with clean_text and keep only the tokens present in the
    notebook-level `vocab`.

    :param text: raw tweet string
    :return: the surviving tokens joined by single spaces
    """
    kept = []
    for token in clean_text(text):
        if token in vocab:
            kept.append(token)
    return ' '.join(kept)

In [13]:
#build the sentence embeddings for the dem group
#dimension of the word vectors and of the sentence embeddings
d = 100

#load the trained word2vec model
vectors = Word2Vec.load('/home/twalton_umass_edu/Political Polarization Project/tmls/word_embeddings/dem_word2vec_100.model')

#position -> word, following the iteration order of the model vocabulary
index2word = dict(enumerate(vectors.wv.vocab))

#word -> position, the inverse of the mapping above
words2index = {word: position for position, word in index2word.items()}

#sample of tweets for generating word weights (up to 240,000 lines)
tweets_for_weights = get_samples_for_computing_word_weights(p = 'dem', sample_size = 240000)

In [14]:
#vocabulary of the loaded word2vec model; clean_tweet() keeps only tokens found in it
vocab = vectors.wv.vocab
#display the vocabulary size
len(vocab)

13504

In [16]:
#load the monthly data and keep only rows with dem_follows == '1' and rep_follows == '0'
list_dfs = []

for m in months:
    #every column is read as a string, so the flags are compared against '1' / '0'
    df = pd.read_csv(INPUT + m + '_all_tweets.csv', encoding = 'UTF-8', dtype = 'str')
    list_dfs.append(df[(df['dem_follows'] == '1') & (df['rep_follows'] == '0')]) #append only the dem-following rows
    
#print number of tweets in each dataframe
for i, df in enumerate(list_dfs):
    print(len(df))

644790
632178
660602
629154
720484
624539
442501
562374


In [17]:
#keep only english tweets and print the proportion of tweets removed for each month
for i, df in enumerate(list_dfs):
    #print proportion that is not english
    print(len(df[df['lang'] != "en"]) / len(df))
    #keep only english tweets in the list of dfs
    list_dfs[i] = df[df['lang'] == "en"]

0.06423176538097675
0.06816750978363689
0.06839973236532738
0.06623338642049482
0.08073184137329906
0.08639332371557261
0.07786649069719616
0.06645577498248496


In [18]:
#loop through dfs, clean the text, drop tweets left with fewer than two tokens, and save
for i, df in enumerate(list_dfs):
    #assign() builds a new frame, so the filtered slice held in list_dfs is not
    #written to in place (avoids SettingWithCopyWarning and hidden aliasing with df)
    cleaned = df.assign(text=df['text'].astype(str).apply(clean_tweet))
    print('before cleaning: ' + str(len(cleaned)))
    #clean_tweet joins tokens with single spaces, so a text without a space holds
    #at most one token; those rows are dropped
    list_dfs[i] = cleaned[cleaned['text'].str.contains(' ')]
    print('after cleaning: ' + str(len(list_dfs[i])))

    #save as csv
    list_dfs[i].to_csv(OUTPUT + 'cleaned/dem_clean_' + str(i) + '.csv')

before cleaning: 603374
after cleaning: 596779
before cleaning: 589084
after cleaning: 582628
before cleaning: 615417
after cleaning: 608384
before cleaning: 587483
after cleaning: 580913
before cleaning: 662318
after cleaning: 654338
before cleaning: 570583
after cleaning: 561750
before cleaning: 408045
after cleaning: 401951
before cleaning: 525001
after cleaning: 517796


In [19]:
#drop the reference to the in-memory monthly frames; the cleaned data is read back from the saved CSV files
del list_dfs

In [20]:
#one cleaned file is read per entry of `months`, so the loop length follows that
#list instead of a literal 8
for i in range(len(months)):
    #reload the cleaned tweets written for month i
    df = pd.read_csv(OUTPUT + 'cleaned/dem_clean_' + str(i) + '.csv')
    #sentence embeddings for every cleaned tweet of the month
    embedding = generate_embeddings(tweets_for_weights, df['text'], vectors, words2index, d)
    #keep the row labels of the cleaned frame so embeddings can be matched back to tweets
    embedding = pd.DataFrame(embedding, index = df.index)
    print('saving embedding...')
    embedding.to_csv(OUTPUT + 'trained/dem_trained_' + str(i) + '.csv')

100
Getting word weights...
Loading sentences...
maxlen 150
samples 596779
0
100000
200000
300000
400000
500000
Creating embeddings...
Getting weighted average...
596779 100
0
100000
200000
300000
400000
500000
Removing principal component...
Computing principal components...
saving embedding...
100
Getting word weights...
Loading sentences...
maxlen 96
samples 582628
0
100000
200000
300000
400000
500000
Creating embeddings...
Getting weighted average...
582628 100
0
100000
200000
300000
400000
500000
Removing principal component...
Computing principal components...
saving embedding...
100
Getting word weights...
Loading sentences...
maxlen 84
samples 608384
0
100000
200000
300000
400000
500000
600000
Creating embeddings...
Getting weighted average...
608384 100
0
100000
200000
300000
400000
500000
600000
Removing principal component...
Computing principal components...
saving embedding...
100
Getting word weights...
Loading sentences...
maxlen 150
samples 580913
0
100000
200000
300000