# Testing the effects of Word2Vec on propaganda

First, we define a function that converts a piece of text (an article or a tweet) into a clean list of lowercase words.

In [1]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return the NLTK stop-word set for ``language``, building it at most once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def text_to_wordlist(text, remove_stopwords = False):
    """
    Converts a piece of text into a list of lowercase words.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, accented characters, ...) is treated as a separator, so
    the result contains only runs of a-z.

    Parameters
    ----------
    text : str
        The raw text to tokenize.
    remove_stopwords : bool, optional
        When True, English stop words from ``nltk.corpus.stopwords`` are
        dropped from the result (requires that corpus to be available).

    Returns
    -------
    list of str
        The words in their original order; empty if ``text`` has no letters.
    """

    # 1. Remove HTML (currently disabled)
    # text = BeautifulSoup(text).get_text()

    # 2. Replace every non-letter with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # 3. Convert to lowercase and split on whitespace
    words = letters_only.lower().split()

    # 4. (Optional) Remove stop words. The set is cached because this
    #    function is called once per sentence over the whole corpus, and
    #    rebuilding the set on every call is wasted work.
    if remove_stopwords:
        stops = _stopword_set("english")
        words = [w for w in words if w not in stops]

    # 5. Return the list
    return words

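Test the function on our propaganda text: load every tweet (one per line) from `tweets-large.txt`, then report the total word count and the number of tweets.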

In [2]:
# Read every tweet (one per line). The `with` block closes the file as soon
# as the lines have been read, instead of leaving the handle open.
with open('tweets-large.txt', 'r') as tweet_file:
    # Despite the name, these are the raw lines, trailing newline included.
    clean_tweets = list(tweet_file)

# The whole corpus as one string (each tweet followed by a space), built
# with a single join rather than repeated `+=` inside a loop.
tweet_text = ''.join(tweet + ' ' for tweet in clean_tweets)

print(f'There are {len(text_to_wordlist(tweet_text))} words')
print(len(clean_tweets))

There are 22722162 words
2106034


Next, we add a function that splits a tweet into sentences with NLTK's Punkt tokenizer and returns all of the tweet's words as a single token list.

In [3]:
import nltk.data
# nltk.download()

# Load the English Punkt sentence tokenizer that is passed to
# text_to_sentences below.
# NOTE(review): this needs the NLTK `punkt` resource to be installed locally
# (presumably what the commented-out nltk.download() above was for), and
# newer NLTK releases may expect the `punkt_tab` resource instead of this
# pickle path; confirm against the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def text_to_sentences(tweet, tokenizer, remove_stopwords = False):
    """
    Tokenizes one tweet into a single flat list of words.

    The stripped tweet is split into sentences by ``tokenizer.tokenize``;
    the word list of every non-empty sentence (as produced by
    ``text_to_wordlist``) is then concatenated in order. The result is one
    list of words for the whole tweet, not a list of per-sentence lists.
    """

    words = []

    # Walk the sentences found by the tokenizer, skipping empty ones
    for raw_sentence in tokenizer.tokenize(tweet.strip()):
        if not raw_sentence:
            continue
        words.extend(text_to_wordlist(raw_sentence, remove_stopwords))

    return words

In [4]:
# Tokenize every tweet with stop words removed; each entry of `sentences`
# is the word list of one tweet.
sentences = [
    text_to_sentences(tweet, tokenizer, True)
    for tweet in clean_tweets
]

print(f'There are {len(sentences)} sentences')

There are 2106034 sentences


Now we can train and store the model!

In [None]:
# Configure the standard logging module first, so that the progress
# messages emitted while Word2Vec trains are displayed
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Hyper-parameters passed to Word2Vec below
num_features = 1000   # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-5   # Downsample setting for frequent words

# Build the vocabulary and train the model (this will take some time)
# NOTE(review): `common_texts` is not referenced in the visible cells, so it
# looks like a candidate for removal; confirm nothing else relies on it.
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
print('Training model...')
# NOTE(review): `size=` and `init_sims()` are the gensim 3.x spellings;
# gensim 4 renamed `size` to `vector_size` and deprecated `init_sims`.
# Confirm the installed gensim version before re-running.
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# No further training is planned, so init_sims(replace=True) is called to
# make the model much more memory-efficient.
model.init_sims(replace=True)

# Save the model under a meaningful name; it can be loaded again later
# using Word2Vec.load()
model_name = 'propaganda-large-2'
model.save(model_name)

2020-03-15 14:53:49,725 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-03-15 14:53:49,726 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-03-15 14:53:49,727 : INFO : collecting all words and their counts
2020-03-15 14:53:49,728 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-15 14:53:49,742 : INFO : PROGRESS: at sentence #10000, processed 72754 words, keeping 16219 word types
2020-03-15 14:53:49,759 : INFO : PROGRESS: at sentence #20000, processed 144738 words, keeping 26002 word types
2020-03-15 14:53:49,777 : INFO : PROGRESS: at sentence #30000, processed 230874 words, keeping 29868 word types
2020-03-15 14:53:49,789 : INFO : PROGRESS: at sentence #40000, processed 295384 words, keeping 34253 word types
2020-03-15 14:53:49,802 : INFO : PROGRESS: at sentence #50000, processed 371770 words, keeping 38730 word types
2020-03-15

Training model...


2020-03-15 14:53:49,845 : INFO : PROGRESS: at sentence #80000, processed 603931 words, keeping 53004 word types
2020-03-15 14:53:49,862 : INFO : PROGRESS: at sentence #90000, processed 700959 words, keeping 57964 word types
2020-03-15 14:53:49,880 : INFO : PROGRESS: at sentence #100000, processed 795413 words, keeping 62909 word types
2020-03-15 14:53:49,898 : INFO : PROGRESS: at sentence #110000, processed 891267 words, keeping 67731 word types
2020-03-15 14:53:49,914 : INFO : PROGRESS: at sentence #120000, processed 981608 words, keeping 70859 word types
2020-03-15 14:53:49,929 : INFO : PROGRESS: at sentence #130000, processed 1066278 words, keeping 73770 word types
2020-03-15 14:53:49,940 : INFO : PROGRESS: at sentence #140000, processed 1122244 words, keeping 75400 word types
2020-03-15 14:53:49,955 : INFO : PROGRESS: at sentence #150000, processed 1203878 words, keeping 78400 word types
2020-03-15 14:53:49,971 : INFO : PROGRESS: at sentence #160000, processed 1293627 words, keepin

2020-03-15 14:53:50,985 : INFO : PROGRESS: at sentence #800000, processed 6077255 words, keeping 190164 word types
2020-03-15 14:53:50,998 : INFO : PROGRESS: at sentence #810000, processed 6141380 words, keeping 190901 word types
2020-03-15 14:53:51,014 : INFO : PROGRESS: at sentence #820000, processed 6200276 words, keeping 191433 word types
2020-03-15 14:53:51,023 : INFO : PROGRESS: at sentence #830000, processed 6245648 words, keeping 192183 word types
2020-03-15 14:53:51,038 : INFO : PROGRESS: at sentence #840000, processed 6326221 words, keeping 193865 word types
2020-03-15 14:53:51,060 : INFO : PROGRESS: at sentence #850000, processed 6403791 words, keeping 195298 word types
2020-03-15 14:53:51,079 : INFO : PROGRESS: at sentence #860000, processed 6473978 words, keeping 197690 word types
2020-03-15 14:53:51,090 : INFO : PROGRESS: at sentence #870000, processed 6524148 words, keeping 199434 word types
2020-03-15 14:53:51,101 : INFO : PROGRESS: at sentence #880000, processed 656751

2020-03-15 14:53:52,172 : INFO : PROGRESS: at sentence #1510000, processed 11311881 words, keeping 247190 word types
2020-03-15 14:53:52,189 : INFO : PROGRESS: at sentence #1520000, processed 11393078 words, keeping 247474 word types
2020-03-15 14:53:52,205 : INFO : PROGRESS: at sentence #1530000, processed 11474445 words, keeping 247812 word types
2020-03-15 14:53:52,220 : INFO : PROGRESS: at sentence #1540000, processed 11555124 words, keeping 248172 word types
2020-03-15 14:53:52,235 : INFO : PROGRESS: at sentence #1550000, processed 11629501 words, keeping 248438 word types
2020-03-15 14:53:52,247 : INFO : PROGRESS: at sentence #1560000, processed 11687739 words, keeping 248870 word types
2020-03-15 14:53:52,259 : INFO : PROGRESS: at sentence #1570000, processed 11746595 words, keeping 249236 word types
2020-03-15 14:53:52,272 : INFO : PROGRESS: at sentence #1580000, processed 11815970 words, keeping 249640 word types
2020-03-15 14:53:52,287 : INFO : PROGRESS: at sentence #1590000,

2020-03-15 14:54:04,929 : INFO : EPOCH 1 - PROGRESS: at 16.94% examples, 385746 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:54:05,932 : INFO : EPOCH 1 - PROGRESS: at 23.02% examples, 386080 words/s, in_qsize 7, out_qsize 1
2020-03-15 14:54:06,962 : INFO : EPOCH 1 - PROGRESS: at 28.96% examples, 386915 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:54:07,965 : INFO : EPOCH 1 - PROGRESS: at 35.04% examples, 389526 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:54:08,973 : INFO : EPOCH 1 - PROGRESS: at 40.73% examples, 386006 words/s, in_qsize 6, out_qsize 1
2020-03-15 14:54:09,981 : INFO : EPOCH 1 - PROGRESS: at 47.37% examples, 388721 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:54:11,030 : INFO : EPOCH 1 - PROGRESS: at 52.68% examples, 384226 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:54:12,040 : INFO : EPOCH 1 - PROGRESS: at 58.26% examples, 385398 words/s, in_qsize 6, out_qsize 1
2020-03-15 14:54:13,053 : INFO : EPOCH 1 - PROGRESS: at 64.01% examples, 386562 words/s, in_qsiz

2020-03-15 14:55:07,444 : INFO : EPOCH 4 - PROGRESS: at 64.53% examples, 359781 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:55:08,458 : INFO : EPOCH 4 - PROGRESS: at 70.30% examples, 358686 words/s, in_qsize 8, out_qsize 3
2020-03-15 14:55:09,461 : INFO : EPOCH 4 - PROGRESS: at 75.57% examples, 357600 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:55:10,480 : INFO : EPOCH 4 - PROGRESS: at 81.12% examples, 357839 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:55:11,496 : INFO : EPOCH 4 - PROGRESS: at 86.16% examples, 355050 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:55:12,497 : INFO : EPOCH 4 - PROGRESS: at 91.86% examples, 353642 words/s, in_qsize 7, out_qsize 0
2020-03-15 14:55:13,521 : INFO : EPOCH 4 - PROGRESS: at 96.60% examples, 352290 words/s, in_qsize 6, out_qsize 1
2020-03-15 14:55:14,093 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-03-15 14:55:14,097 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-15 14:55:14,120 : I

Now we test out the model...

In [24]:
# Words whose nearest neighbours in the embedding space we want to inspect.
query_words = [
    'trump',
    'hillary',
    'obama',
    'immigrant',
    'muslim',
    'media',
    'abortions',
    'russia',
]

# One loop instead of eight copy-pasted print pairs; the printed text for
# words that are in the vocabulary is the same as before.
for word in query_words:
    print(f'Most similar to "{word}":')
    if word in model.wv:
        print(model.wv.most_similar(word), end='\n\n')
    else:
        # Words rarer than the min_count used at training time are not in
        # the vocabulary; report that instead of letting most_similar()
        # abort the whole cell with a KeyError.
        print('(not in the model vocabulary)', end='\n\n')

Most similar to "trump":
[('impeach', 0.9377140402793884), ('romney', 0.9375194311141968), ('supporters', 0.9333128333091736), ('newt', 0.9302890300750732), ('gingrich', 0.9285157322883606), ('bitter', 0.9285004138946533), ('behna', 0.9277978539466858), ('tantrum', 0.9268388152122498), ('forgets', 0.9253718256950378), ('temper', 0.9249578714370728)]

Most similar to "hillary":
[('clinton', 0.951890230178833), ('podestaemails', 0.9492028951644897), ('dnc', 0.9432393312454224), ('hillarysemails', 0.9405421018600464), ('dncleak', 0.9361636638641357), ('lied', 0.9330999851226807), ('bff', 0.9328992962837219), ('susanrice', 0.9327013492584229), ('crooked', 0.9313591122627258), ('crookedhillary', 0.9278271198272705)]

Most similar to "obama":
[('undermine', 0.9506447315216064), ('hostile', 0.9470846652984619), ('agreed', 0.9441496729850769), ('admin', 0.9425696134567261), ('directly', 0.9397111535072327), ('treason', 0.939399242401123), ('tactics', 0.9370774030685425), ('purge', 0.9355739355

[('vladimir', 0.9627082347869873),
 ('russia', 0.9338808655738831),
 ('kremlin', 0.8904656767845154),
 ('lavrov', 0.8867936730384827),
 ('sochitalks', 0.8861778974533081),
 ('kerry', 0.8700426816940308),
 ('aggression', 0.8657740354537964),
 ('assad', 0.8646084666252136),
 ('ukraine', 0.8645811080932617),
 ('nk', 0.8620179891586304)]