## What are these accounts tweeting about?  
## N-grams and topic modelling

In [18]:
import os
import re
import pprint
import sqlite3 as sql
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

np.random.seed(123)

In [19]:
eng_tweets = pd.read_pickle('../data/eng_tweets.p')
#user_rt = pd.read_pickle('../data/user_rt.p')
#user_original = pd.read_pickle('../data/user_rt.p')

eng_tweets.head()

Unnamed: 0,external_author_id,author,content,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,hashtags,mentions
0,906000000000000000,10_GOP,We have a sitting Democrat US Senator on trial...,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,RightTroll,0,905874659358453760,914580356430536707,[],[@nedryun]
1,906000000000000000,10_GOP,Marshawn Lynch arrives to game in anti Trump s...,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,RightTroll,0,905874659358453760,914621840496189440,[],[]
2,906000000000000000,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,1,RightTroll,0,905874659358453760,914623490375979008,[#BoycottNFL],[]
3,906000000000000000,10_GOP,JUST IN President Trump dedicates Presidents C...,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,RightTroll,0,905874659358453760,914639143690555392,[],[]
4,906000000000000000,10_GOP,19000 RESPECTING our National Anthem Stand For...,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,1,RightTroll,0,905874659358453760,914312219952861184,[#StandForOurAnthem],[]


In [20]:
targets = ['RightTroll', 'LeftTroll', 'NewsFeed']
eng_tweets = eng_tweets.loc[eng_tweets.account_category.isin(targets)]

### N-grams

In [21]:
unigram_vect = TfidfVectorizer(ngram_range=(1,1), stop_words='english')
bigram_vect = TfidfVectorizer(ngram_range=(2,2), stop_words='english')
trigram_vect = TfidfVectorizer(ngram_range=(3,3), stop_words='english')

### Top 10 words

In [22]:
eng_summaries = "".join(eng_tweets['content'])
eng_unigrams = unigram_vect.build_analyzer()(eng_summaries)

for value, count in Counter(eng_unigrams).most_common(10):
    print(value)

trump
black
new
news
police
just
world
man
people
obama


### Top 10 bigrams

In [23]:
#eng_summaries = "".join(eng_tweets['content'])
eng_bigrams = bigram_vect.build_analyzer()(eng_summaries)

for value, count in Counter(eng_bigrams).most_common(10):
    print(value)

donald trump
real donald
lives matter
black lives
hillary clinton
newstop news
world newstop
fox news
white house
president trump


### Top 10 trigrams

In [24]:
eng_summaries = "".join(eng_tweets['content'])
eng_trigrams = trigram_vect.build_analyzer()(eng_summaries)

for value, count in Counter(eng_trigrams).most_common(10):
    print(value)

real donald trump
black lives matter
world newstop news
cnn fakenews cnn
fakenews cnn fakenews
make america great
john mc cain
archived hedge bz
black history month
enlist patriot army


### Topic Modelling

In [25]:
import gensim
import nltk
import pyLDAvis.gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

np.random.seed(123)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gcdunn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/gcdunn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [27]:
en_stop.add('the')
en_stop.add('amp')

In [28]:
stemmer = SnowballStemmer('english')

In [29]:
def lemma_stem(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text, lemmatize=True):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            if lemmatize == True:
                result.append(lemma_stem(token))
            else:
                result.append(token)
    return result

In [30]:
eng_tweets['processed'] = eng_tweets.apply(lambda row: preprocess(row['content'].lower()), axis=1)
#user_original['processed'] = user_original.apply(lambda row: preprocess(row['content'].lower()), axis=1)
#user_rt['processed'] = user_rt.apply(lambda row: preprocess(row['content'].lower()), axis=1)

In [None]:
#user_original['tokenized'] = user_original.apply(lambda row: nltk.word_tokenize(row['content'].lower()), axis=1)
#user_rt['tokenized'] = user_rt.apply(lambda row: nltk.word_tokenize(row['content'].lower()), axis=1)

In [None]:
#user_original['tokenized'] = user_original.tokenized.apply(lambda x: [item for item in x if item not in en_stop])
#user_rt['tokenized'] = user_rt.tokenized.apply(lambda x: [item for item in x if item not in en_stop])

In [None]:
#user_original['tokenized'] = user_original.tokenized.apply(lambda x: [item for item in x if len(item) > 2])
#user_rt['tokenized'] = user_rt.tokenized.apply(lambda x: [item for item in x if len(item) > 2])

In [31]:
dictionary = corpora.Dictionary(eng_tweets['processed'])
corpus = [dictionary.doc2bow(text) for text in eng_tweets['processed']]

In [35]:
ntopics = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = ntopics, id2word=dictionary)
#ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.011*"brooklyn" + 0.011*"south" + 0.010*"black" + 0.010*"news" + 0.010*"louis"')
(1, '0.039*"trump" + 0.025*"politics" + 0.013*"aleppo" + 0.010*"hillary" + 0.010*"president"')
(2, '0.028*"syria" + 0.018*"local" + 0.012*"crash" + 0.009*"army" + 0.008*"dies"')
(3, '0.010*"nypd" + 0.009*"cops" + 0.009*"suspect" + 0.008*"politics" + 0.005*"turkish"')
(4, '0.037*"miami" + 0.025*"news" + 0.021*"sports" + 0.020*"local" + 0.018*"police"')


### Right trolls, left trolls,  and newsfeeds discuss:
1. ISIS and Syria
2. Trump, Clinton and politics
3. Local news
4. Black Lives Matter and police
5. Guns

In [36]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)