## What are these accounts tweeting about?  
## N-grams and topic modelling

In [39]:
import os
import re
import pprint
import sqlite3 as sql
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

np.random.seed(123)

In [3]:
eng_tweets = pd.read_pickle('../data/eng_tweets.p')
user_rt = pd.read_pickle('../data/user_rt.p')
user_original = pd.read_pickle('../data/user_rt.p')

#with sql.connect('../data/eng_tweets.db') as conn:
#    eng_tweets = pd.read_sql_query('select * from eng_tweets', conn)
eng_tweets.head()

Unnamed: 0,external_author_id,author,content,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,hashtags,mentions
0,906000000000000000,10_GOP,We have a sitting Democrat US Senator on trial...,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,RightTroll,0,905874659358453760,914580356430536707,[],[@nedryun]
1,906000000000000000,10_GOP,Marshawn Lynch arrives to game in anti Trump s...,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,RightTroll,0,905874659358453760,914621840496189440,[],[]
2,906000000000000000,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,1,RightTroll,0,905874659358453760,914623490375979008,[#BoycottNFL],[]
3,906000000000000000,10_GOP,JUST IN President Trump dedicates Presidents C...,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,RightTroll,0,905874659358453760,914639143690555392,[],[]
4,906000000000000000,10_GOP,19000 RESPECTING our National Anthem Stand For...,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,1,RightTroll,0,905874659358453760,914312219952861184,[#StandForOurAnthem],[]


In [4]:
#user_original = pd.read_csv('../data/user_original.csv')
#user_rt = pd.read_csv('../data/user_rt.csv')

#with sql.connect('../data/user_original.db') as conn:
#    user_original = pd.read_sql_query('select * from user_original', conn)
user_original.head()

Unnamed: 0,author,account_category,content
0,10_GOP,RightTroll,Daughter of fallen Navy Sailor delivers powerf...
1,1D_NICOLE_,Fearmonger,Food Poisoning is not a joke Walmart Koch Farm...
2,1ERIK_LEE,RightTroll,Why is someone even against the petition I'll ...
3,4EVER_SUSAN,RightTroll,Raiders defense playing hungry Bending and not...
4,4MYSQUAD,LeftTroll,'politicalseason Maat Justice Injustice in Lou...


In [5]:
#with sql.connect('../data/user_rt.db') as conn:
#    user_rt = pd.read_sql_query('select * from user_rt', conn)
#user_rt.head()

In [6]:
targets = ['RightTroll', 'LeftTroll', 'Newsfeed']
eng_tweets = eng_tweets.loc[eng_tweets.account_category.isin(targets)]

In [7]:
#user_original = user_original[user_original.tweet_text != np.nan]
#user_rt = user_rt[user_rt.tweet_text != np.nan]

### N-grams

In [8]:
unigram_vect = TfidfVectorizer(ngram_range=(1,1), stop_words='english')
bigram_vect = TfidfVectorizer(ngram_range=(2,2), stop_words='english')
trigram_vect = TfidfVectorizer(ngram_range=(3,3), stop_words='english')

### Top 10 words

In [45]:
eng_summaries = "".join(eng_tweets['content'])
eng_unigrams = unigram_vect.build_analyzer()(eng_summaries)

for value, count in Counter(eng_unigrams).most_common(10):
    print(value)

trump
black
just
hillary
people
obama
new
real
like
america


### Top 10 bigrams

In [44]:
#eng_summaries = "".join(eng_tweets['content'])
eng_bigrams = bigram_vect.build_analyzer()(eng_summaries)

for value, count in Counter(eng_bigrams).most_common(10):
    print(value)

donald trump
real donald
lives matter
black lives
hillary clinton
fox news
president trump
white house
fake news
north korea


### Top 10 trigrams

In [43]:
eng_summaries = "".join(eng_tweets['content'])
eng_trigrams = trigram_vect.build_analyzer()(eng_summaries)

#Counter(eng_trigrams).most_common(10)
for value, count in Counter(eng_trigrams).most_common(10):
    print(value)

real donald trump
black lives matter
cnn fakenews cnn
fakenews cnn fakenews
make america great
john mc cain
archived hedge bz
enlist patriot army
black history month
curated hedge bz


### Topic Modelling

In [15]:
len(user_original.author.value_counts())

1019

In [16]:
import gensim
import nltk
import pyLDAvis.gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

np.random.seed(123)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gcdunn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/gcdunn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [18]:
en_stop.add('the')
en_stop.add('amp')

In [19]:
stemmer = SnowballStemmer('english')

In [20]:
def lemma_stem(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text, lemmatize=False):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            if lemmatize == True:
                result.append(lemma_stem(token))
            else:
                result.append(token)
    return result

In [47]:
eng_tweets['processed'] = eng_tweets.apply(lambda row: preprocess(row['content'].lower()), axis=1)
#user_original['processed'] = user_original.apply(lambda row: preprocess(row['content'].lower()), axis=1)
#user_rt['processed'] = user_rt.apply(lambda row: preprocess(row['content'].lower()), axis=1)

In [48]:
user_original.processed.head()

0    [daughter, fallen, navy, sailor, delivers, pow...
1    [food, poisoning, joke, walmart, koch, farms, ...
2    [petition, watch, bleeding, thug, shoots, guns...
3    [raiders, defense, playing, hungry, bending, b...
4    [politicalseason, maat, justice, injustice, lo...
Name: processed, dtype: object

In [49]:
#user_original['tokenized'] = user_original.apply(lambda row: nltk.word_tokenize(row['content'].lower()), axis=1)
#user_rt['tokenized'] = user_rt.apply(lambda row: nltk.word_tokenize(row['content'].lower()), axis=1)

In [50]:
#user_original['tokenized'] = user_original.tokenized.apply(lambda x: [item for item in x if item not in en_stop])
#user_rt['tokenized'] = user_rt.tokenized.apply(lambda x: [item for item in x if item not in en_stop])

In [51]:
#user_original['tokenized'] = user_original.tokenized.apply(lambda x: [item for item in x if len(item) > 2])
#user_rt['tokenized'] = user_rt.tokenized.apply(lambda x: [item for item in x if len(item) > 2])

In [52]:
dictionary = corpora.Dictionary(eng_tweets['processed'])
corpus = [dictionary.doc2bow(text) for text in eng_tweets['processed']]

In [81]:
ntopics = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = ntopics, id2word=dictionary, passes=15)
#ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.014*"pjnet" + 0.014*"trump" + 0.009*"maga" + 0.008*"house" + 0.008*"obama"')
(1, '0.033*"black" + 0.019*"people" + 0.013*"lives" + 0.013*"matter" + 0.010*"women"')
(2, '0.014*"islam" + 0.013*"like" + 0.011*"people" + 0.010*"love" + 0.009*"good"')
(3, '0.043*"hillary" + 0.022*"clinton" + 0.014*"police" + 0.012*"news" + 0.008*"breaking"')
(4, '0.095*"trump" + 0.028*"donald" + 0.026*"real" + 0.020*"president" + 0.017*"tcot"')


In [80]:
#dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
#corpus = pickle.load(open('corpus.pkl', 'rb'))
#lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display);

In [29]:
#dictionary = corpora.Dictionary(user_rt['processed'])
#corpus = [dictionary.doc2bow(text) for text in user_rt['processed']]

In [76]:
#ntopics = 4
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = ntopics, id2word=dictionary, passes=15)
#ldamodel.save('model.gensim')
#topics = ldamodel.print_topics(num_words=5)
#for topic in topics:
#    print(topic)

In [77]:
#lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
#pyLDAvis.display(lda_display)

In [None]:
# the vectorizer object will be used to transform text to vector form
# reject tokens in more than 90% or less than 25% of documents
#vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
#tf = vectorizer.fit_transform(user_original['content']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
#tf_feature_names = vectorizer.get_feature_names()

In [None]:
#number_of_topics = 10

#model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [None]:
#model.fit(tf)