In [1]:
import os
import sqlite3 as sql
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [82]:
#user_original = pd.read_csv('../data/user_original.csv')
#user_rt = pd.read_csv('../data/user_rt.csv')

with sql.connect('../data/user_original.db') as conn:
    user_original = pd.read_sql_query('select * from user_original', conn)
user_original.head()

Unnamed: 0,author,content
0,10_GOP,[' We have a sitting Democrat US Senator on tr...
1,1D_NICOLE_,['I hurt easy I just don t show it You can hur...
2,1ERIK_LEE,[]
3,1LORENAFAVA1,[]
4,2NDHALFONION,[' Follow the other half an onion Mage America...


In [83]:
with sql.connect('../data/user_rt.db') as conn:
    user_rt = pd.read_sql_query('select * from user_rt', conn)
user_rt.head()

Unnamed: 0,author,content
0,10_GOP,['Daughter of fallen Navy Sailor delivers powe...
1,1D_NICOLE_,"[' is not a joke ', 'Thanks God that i ate few..."
2,1ERIK_LEE,['Why is someone even against the I ll watch y...
3,1LORENAFAVA1,['Come vedere Juventus Milan in streaming o in...
4,2NDHALFONION,[]


In [84]:
#user_original = user_original[user_original.tweet_text != np.nan]
#user_rt = user_rt[user_rt.tweet_text != np.nan]

### N-grams

In [85]:
unigram_vect = TfidfVectorizer(ngram_range=(1,1), stop_words='english')
bigram_vect = TfidfVectorizer(ngram_range=(2,2), stop_words='english')
trigram_vect = TfidfVectorizer(ngram_range=(3,3), stop_words='english')

In [86]:
original_summaries = "".join(user_original['content'])
original_unigrams = unigram_vect.build_analyzer()(original_summaries)

Counter(original_unigrams).most_common(40)

[('trump', 84092),
 ('police', 43508),
 ('workout', 41977),
 ('man', 38983),
 ('new', 38317),
 ('just', 29565),
 ('obama', 25321),
 ('says', 23491),
 ('people', 22442),
 ('breaking', 21636),
 ('video', 19544),
 ('state', 19318),
 ('like', 17599),
 ('president', 17429),
 ('don', 16930),
 ('year', 16553),
 ('killed', 16520),
 ('black', 16165),
 ('woman', 16079),
 ('hillary', 15560),
 ('shooting', 15205),
 ('day', 15200),
 ('amp', 14920),
 ('clinton', 14716),
 ('time', 14031),
 ('exercise', 13928),
 ('white', 13653),
 ('need', 12928),
 ('good', 12478),
 ('city', 12473),
 ('say', 12467),
 ('america', 12273),
 ('old', 12264),
 ('house', 12231),
 ('look', 12219),
 ('make', 12017),
 ('dead', 11618),
 ('media', 11388),
 ('death', 11343),
 ('shot', 11209)]

In [87]:
original_summaries = "".join(user_original['content'])
original_bigrams = bigram_vect.build_analyzer()(original_summaries)

Counter(original_bigrams).most_common(40)

[('year old', 8558),
 ('lose weight', 6647),
 ('donald trump', 6621),
 ('north korea', 5428),
 ('president trump', 5410),
 ('new orleans', 4783),
 ('white house', 4710),
 ('hillary clinton', 4286),
 ('islamic state', 3696),
 ('new york', 3303),
 ('cnn cnn', 3280),
 ('police say', 2938),
 ('anti trump', 2863),
 ('high school', 2732),
 ('trump supporters', 2589),
 ('st louis', 2588),
 ('san jose', 2482),
 ('san francisco', 2389),
 ('police officer', 2338),
 ('weight loss', 2293),
 ('fake news', 2264),
 ('man shot', 2252),
 ('supreme court', 2246),
 ('trump just', 1989),
 ('man charged', 1953),
 ('good workout', 1952),
 ('workout workout', 1920),
 ('super bowl', 1885),
 ('pleads guilty', 1845),
 ('police blotter', 1713),
 ('man arrested', 1707),
 ('man accused', 1690),
 ('san diego', 1678),
 ('bernie sanders', 1621),
 ('patriot army', 1564),
 ('breaking trump', 1490),
 ('workout today', 1475),
 ('kim jong', 1448),
 ('hit run', 1429),
 ('man killed', 1425)]

In [88]:
original_summaries = "".join(user_original['content'])
original_trigrams = trigram_vect.build_analyzer()(original_summaries)

Counter(original_trigrams).most_common(40)

[('cnn cnn cnn', 3234),
 ('enlist patriot army', 1144),
 ('black lives matter', 1085),
 ('year old girl', 949),
 ('need lose weight', 916),
 ('new orleans saints', 895),
 ('year old boy', 828),
 ('south china sea', 701),
 ('year old man', 673),
 ('trump supporters react', 608),
 ('rewind 07 2017', 544),
 ('san juan mayor', 528),
 ('officer involved shooting', 507),
 ('make america great', 498),
 ('president trump just', 471),
 ('new orleans pelicans', 467),
 ('man fatally shot', 455),
 ('dani bostick dani', 450),
 ('bostick dani bostick', 450),
 ('john bel edwards', 448),
 ('new york city', 446),
 ('year old woman', 445),
 ('15 year old', 437),
 ('rewind 05 2017', 422),
 ('breaking north korea', 419),
 ('black trump supporter', 416),
 ('rewind 08 2017', 412),
 ('new york times', 407),
 ('12 year old', 403),
 ('trump travel ban', 401),
 ('rewind 06 2017', 400),
 ('fake news media', 396),
 ('man pleads guilty', 392),
 ('golden state warriors', 381),
 ('10 year old', 379),
 ('called trump

In [89]:
rt_summaries = "".join(user_rt['content'])
rt_unigrams = unigram_vect.build_analyzer()(rt_summaries)

Counter(rt_unigrams).most_common(40)

[('trump', 51410),
 ('people', 27664),
 ('just', 27168),
 ('new', 25448),
 ('like', 25001),
 ('amp', 24947),
 ('don', 23055),
 ('black', 19272),
 ('obama', 17556),
 ('hillary', 15952),
 ('time', 14805),
 ('white', 14543),
 ('know', 13957),
 ('day', 13036),
 ('make', 12896),
 ('clinton', 12335),
 ('good', 11901),
 ('video', 11853),
 ('want', 11827),
 ('love', 11717),
 ('man', 11717),
 ('need', 11600),
 ('right', 11471),
 ('america', 11465),
 ('president', 11456),
 ('today', 11115),
 ('news', 10275),
 ('think', 9985),
 ('police', 9916),
 ('says', 9585),
 ('let', 9539),
 ('say', 9383),
 ('life', 9212),
 ('did', 9100),
 ('women', 8981),
 ('got', 8952),
 ('year', 8905),
 ('world', 8846),
 ('watch', 8616),
 ('going', 8446)]

In [90]:
rt_summaries = "".join(user_rt['content'])
rt_bigrams = bigram_vect.build_analyzer()(rt_summaries)

Counter(rt_bigrams).most_common(40)

[('donald trump', 6347),
 ('hillary clinton', 4229),
 ('white house', 3010),
 ('year old', 2239),
 ('new york', 2233),
 ('payday loans', 2129),
 ('president trump', 1749),
 ('news post', 1733),
 ('black people', 1689),
 ('fake news', 1647),
 ('don know', 1635),
 ('new post', 1599),
 ('looks like', 1413),
 ('white people', 1410),
 ('don want', 1366),
 ('online payday', 1261),
 ('social media', 1224),
 ('united states', 1218),
 ('24x7 online', 1203),
 ('black women', 1154),
 ('gt gt', 1110),
 ('approved minutes', 1077),
 ('loans approved', 1076),
 ('fox news', 1063),
 ('years ago', 1010),
 ('look like', 1005),
 ('supreme court', 987),
 ('hip hop', 973),
 ('good morning', 969),
 ('targeted accounts', 964),
 ('lives matter', 938),
 ('god bless', 907),
 ('planned parenthood', 899),
 ('trump supporters', 872),
 ('bernie sanders', 869),
 ('just like', 837),
 ('ted cruz', 820),
 ('president obama', 813),
 ('high school', 795),
 ('don like', 793)]

In [91]:
rt_summaries = "".join(user_rt['content'])
rt_trigrams = trigram_vect.build_analyzer()(rt_summaries)

Counter(rt_trigrams).most_common(40)

[('online payday loans', 1258),
 ('24x7 online payday', 1203),
 ('payday loans approved', 1076),
 ('loans approved minutes', 1076),
 ('black lives matter', 640),
 ('make america great', 525),
 ('approved minutes payday', 497),
 ('minutes payday loans', 481),
 ('new york times', 480),
 ('join fight save', 447),
 ('fight save america', 446),
 ('patriots dare join', 444),
 ('dare join fight', 444),
 ('save america click', 443),
 ('following twitter thanks', 421),
 ('twitter thanks ready', 421),
 ('win win win', 366),
 ('listen title track', 365),
 ('title track upcoming', 365),
 ('track upcoming mashup', 365),
 ('upcoming mashup album', 365),
 ('mashup album ft', 365),
 ('loans 24x7 online', 363),
 ('best promotion just', 324),
 ('martin luther king', 314),
 ('happy new year', 298),
 ('won want miss', 293),
 ('hit hands won', 278),
 ('hands won want', 278),
 ('gt gt gt', 275),
 ('president united states', 269),
 ('new york city', 265),
 ('win major exposure', 251),
 ('major exposure promo

### Topic Modelling

In [92]:
len(user_original.author.value_counts())

1989

In [102]:
import gensim
import nltk
import pyLDAvis.gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gcdunn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [95]:
en_stop.add('the')
en_stop.add('amp')

In [96]:
user_original['tokenized'] = user_original.apply(lambda row: nltk.word_tokenize(row['content'].lower()), axis=1)
user_rt['tokenized'] = user_rt.apply(lambda row: nltk.word_tokenize(row['content'].lower()), axis=1)

In [97]:
user_original['tokenized'] = user_original.tokenized.apply(lambda x: [item for item in x if item not in en_stop])
user_rt['tokenized'] = user_rt.tokenized.apply(lambda x: [item for item in x if item not in en_stop])

In [98]:
user_original['tokenized'] = user_original.tokenized.apply(lambda x: [item for item in x if len(item) > 2])
user_rt['tokenized'] = user_rt.tokenized.apply(lambda x: [item for item in x if len(item) > 2])

In [99]:
dictionary = corpora.Dictionary(user_original['tokenized'])
corpus = [dictionary.doc2bow(text) for text in user_original['tokenized']]

In [100]:
ntopics = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = ntopics, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.009*"police" + 0.008*"man" + 0.004*"trump" + 0.004*"shooting" + 0.004*"year"')
(1, '0.008*"people" + 0.007*"like" + 0.005*"\'the" + 0.005*"black" + 0.004*"one"')
(2, '0.078*"workout" + 0.025*"exercise" + 0.017*"weight" + 0.013*"lose" + 0.008*"need"')
(3, '0.008*"says" + 0.007*"new" + 0.005*"state" + 0.004*"police" + 0.003*"killed"')
(4, '0.029*"trump" + 0.007*"\'breaking" + 0.005*"obama" + 0.004*"hillary" + 0.004*"video"')


In [103]:
#dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
#corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# the vectorizer object will be used to transform text to vector form
# reject tokens in more than 90% or less than 25% of documents
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(user_original['content']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
tf_feature_names = vectorizer.get_feature_names()

In [None]:
number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [None]:
model.fit(tf)

In [None]:
#use nlp.pipe to efficiently process many docs
original_docs = list(nlp.pipe(user_original_docs.tweet_text))
retweet_docs = list(nlp.pipe(user_rt_docs.tweet_text))

In [None]:
nlp = English()
# nlp = spacy.load('en_core_web_lg')
matcher = Matcher(nlp.vocab)

In [None]:
doc = nlp('some text')
print(doc.text)

In [None]:
#part-of-speech tags -- token.pos_
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}'.format(token_text, token_pos, token_dep))
#syntactic dependencies -- token.dep_
#Named entites -- doc.ents
# Iterate over the entities
for ent in doc.ents:
    # print the entity text and label
    print(ent.text, ent.label_)

In [None]:
matches = matcher(doc)

In [None]:
for matchid, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

In [None]:
# hash table storage, lexemes
coffee_hash = nlp.vocab.strings['coffee']
coffee_string = nlp.vocab.strings[coffee_hash]
lexeme = nlp.vocab['coffee']

In [None]:
# create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

### Pipelines
Text-->tokenizer->tagger->parser->ner....-->Doc

In [None]:
def custom_component(doc):
    # do something
    return doc

nlp.add_pipeline(custom_component, first=True)

In [None]:
#attribute extensions
#property extensions
#method extensions

In [None]:
#use nlp.pipe to efficiently process many docs
docs = list(nlp.pipe(my_series_of_tweet_docs))

### Training and updating a model
Initialize --> Predict --> Compare --> Calculate --> Update
* Training data: Examples and their annotations
* Text: The input text the model should predict a label for
* Label: The label the model should predict.
* Gradient: How to change the weights.
Problems:
* Models can forget  A. Mix in previously correct predictions
* Models can't learn everything  A. Plan label scheme carefully

In [None]:
# to save your model
nlp.to_disk(path_to_model)