In [20]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [24]:
user_original = pd.read_csv('../data/user_original.csv')
user_rt = pd.read_csv('../data/user_rt.csv')

In [25]:
user_original.head()

Unnamed: 0,user_display_name,user_screen_name,tweet_text
0,1-800-WOKE-AF,gloed_up,Real talk Martin real talk this is true Some...
1,10 GOP,10_gop,While Hurricane bears down on Florida some pe...
2,Amy Green,AmandaVGreen,And you came here so happily The News Trends ...
3,Andy Sparks,AndyHashtagger,cakikeith what bastards she looks the bestdogn...
4,Atlanta Online,Atlanta_Online,Boy 14 hit while changing tire off I-75 in Bar...


In [26]:
user_original = user_original[user_original.tweet_text != np.nan]
user_rt = user_rt[user_rt.tweet_text != np.nan]

### N-grams

In [52]:
unigram_vect = TfidfVectorizer(ngram_range=(1,1), stop_words='english')
ngram_vect = TfidfVectorizer(ngram_range=(2,3), stop_words='english')

In [53]:
original_summaries = "".join(user_original['tweet_text'])
original_ngrams = unigram_vect.build_analyzer()(original_summaries)

Counter(original_ngrams).most_common(20)

[('police', 45863),
 ('man', 45337),
 ('new', 42104),
 ('trump', 36985),
 ('says', 24535),
 ('state', 19638),
 ('year', 18909),
 ('woman', 18385),
 ('shooting', 17978),
 ('killed', 17821),
 ('shot', 15136),
 ('city', 14836),
 ('obama', 13924),
 ('people', 13707),
 ('house', 13621),
 ('old', 13446),
 ('san', 13321),
 ('day', 13273),
 ('black', 13270),
 ('school', 13026)]

In [54]:
Counter(original_ngrams).most_common(40)

[('police', 45863),
 ('man', 45337),
 ('new', 42104),
 ('trump', 36985),
 ('says', 24535),
 ('state', 19638),
 ('year', 18909),
 ('woman', 18385),
 ('shooting', 17978),
 ('killed', 17821),
 ('shot', 15136),
 ('city', 14836),
 ('obama', 13924),
 ('people', 13707),
 ('house', 13621),
 ('old', 13446),
 ('san', 13321),
 ('day', 13273),
 ('black', 13270),
 ('school', 13026),
 ('home', 12943),
 ('dead', 12794),
 ('crash', 12227),
 ('death', 11985),
 ('county', 11984),
 ('arrested', 11827),
 ('texas', 11714),
 ('game', 11713),
 ('clinton', 11542),
 ('car', 10590),
 ('court', 10559),
 ('say', 10468),
 ('white', 10424),
 ('video', 10087),
 ('win', 9918),
 ('president', 9801),
 ('dies', 9773),
 ('years', 9605),
 ('charged', 9565),
 ('suspect', 9503)]

In [55]:
original_summaries = "".join(user_original['tweet_text'])
original_ngrams = ngram_vect.build_analyzer()(original_summaries)

Counter(original_ngrams).most_common(20)

[('year old', 10250),
 ('new orleans', 7612),
 ('donald trump', 5095),
 ('new york', 4687),
 ('white house', 4138),
 ('st louis', 3661),
 ('san jose', 3484),
 ('high school', 3388),
 ('hillary clinton', 3335),
 ('man shot', 3319),
 ('san francisco', 3178),
 ('supreme court', 2902),
 ('police say', 2867),
 ('super bowl', 2789),
 ('pleads guilty', 2539),
 ('police officer', 2486),
 ('man charged', 2486),
 ('islamic state', 2288),
 ('health care', 2241),
 ('man arrested', 2204)]

In [56]:
Counter(original_ngrams).most_common(40)

[('year old', 10250),
 ('new orleans', 7612),
 ('donald trump', 5095),
 ('new york', 4687),
 ('white house', 4138),
 ('st louis', 3661),
 ('san jose', 3484),
 ('high school', 3388),
 ('hillary clinton', 3335),
 ('man shot', 3319),
 ('san francisco', 3178),
 ('supreme court', 2902),
 ('police say', 2867),
 ('super bowl', 2789),
 ('pleads guilty', 2539),
 ('police officer', 2486),
 ('man charged', 2486),
 ('islamic state', 2288),
 ('health care', 2241),
 ('man arrested', 2204),
 ('san diego', 2099),
 ('man accused', 2071),
 ('man killed', 1963),
 ('bay area', 1891),
 ('hit run', 1882),
 ('fort worth', 1824),
 ('fatally shot', 1775),
 ('san antonio', 1724),
 ('police blotter', 1700),
 ('los angeles', 1585),
 ('orleans saints', 1554),
 ('new orleans saints', 1491),
 ('city council', 1414),
 ('police investigating', 1410),
 ('white sox', 1374),
 ('fatal shooting', 1367),
 ('bernie sanders', 1354),
 ('north korea', 1338),
 ('old girl', 1312),
 ('year old girl', 1227)]

In [57]:
rt_summaries = "".join(user_rt['tweet_text'])
rt_ngrams = unigram_vect.build_analyzer()(rt_summaries)

Counter(rt_ngrams).most_common(20)

[('people', 5224),
 ('im', 5084),
 ('gloed_up', 5016),
 ('like', 4735),
 ('just', 4668),
 ('dont', 4622),
 ('giselleevns', 4092),
 ('danageezus', 4011),
 ('black', 3438),
 ('amp', 3342),
 ('trump', 3165),
 ('make', 2704),
 ('trayneshacole', 2461),
 ('chrixmorgan', 2372),
 ('time', 2355),
 ('know', 2340),
 ('white', 2290),
 ('day', 2120),
 ('love', 2110),
 ('bleepthepolice', 2086)]

In [58]:
Counter(rt_ngrams).most_common(40)

[('people', 5224),
 ('im', 5084),
 ('gloed_up', 5016),
 ('like', 4735),
 ('just', 4668),
 ('dont', 4622),
 ('giselleevns', 4092),
 ('danageezus', 4011),
 ('black', 3438),
 ('amp', 3342),
 ('trump', 3165),
 ('make', 2704),
 ('trayneshacole', 2461),
 ('chrixmorgan', 2372),
 ('time', 2355),
 ('know', 2340),
 ('white', 2290),
 ('day', 2120),
 ('love', 2110),
 ('bleepthepolice', 2086),
 ('andyhashtagger', 2061),
 ('https', 2009),
 ('new', 1944),
 ('want', 1918),
 ('think', 1895),
 ('really', 1835),
 ('good', 1826),
 ('man', 1809),
 ('id', 1680),
 ('say', 1662),
 ('worldofhashtags', 1618),
 ('best', 1610),
 ('youre', 1592),
 ('got', 1569),
 ('need', 1567),
 ('police', 1506),
 ('right', 1497),
 ('going', 1445),
 ('life', 1419),
 ('work', 1419)]

In [59]:
rt_summaries = "".join(user_rt['tweet_text'])
rt_ngrams = ngram_vect.build_analyzer()(rt_summaries)

Counter(rt_ngrams).most_common(20)

[('donald trump', 503),
 ('year old', 393),
 ('white people', 385),
 ('black people', 376),
 ('white house', 369),
 ('hashtag game', 357),
 ('dont know', 298),
 ('look like', 283),
 ('hillary clinton', 281),
 ('dont want', 274),
 ('hashtag games', 246),
 ('dont like', 237),
 ('looks like', 235),
 ('black women', 218),
 ('im going', 206),
 ('black man', 202),
 ('lives matter', 200),
 ('high school', 196),
 ('make sure', 175),
 ('im gonna', 172)]

In [60]:
Counter(rt_ngrams).most_common(40)

[('donald trump', 503),
 ('year old', 393),
 ('white people', 385),
 ('black people', 376),
 ('white house', 369),
 ('hashtag game', 357),
 ('dont know', 298),
 ('look like', 283),
 ('hillary clinton', 281),
 ('dont want', 274),
 ('hashtag games', 246),
 ('dont like', 237),
 ('looks like', 235),
 ('black women', 218),
 ('im going', 206),
 ('black man', 202),
 ('lives matter', 200),
 ('high school', 196),
 ('make sure', 175),
 ('im gonna', 172),
 ('people dont', 149),
 ('united states', 145),
 ('social media', 142),
 ('black lives', 140),
 ('make america', 140),
 ('new york', 140),
 ('worldofhashtags giselleevns', 138),
 ('worldofhashtags game', 136),
 ('black woman', 133),
 ('game hosted', 133),
 ('dont think', 132),
 ('gloed_up black', 129),
 ('dont need', 129),
 ('____because hate', 129),
 ('im just', 126),
 ('gloed_up white', 124),
 ('years ago', 122),
 ('https gloed_up', 120),
 ('play hashtag', 116),
 ('black lives matter', 116)]

In [None]:
#use nlp.pipe to efficiently process many docs
original_docs = list(nlp.pipe(user_original_docs.tweet_text))
retweet_docs = list(nlp.pipe(user_rt_docs.tweet_text))

In [None]:
nlp = English()
# nlp = spacy.load('en_core_web_lg')
matcher = Matcher(nlp.vocab)

In [None]:
doc = nlp('some text')
print(doc.text)

In [None]:
#part-of-speech tags -- token.pos_
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}'.format(token_text, token_pos, token_dep))
#syntactic dependencies -- token.dep_
#Named entites -- doc.ents
# Iterate over the entities
for ent in doc.ents:
    # print the entity text and label
    print(ent.text, ent.label_)

In [None]:
matches = matcher(doc)

In [None]:
for matchid, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

In [None]:
# hash table storage, lexemes
coffee_hash = nlp.vocab.strings['coffee']
coffee_string = nlp.vocab.strings[coffee_hash]
lexeme = nlp.vocab['coffee']

In [None]:
# create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

### Pipelines
Text-->tokenizer->tagger->parser->ner....-->Doc

In [None]:
def custom_component(doc):
    # do something
    return doc

nlp.add_pipeline(custom_component, first=True)

In [None]:
#attribute extensions
#property extensions
#method extensions

In [None]:
#use nlp.pipe to efficiently process many docs
docs = list(nlp.pipe(my_series_of_tweet_docs))

### Training and updating a model
Initialize --> Predict --> Compare --> Calculate --> Update
* Training data: Examples and their annotations
* Text: The input text the model should predict a label for
* Label: The label the model should predict.
* Gradient: How to change the weights.
Problems:
* Models can forget  A. Mix in previously correct predictions
* Models can't learn everything  A. Plan label scheme carefully

In [None]:
# to save your model
nlp.to_disk(path_to_model)