# _Exploration: January 14, 2020_

In [1]:
# (re)load the autoreload extension; mode 2 picks up edits to imported modules live
%reload_ext autoreload
%autoreload 2
# draw matplotlib figures inline in the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# import libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import random
import os

# Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

## _Load in Data_

In [10]:
# read Donald Trump's Twitter CSV, keeping the id_str column as strings
df = pd.read_csv("csv-data/realDonaldTrump.csv", dtype={"id_str": str})

# parse the created_at column into datetimes
df["created_at"] = pd.to_datetime(df["created_at"])

# store lang and source as categorical columns (one cast for both)
df = df.astype({"lang": "category", "source": "category"})

In [11]:
# summarize the frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3195 entries, 0 to 3194
Data columns (total 9 columns):
id_str            3195 non-null object
screen_name       3195 non-null object
created_at        3195 non-null datetime64[ns]
lang              3195 non-null category
source            3195 non-null category
retweet_count     3195 non-null int64
favorite_count    3195 non-null int64
is_retweet        3195 non-null bool
full_text         3195 non-null object
dtypes: bool(1), category(2), datetime64[ns](1), int64(2), object(3)
memory usage: 159.7+ KB


In [12]:
# preview the first five rows
df.head()

Unnamed: 0,id_str,screen_name,created_at,lang,source,retweet_count,favorite_count,is_retweet,full_text
0,1208587674342301703,realDonaldTrump,2019-12-22 03:18:50,und,Twitter for iPhone,12003,40784,False,https://t.co/ryVvzb6EGt
1,1208587342879047681,realDonaldTrump,2019-12-22 03:17:31,und,Twitter for iPhone,16688,52784,False,https://t.co/rJ4yo4htsy
2,1208541550424264710,realDonaldTrump,2019-12-22 00:15:33,en,Twitter for iPhone,9966,0,True,RT @WhiteHouse: LIVE: President @realDonaldTru...
3,1208494102062477312,realDonaldTrump,2019-12-21 21:07:01,und,Twitter for iPhone,34484,123089,False,https://t.co/h5bAKuoyV2
4,1208471806815997953,realDonaldTrump,2019-12-21 19:38:25,en,Twitter for iPhone,29094,143949,False,Last night I was so proud to have signed the l...


In [30]:
# grab the full text of the row labelled 4 to use as a sample tweet
tweet = df.loc[4, "full_text"]
print(tweet)

Last night I was so proud to have signed the largest Defense Bill ever. The very vital Space Force was created. New planes, ships, missiles, rockets and equipment of every kind, and all made right here in the USA. Additionally, we got Border Wall (being built) funding. Nice!


## _Continuation of DataCamp Tutorial --> `Advanced NLP with spaCy`_

In [13]:
import spacy 

# load the "en_core_web_md" model; the returned nlp object is called on raw text below
nlp = spacy.load("en_core_web_md")

In [14]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# run the headline through the pipeline
doc = nlp(text)

# list every entity the model predicted, as "<text> <label>"
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG


In [18]:
# "iPhone X" was not among the predicted entities above, so slice it out by hand:
# tokens 1 and 2 of the doc (the end index 3 is exclusive)
iphone_x = doc[1:3]

# print the span text
print("Missing entity: ", iphone_x.text)

Missing entity:  iPhone X


In [31]:
# run the sample tweet through the pipeline
doc = nlp(tweet)

# list every predicted entity together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Last night TIME
Space Force ORG
USA GPE
Border Wall FAC


In [32]:
# look up spaCy's description of the "FAC" entity label seen in the output above
spacy.explain("FAC")

'Buildings, airports, highways, bridges, etc.'

### _Rule-based matching_

- why not just regular expressions?
    - match on `Doc` objects, not just strings
    - match on tokens and token attributes
    - use the model's predictions
        - for example, find the word "duck" only if it's a verb (not a noun)
- match patterns
    - list of dictionaries, one per token
    - match exact token texts = `[{"ORTH": "iPhone"}, {"ORTH": "X"}]`
    - match lexical attributes = `[{"LOWER": "iphone"}, {"LOWER": "x"}]`
    - match any token attributes = `[{"LEMMA": "buy"}, {"POS": "NOUN"}]`

In [20]:
# bring in the rule-based Matcher
from spacy.matcher import Matcher

# load a model to get the nlp object, then build the matcher on that object's vocab
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)

# rule: the exact token "iPhone" followed by the exact token "X"
pattern = [
    {"ORTH": "iPhone"},
    {"ORTH": "X"},
]
# NOTE(review): add(key, None, pattern) is the spaCy v2 call form; spaCy v3
# expects add(key, [pattern]) -- confirm against the installed version
matcher.add("IPHONE_PATTERN", None, pattern)

In [24]:
# run the pipeline over a sample sentence
doc = nlp("New iPhone X relsease date leaked")

# apply the matcher; each match is a (match_id, start, end) tuple
matches = matcher(doc)

# slice the doc with each match's start/end and print the span text
for match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X


In [26]:
# matching lexical attributes:
# a digit token, then "fifa", "world", "cup" (compared in lowercase), then punctuation
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True},
]

# fresh matcher on the shared vocab, holding only this rule
matcher = Matcher(nlp.vocab)
matcher.add("FIFA", None, pattern)

# build the doc and run the matcher over it
doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)

# print the text of every matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

2018 FIFA World Cup:


In [27]:
# matching other token attributes:
# a verb whose lemma is "love", followed by a noun
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"},
]

# fresh matcher on the shared vocab, holding only this rule
matcher = Matcher(nlp.vocab)
matcher.add("ATTRIBUTES", None, pattern)

# build the doc and run the matcher over it
doc = nlp("I loved dogs but not I love cats more.")
matches = matcher(doc)

# print the text of every matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

loved dogs
love cats


In [29]:
# using operators and quantifiers:
# a form of "buy", an optional determiner, then a noun
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # "?" = optional, match 0 or 1 times (other operators: !, +, *)
    {"POS": "NOUN"},
]

# fresh matcher on the shared vocab, holding only this rule
matcher = Matcher(nlp.vocab)
matcher.add("QUANT", None, pattern)

# build the doc and run the matcher over it
doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)

# print the text of every matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

bought a smartphone
buying apps


In [34]:
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")

# initialize a new matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# pattern of two tokens whose text is exactly "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# register the pattern under its name
matcher.add("IPHONE_X_PATTERN", None, pattern)

# run the matcher and collect the text of each matched span
matches = matcher(doc)
matched_texts = [doc[start:end].text for match_id, start, end in matches]
print('Matches:', matched_texts)

Matches: ['iPhone X']


In [36]:
doc = nlp("""After making the iOS update you won't notice a radical system-wide redesign: nothing like 
the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But 
you will discover some tweaks once you delve a little deeper.""")

# start from a fresh matcher so patterns registered by earlier cells cannot leak
# into this result and the output does not depend on which cells ran before
matcher = Matcher(nlp.vocab)

# write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10"):
# the exact token "iOS" followed by a token consisting of digits
pattern = [
    {"TEXT": "iOS"},
    {"IS_DIGIT": True}
]

# add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found: ", len(matches))

# iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found: ", doc[start:end].text)

Total matches found:  3
Match found:  iOS 7
Match found:  iOS 11
Match found:  iOS 10


In [37]:
doc = nlp("""i downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was 
downloading Minecraft, I got the Windows version where it is the '.zip' folder and I used the default 
program to unpack it... do I also need to download Winzip?""")

# start from a fresh matcher so patterns registered by earlier cells cannot leak
# into this result and the output does not depend on which cells ran before
matcher = Matcher(nlp.vocab)

# write a pattern that matches a form of "download" plus proper noun
pattern = [
    {"LEMMA": "download"},
    {"POS": "PROPN"}
]

# add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [39]:
doc = nlp("""Features of the app include a beautiful design, 
smart search, automatic labels and optional voice responses.""")

# start from a fresh matcher so patterns registered by earlier cells cannot leak
# into this result and the output does not depend on which cells ran before
matcher = Matcher(nlp.vocab)

# write a pattern for adjective plus one or two nouns
# (the second noun is optional via the "?" operator)
pattern = [
    {"POS": "ADJ"},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"}
]

# add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found: ", len(matches))

# iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found: ", doc[start:end].text)

Total matches found:  5
Match found:  beautiful design
Match found:  smart search
Match found:  automatic labels
Match found:  optional voice
Match found:  optional voice responses
