# Introduction to NLP

In [1]:
import spacy

In [2]:
# Load the small English spaCy pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the pipeline on a short sample sentence; `doc` is used by the
# tokenizing and preprocessing cells that follow.
sample_sentence = "Tea is healthy and calming, don't you think?"
doc = nlp(sample_sentence)

# Tokenizing

In [5]:
# Iterating over the document yields its tokens one by one; print each token's text.
for tok in doc:
    print(tok.text)

Tea
is
healthy
and
calming
,
do
n't
you
think
?


# Text preprocessing
Each word can be broken down into its 'lemma' (base word). We can also identify stopwords.

In [6]:
# Print a tab-separated table with one row per token: its text, its lemma
# (base form), and whether it is flagged as a stopword.
# The header is a plain string: it has no placeholders, so neither an
# f-string prefix nor a .format() call is needed.
print("Token \t\tLemma \t\tStopword")
print("-" * 40)
for token in doc:
    print(f"{token.text}\t\t{token.lemma_}\t\t{token.is_stop}")

Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calming		False
,		,		False
do		do		True
n't		n't		True
you		you		True
think		think		False
?		?		False


# Pattern Matching
Matching words or phrases within chunks of text or whole documents.

In [5]:
from spacy.matcher import PhraseMatcher
# Compare on the LOWER attribute (lowercased token text) so that matching ignores case.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

Setting the attribute to LOWER will match on lowercased text, so we have a case-insensitive matcher.

In [8]:
# Phrases to search for in the text.
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
# Phrase patterns only need tokenized text, so run just the tokenizer
# (nlp.make_doc) instead of the full pipeline (nlp(...)), which is slower.
patterns = [nlp.make_doc(text) for text in terms]
# Register all patterns under a single match ID.
matcher.add("TerminologyList", patterns)

In [9]:
# Sample review that mentions several of the phone names registered above.
phone_review = (
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3."
)
text_doc = nlp(phone_review)

# Calling the matcher on a document returns the list of matches it found.
matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]


Each match is a tuple of the match ID and the start and end token positions of the matched phrase.

In [13]:
# Unpack the first match: its ID plus the start/end token positions of the phrase.
first_match = matches[0]
matchid, start, end = first_match
print(matchid, start, end)

# Look up the string behind the match ID and slice the matched tokens out of the document.
match_label = nlp.vocab.strings[matchid]
matched_span = text_doc[start:end]
print(match_label, matched_span)

3766102292120407359 17 19
TerminologyList iPhone 11


# Exercise
## Find items in one review 

In [3]:
# Menu items (including spelling/plural variants) to look for in reviews.
menu = [
    "Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo", "Tiramisu", "Cannoli",
    "Chicken Salad", "Chicken Spinach Salad", "Meatball", "Pizza", "Pizzas", "Spaghetti",
    "Bruchetta", "Eggplant", "Italian Beef", "Purista", "Pasta", "Calzones", "Calzone",
    "Italian Sausage", "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
    "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti", "Portobello", "Reuben",
    "Mozzarella Caprese", "Corned Beef", "Garlic Bread", "Pastrami", "Roast Beef",
    "Tuna Salad", "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
    "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese", "Macaroni",
    "Prosciutto", "Salami",
]

# Turn each menu item into a tokenized document to use as a phrase pattern.
# Only tokenization is needed here, so use nlp.make_doc (tokenizer only)
# rather than running the full pipeline with nlp(...).
menu_tokens_list = [nlp.make_doc(item) for item in menu]

Add the menu items into the matcher:

In [6]:
# Rebuild the matcher before registering the menu so that it holds only the
# "MENU" patterns. Without this, a top-to-bottom run of the notebook would
# leave the "TerminologyList" phone patterns in the same matcher, and phone
# names found in reviews would be reported as menu items.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add("MENU", menu_tokens_list)

In [7]:
# A single sample review to try the menu matcher on.
text_to_test_on = (
    "The Il Purista sandwich has become a staple of my life. "
    "Mozzarella, basil, prosciutto, roasted red peppers and balsamic "
    "vinaigrette blend into a front runner for the best sandwich in the valley. "
    "Goes great with sparkling water or a beer. "
    "DeFalco's also has other Italian fare such as a delicious meatball sub "
    "and classic pastas."
)

In [8]:
# Run the matcher over the sample review and report each hit together with
# the index of the token where it starts.
review_doc = nlp(text_to_test_on)
matches = matcher(review_doc)
for _match_id, start_tok, end_tok in matches:
    found_span = review_doc[start_tok:end_tok]
    print(f"Token number {start_tok}: {found_span}")

Token number 2: Purista
Token number 16: prosciutto
Token number 57: meatball


## Matching on the whole dataset 

In [9]:
from collections import defaultdict
# Maps each matched item (lowercased text) to the list of star ratings taken
# from the reviews that mention it; filled in by the loop in the next cell.
item_ratings = defaultdict(list)

In [None]:
# NOTE: this cell cannot be run in this notebook because the review dataset
# (`data`) is not available. It reads `review.text` and `review.stars` from
# each row, so `data` presumably has `text` and `stars` columns (confirm
# against the real dataset).

for _, review in data.iterrows():
    doc = nlp(review.text)
    matches = matcher(doc)

    # Use a set so an item mentioned several times in one review contributes
    # that review's rating only once. Span.text.lower() is used in place of
    # the deprecated Span.lower_ and yields the same lowercased string.
    found_items = {doc[start:end].text.lower() for _match_id, start, end in matches}

    for item in found_items:
        item_ratings[item].append(review.stars)