In [1]:
# NLP: Natural Language Processing
# --------------------------------

#   NLP is a field of Artificial Intelligence that
#   gives machines the ability to read and understand human language

# Facts:
#  - Human conversations carry a lot of information: topics, tones, word choice, phrasal verbs, etc.
#  - A single sentence can contain more than one topic, adjective or noun
#  - Data from conversations are examples of unstructured data
#  - Unstructured data don't fit in traditional databases (column-row pattern)
#  - Besides being unstructured, human conversations can be in multiple languages

# The goal of NLP is to provide techniques that create structure out of text data

# NLP Use Cases:
# - Email classification as Spam vs Legitimate
# - Sentiment Analysis of comments
# - Analyzing trends from written customer feedback
# - Understanding text commands

In [None]:
# NLP with Spacy
# --------------

# What is Spacy?
# From Real Python page: 
#   spaCy is a free and open-source library for Natural Language Processing (NLP)
#   in Python with a lot of in-built capabilities 

# Resources:
# - https://spacy.io/
# - https://realpython.com/natural-language-processing-spacy-python/

# Steps for working with spaCy
# - Load the language library
# - Build a pipeline object
# - Use tokens
# - Parts-of-speech tagging
# - Understanding token attributes

In [3]:
import spacy


# Load the 'en_core_web_sm' language model, then show the type of the
# returned pipeline object followed by the object itself
nlp = spacy.load("en_core_web_sm")
print(type(nlp), nlp, sep="\n")

<class 'spacy.lang.en.English'>
<spacy.lang.en.English object at 0x7f94250b5850>


In [26]:
# Run the pipeline on a sample sentence; the result is a Doc made of tokens
mystr = "We're moving to the beautiful L.A.! and i will earn $12.000 dollars monthly as well as Sam"
doc = nlp(mystr)
print(doc, type(doc), "", sep="\n")

# One line per token: its text, its numeric POS code, its readable POS tag
for token in doc:
    print(token.text, token.pos, token.pos_)

# What to notice in the output
# 1. "L.A." is kept as one token (tagged PROPN) instead of being split at its periods
# 2. The contraction "We're" is split into two tokens: "We" and "'re"
# 3. Punctuation ("!") and the currency symbol ("$") become tokens of their own

# Token attributes used above
# token.text => the text of an individual token
# token.pos  => an integer code identifying the part of speech (verb, noun, adjective, etc.)
# token.pos_ => the same part of speech, returned as a readable string instead of a code


# A Doc can be indexed and "sliced": an integer index gives a single token,
# a slice produces a Span object
print(f"\n{'-'*8}\nSlicing\n{'-'*8}")
print(f"doc[0]: {doc[0]}")
print(f"doc[1]: {doc[1]}")
print(f"doc[:4]: {doc[:4]}")
sp = doc[1:3]
print(type(sp))


We're moving to the beautiful L.A.! and i will earn $12.000 dollars monthly as well as Sam
<class 'spacy.tokens.doc.Doc'>

We 94 PRON
're 99 VERB
moving 99 VERB
to 84 ADP
the 89 DET
beautiful 83 ADJ
L.A. 95 PROPN
! 96 PUNCT
and 88 CCONJ
i 94 PRON
will 99 VERB
earn 99 VERB
$ 98 SYM
12.000 92 NUM
dollars 91 NOUN
monthly 85 ADV
as 85 ADV
well 85 ADV
as 84 ADP
Sam 95 PROPN

--------
Slicing
--------
doc[0]: We
doc[1]: 're
doc[:4]: We're moving to
<class 'spacy.tokens.span.Span'>


In [28]:
# spaCy sentences: Doc.sents iterates over the sentences detected in a Doc
d1 = nlp(
    "I can do whatever i propose to myself. "
    "If i believe it, i will acomplish it. "
    "I will neves surrender."
)
for sentence in d1.sents:
    print(sentence)

# spaCy works out the sentence boundaries itself (by default from the dependency
# parse); in this text every sentence ends at a period followed by a space.
# A token's is_sent_start attribute tells whether that token begins a sentence,
# e.g. d1[7].is_sent_start

I can do whatever i propose to myself.
If i believe it, i will acomplish it.
I will neves surrender.


In [13]:
# NLP Pipeline
# ------------

# What do we mean by a pipeline?
# "Doing anything complicated in machine learning usually means building a pipeline.
#  The idea is to break up your problem into very small pieces and then use machine
#  learning to solve each smaller piece separately. Then by chaining together several
#  machine learning models that feed into each other, you can do very complicated things."
#    Resource: https://medium.com/@suneelpatel.in/nlp-pipeline-building-an-nlp-pipeline-step-by-step-7f0576e11d08 

# To do NLP, the input text goes through a series of steps called a pipeline
# (think of a pipeline as the following):

# Input Text --> | Operation A | --> | Operation B | --> ... -> | Operation N | ---> Output
# Note: Depending on the situation, you can customize your pipeline and re-order its operations (or steps)

# Resources
# - https://spacy.io/api#architecture-pipeline

In [12]:
# Show the loaded pipeline as a list of (name, component) pairs
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7f944b301a90>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7f94220147d0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7f9422014d70>)]

In [14]:
# Show only the names of the pipeline components
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
# NLP Tokenization
# ----------------
# - Tokenization is the process of breaking up the original text into component pieces (tokens)
# - Tokens are pieces of the original text.
# - Tokens are the basic building blocks of a doc object
# - Tokens help us understand the meaning of a text

# Token types
# -----------

# - Prefix: Character(s) at the beginning ( $("¿) )
# - Suffix: Character(s) at the end ( km ) , . ! )
# - Infix: Character(s) in between ( - -- / ... )
# - Exception: Special-case rule to split a string into several tokens or
#              prevent a token from being split when punctuation rules are
#              applied.   ( let's U.S. )

In [33]:
# Tokenize a sentence that contains an e-mail address and a URL,
# printing each token's text next to its numeric POS code
str1 = (
    "For more information, you can contact us at customerservice@mysite.com "
    "or get in touch on http://www.mysite.com"
)
doc_tk = nlp(str1)
for tk in doc_tk:
    print(tk.text, tk.pos)

# Note: the e-mail address and the URL are each kept as a single token,
# even though they contain punctuation characters

For 84
more 83
information 91
, 96
you 94
can 99
contact 99
us 94
at 84
customerservice@mysite.com 100
or 88
get 99
in 84
touch 91
on 84
http://www.mysite.com 100


In [35]:
# Size of the spaCy vocabulary
# doc_tk.vocab is the Vocab object attached to the doc; len() on it returns the
# number of lexemes currently stored in that vocabulary (not the number of
# tokens in doc_tk itself).
len(doc_tk.vocab)

57852

In [41]:
# spaCy: named entities found in a Doc (available through doc.ents)
doc2 = nlp("Juan told me, Apple is going to build a Hong Kong factory for $6 million")
print(f"Doc Entities: {doc2.ents}", end="\n\n")

# For every entity: its text, its label, and spaCy's explanation of that label
for entity in doc2.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), sep=" - ")

Doc Entities: (Juan, Apple, Hong Kong, $6 million)

Juan - PERSON - People, including fictional
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [47]:
# spaCy displaCy: visualizing entities
# style='ent' highlights the named entities of the Doc inline in the notebook.
# The 'distance' option was dropped: it only applies to the dependency ('dep')
# visualizer and has no effect on the entity visualizer.
doc3 = nlp("Apple is going to build a U.K. factory for $6 million")
spacy.displacy.render(doc3, style='ent', jupyter=True)