# Basic NLP libraries

## 1. SPACY (Object Oriented)

In [1]:
import spacy

In [3]:
# Load the small English pipeline once; later cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Two-sentence sample whose first sentence opens with the abbreviation "Dr.".
doc = nlp(
    "Dr. Strange loves chinese food of China town. "
    "Captain America loves briyani of Hyderabad"
)

In [4]:
# Sentence segmentation: doc.sents yields one Span per detected sentence.
for sent in doc.sents:
    print(sent.text)

Dr. Strange loves chinese food of China town.
Captain America loves briyani of Hyderabad


In [5]:
# Word tokenization: walk every sentence and print each of its tokens on its own line.
for sent in doc.sents:
    for tok in sent:
        print(tok.text)

Dr.
Strange
loves
chinese
food
of
China
town
.
Captain
America
loves
briyani
of
Hyderabad


## 2. NLTK

In [10]:
import nltk
# One-time download of the "punkt" tokenizer data into the local nltk_data directory.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\debje\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
from nltk.tokenize import sent_tokenize

# Sentence-split the same sample that was given to spaCy, for a side-by-side comparison.
sent_tokenize(
    "Dr. Strange loves chinese food of China town. "
    "Captain America loves briyani of Hyderabad"
)

['Dr.',
 'Strange loves chinese food of China town.',
 'Captain America loves briyani of Hyderabad']

In [12]:
from nltk.tokenize import word_tokenize

# Word-split the same sample that was given to spaCy, for a side-by-side comparison.
word_tokenize(
    "Dr. Strange loves chinese food of China town. "
    "Captain America loves briyani of Hyderabad"
)

['Dr',
 '.',
 'Strange',
 'loves',
 'chinese',
 'food',
 'of',
 'China',
 'town',
 '.',
 'Captain',
 'America',
 'loves',
 'briyani',
 'of',
 'Hyderabad']

## Exercise

Q1. Think Stats is a free book for studying statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf).

This book references many websites from which you can download free datasets. You are an NLP engineer working for a company, and you want to collect all the dataset websites mentioned in the book. To keep the exercise simple, you are given one paragraph from the book, and your task is to grab all the URLs from that paragraph using spaCy.

In [13]:
# Exercise paragraph, kept verbatim. Note that the second URL is wrapped across
# a line break ("http://www.science." / "gov/"), exactly as in the excerpt.
text = """
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
"""

# TODO: Write code here
# Hint: each token exposes an attribute that tells you whether it looks like a URL

In [22]:
# Run the shared spaCy pipeline over the exercise paragraph.
doc = nlp(text)

# Keep the text of every token that spaCy flags as URL-like.
urls = [
    tok.text
    for tok in doc
    if tok.like_url
]
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

Q2. Extract all money transactions from the sentence below, along with their currency.

Output should be: <br>
two $ <br>
500 €

In [24]:
# Exercise sentence: one amount written as a word ("two") and one as digits ("500").
transactions = (
    "Tony gave two $ to Peter, "
    "Bruce gave 500 € to Steve"
)

# TODO: Write code here
# Hint: token.i gives a token's index in the doc; token.is_currency detects currency symbols

In [55]:
# Run the shared spaCy pipeline over the transactions sentence.
doc = nlp(transactions)

# Pair each number-like token with the currency symbol that immediately follows it.
# The bounds check prevents an IndexError when a number-like token is the last
# token in the doc and therefore has no successor to inspect.
currency = [
    (token, doc[token.i + 1])
    for token in doc
    if token.like_num
    and token.i + 1 < len(doc)
    and doc[token.i + 1].is_currency
]
print(*currency, sep="\n")

(two, $)
(500, €)
