# Sentence tokenization


In [17]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


In [47]:
text = "In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that means comfort."

In [5]:
# NLTK
sentences = sent_tokenize(text)
print(sentences)

['In a hole in the ground there lived a hobbit.', 'Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that means comfort.']


In [50]:
# Spacy
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp(text)
for s in doc.sents:
    print(s)
sentences = [s for s in doc.sents]
print(sentences)

In a hole in the ground there lived a hobbit.
Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat
: it was a hobbit-hole, and that means comfort.
[In a hole in the ground there lived a hobbit., Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat, : it was a hobbit-hole, and that means comfort.]


# Word tokenization

In [8]:
text = "In a hole in the ground there lived a hobbit."

In [4]:
# NLTK
word_tokens = nltk.word_tokenize(text)
print(word_tokens)

['In', 'a', 'hole', 'in', 'the', 'ground', 'there', 'lived', 'a', 'hobbit', '.', 'Not', 'a', 'nasty', ',', 'dirty', ',', 'wet', 'hole', ',', 'filled', 'with', 'the', 'ends', 'of', 'worms', 'and', 'an', 'oozy', 'smell', ',', 'nor', 'yet', 'a', 'dry', ',', 'bare', ',', 'sandy', 'hole', 'with', 'nothing', 'in', 'it', 'to', 'sit', 'down', 'on', 'or', 'to', 'eat', ':', 'it', 'was', 'a', 'hobbit-hole', ',', 'and', 'that', 'means', 'comfort', '.']


In [12]:
# With Part of Speech
tagged = nltk.pos_tag(word_tokens)
print(tagged)

[('In', 'IN'), ('a', 'DT'), ('hole', 'NN'), ('in', 'IN'), ('the', 'DT'), ('ground', 'NN'), ('there', 'RB'), ('lived', 'VBD'), ('a', 'DT'), ('hobbit', 'NN'), ('.', '.'), ('Not', 'RB'), ('a', 'DT'), ('nasty', 'JJ'), (',', ','), ('dirty', 'JJ'), (',', ','), ('wet', 'JJ'), ('hole', 'NN'), (',', ','), ('filled', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('ends', 'NNS'), ('of', 'IN'), ('worms', 'NNS'), ('and', 'CC'), ('an', 'DT'), ('oozy', 'NN'), ('smell', 'NN'), (',', ','), ('nor', 'CC'), ('yet', 'RB'), ('a', 'DT'), ('dry', 'NN'), (',', ','), ('bare', 'NN'), (',', ','), ('sandy', 'JJ'), ('hole', 'NN'), ('with', 'IN'), ('nothing', 'NN'), ('in', 'IN'), ('it', 'PRP'), ('to', 'TO'), ('sit', 'VB'), ('down', 'RP'), ('on', 'IN'), ('or', 'CC'), ('to', 'TO'), ('eat', 'VB'), (':', ':'), ('it', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('hobbit-hole', 'JJ'), (',', ','), ('and', 'CC'), ('that', 'DT'), ('means', 'VBZ'), ('comfort', 'NN'), ('.', '.')]


In [10]:
# Spacy
nlp = en_core_web_sm.load()
doc = nlp(text)

for word in doc:
    print(word.text,  word.pos_)

In ADP
a DET
hole NOUN
in ADP
the DET
ground NOUN
there ADV
lived VERB
a DET
hobbit NOUN
. PUNCT


# Stemming

In [13]:
text = "We don't want any adventures here, thank you!"

In [15]:
# NLTK
from nltk.stem import PorterStemmer
porter = PorterStemmer()
tokenized = nltk.word_tokenize(text)
for word in tokenized:
    print(porter.stem(word))

We
do
n't
want
ani
adventur
here
,
thank
you
!


There's no stemming in Spacy.

# Lemmatization

In [22]:
text = "Elvish singing is not a thing to miss"


In [30]:
# NLTK
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnetLem = WordNetLemmatizer()
tokenized = nltk.word_tokenize(text)
for word in tokenized:
    print(word, wordnetLem.lemmatize(word, pos="v"))

This This
is be
the the
story story
of of
how how
a a
Baggins Baggins
had have
an an
adventure adventure
, ,
and and
found find
himself himself
doing do
and and
saying say
things things
altogether altogether
unexpected unexpected
. .


In [25]:
# Spacy
nlp = en_core_web_sm.load()
doc = nlp(text)

for word in doc:
    print(word.text,  word.lemma_)

Elvish elvish
singing singing
is be
not not
a a
thing thing
to to
miss miss


In [67]:
text = "This is the story of how a Baggins had an adventure, and found himself doing and saying things altogether unexpected."
doc = nlp(text)
s=""
for word in doc:
    s+=word.lemma_+" "
print(s)

this be the story of how a Baggins have an adventure , and find -PRON- do and say thing altogether unexpected . 


In [32]:
text = "This is the story of how a Baggins had an adventure, and found himself doing and saying things altogether unexpected."
tokenized = nltk.word_tokenize(text)
s=""
for word in tokenized:
    s+=wordnetLem.lemmatize(word, pos="v")+" "
print(s)

This be the story of how a Baggins have an adventure , and find himself do and say things altogether unexpected . 


# Stop words and Bag of words

In [74]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guillaumec/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [43]:
text = "In a hole in the ground there lived a hobbit."

In [44]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english')) 

tokenized = nltk.word_tokenize(text)
filtered = [w.lower() for w in tokenized if (not w.lower() in stop and w.isalnum())]
print(filtered)



['hole', 'ground', 'lived', 'hobbit']


In [46]:
from spacy.lang.en.stop_words import STOP_WORDS
doc = nlp(text)
token_list = [token.text for token in doc if token.is_punct == False]

filtered = [word for word in token_list if nlp.vocab[word].is_stop == False]
print(filtered)

['hole', 'ground', 'lived', 'hobbit']


In [112]:
text = "In a hole in the ground there lived a hobbit."


from nltk.corpus import stopwords
stop = set(stopwords.words('english')) 


tokenized = nltk.word_tokenize(text)
f =  [w.lower() for w in tokenized]
print(f)
filtered = [w.lower() for w in tokenized \
            if (not w.lower in stop and w.isalnum())]

['in', 'a', 'hole', 'in', 'the', 'ground', 'there', 'lived', 'a', 'hobbit', '.']


In [117]:
for w in tokenized:
    print(w, w.lower() in stop)

In True
a True
hole False
in True
the True
ground False
there True
lived False
a True
hobbit False
. False


# Bag of words

In [135]:
t1 = "I wish I was at home [...] with the kettle just beginning to sing!"
t2 = "Home is now behind you, the world is ahead!"
t3 = "He jumped up to [...] put his kettle on — and found he was not home at all."

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
stop = set(stopwords.words('english')) 


tokenized = nltk.word_tokenize(t1)
f =  [w.lower() for w in tokenized]
filtered1 = [lem.lemmatize(w.lower(), pos="v") for w in tokenized \
            if (not w.lower() in stop and w.isalnum())]


tokenized = nltk.word_tokenize(t2)
f =  [w.lower() for w in tokenized]
filtered2 = [lem.lemmatize(w.lower(), pos="v") for w in tokenized \
            if (not w.lower() in stop and w.isalnum())]


tokenized = nltk.word_tokenize(t3)
f =  [w.lower() for w in tokenized]
filtered3 = [lem.lemmatize(w.lower(), pos="v") for w in tokenized \
            if (not w.lower() in stop and w.isalnum())]


# tokenized = nltk.word_tokenize(t4)
# f =  [w.lower() for w in tokenized]
# filtered4 = [lem.lemmatize(w.lower(), pos="v") for w in tokenized \
#             if (not w.lower() in stop and w.isalnum())]

vocab = list(set(filtered1+filtered2+filtered3))

print(vocab)

['kettle', 'wish', 'behind', 'sing', 'home', 'ahead', 'jump', 'begin', 'world', 'find', 'put']


In [138]:
v1 = []
for w in vocab:
    c = 1 if w in filtered1 else 0
    v1.append(c)
print(v1)

v2 = []
for w in vocab:
    c = 1 if w in filtered2 else 0
    v2.append(c)
print(v2)

v3 = []
for w in vocab:
    c = 1 if w in filtered3 else 0
    v3.append(c)
print(v3)

[1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0]
[0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0]
[1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1]


# NER

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [40]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [41]:
sent = nltk.word_tokenize(sent)
sent = nltk.pos_tag(sent)

NameError: name 'sent' is not defined

In [34]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [38]:
text = """Not that Belladonna Took ever had any adventures after she became Mrs. Bungo Baggins. Bungo, that was Bilbo’s father, built the most luxurious hobbit-hole for her (and partly with her money) that was to be found either under The Hill or over The Hill or across The Water, and there they remained to the end of their days."""


In [39]:
doc = nlp(text)
displacy.render(doc, jupyter=True, style='ent')