# Day 6 - Natural Language Processing

In [1]:
from nltk import download

In [46]:
# One-time fetch of the NLTK resources used further down in this notebook:
#   punkt   - tokenizer models behind sent_tokenize / word_tokenize
#   wordnet - corpus required by WordNetLemmatizer
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead of 'punkt' - confirm
# against the installed version.
download('punkt')
download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/efeacikgoz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/efeacikgoz/nltk_data...


True

In [3]:
# Turkish sample sentence; note that "döndü" (verb) and "Döndü" (proper name) differ only by case.
text = (
    "Yaylaya gitmişti yayla zamanı, "
    "Gülizar döndü de Döndü dönmedi."
)

In [4]:
text.split()

['Yaylaya',
 'gitmişti',
 'yayla',
 'zamanı,',
 'Gülizar',
 'döndü',
 'de',
 'Döndü',
 'dönmedi.']

## Tokenize - split a text into a Python list of tokens

`sent_tokenize` splits a text into sentences.

In [5]:
from nltk.tokenize import sent_tokenize as st

In [6]:
# Three short English sentences used as the shared input for the tokenizer demos.
sentence = (
    "Welcome readers. "
    "I hope you find it interesting. "
    "Please do reply."
)

In [7]:
st(sentence)

['Welcome readers.', 'I hope you find it interesting.', 'Please do reply.']

In [8]:
from nltk.tokenize import word_tokenize as wt

In [9]:
wt(sentence)

['Welcome',
 'readers',
 '.',
 'I',
 'hope',
 'you',
 'find',
 'it',
 'interesting',
 '.',
 'Please',
 'do',
 'reply',
 '.']

In [10]:
from nltk.tokenize import TreebankWordTokenizer

In [11]:
# Tokenize the whole multi-sentence string in one call.
# In the output only the final '.' becomes its own token; 'readers.' and 'interesting.'
# keep their trailing period.
tknzr = TreebankWordTokenizer()
tknzr.tokenize(sentence)

['Welcome',
 'readers.',
 'I',
 'hope',
 'you',
 'find',
 'it',
 'interesting.',
 'Please',
 'do',
 'reply',
 '.']

This tokenizer does not split off the periods in the middle of the text; only the final period becomes a separate token.

In [12]:
from nltk.tokenize import WordPunctTokenizer

In [13]:
tk = WordPunctTokenizer()

In [14]:
# WordPunctTokenizer separates punctuation from word characters, so the contraction
# "Don't" comes back as three tokens: 'Don', "'", 't'.
tk.tokenize("Don't heistate to ask questions.")

['Don', "'", 't', 'heistate', 'to', 'ask', 'questions', '.']

In [15]:
from nltk.tokenize import BlanklineTokenizer

In [16]:
# Tokenizer that cuts a text at blank lines (see the three pieces in the output of the next cell).
blt = BlanklineTokenizer()

# Sample letter: three paragraphs separated by blank lines.
# The leading and trailing newlines are part of the string and show up in the tokens.
sent = '''
Hello miss

I saw that I got an unexpected grade from the midterm. I would like to know if I can see my paper.

with respect
student
'''

In [17]:
blt.tokenize(sent)

['\nHello miss',
 'I saw that I got an unexpected grade from the midterm. I would like to know if I can see my paper.',
 'with respect\nstudent\n']

In [18]:
from nltk.tokenize import RegexpTokenizer

In [19]:
sent = "She secures 90.56% in class X. She is a meritorious student."

In [20]:
# Raw string literal: in a plain '...' literal, \w is an invalid string escape and Python
# emits a warning for it; r'...' passes the backslash to the regex engine untouched.
# Pattern: one uppercase ASCII letter followed by at least one more word character,
# i.e. capitalized words of two or more characters (a lone capital such as 'X' is skipped).
tkn = RegexpTokenizer(r'[A-Z]\w+')

In [21]:
tkn.tokenize(sent)

['She', 'She']

### Stemming & Lemmatization

Remove the suffixes and find the base form of a word.

#### Stemming: Remove the suffixes

In [34]:
from nltk.stem import PorterStemmer

In [35]:
pr = PorterStemmer()

In [36]:
pr.stem('talking')

'talk'

Removed the 'ing' suffix.

In [37]:
pr.stem('happiness')

'happi'

In [38]:
pr.stem('geliyorlar')

'geliyorlar'

Did not work on a Turkish word: the Porter stemmer only applies English suffix rules.

In [39]:
pr.stem('welcome')

'welcom'

In [40]:
# English sample vocabulary (plurals, verb forms, derived nouns/adjectives) for the stemmer demo.
words = [
    'houses', 'trains', 'pens', 'cars',
    'eaten', 'sick', 'bought', 'selling',
    'sized', 'speech', 'rolling', 'marching',
    'identification', 'universal', 'beautiful', 'references',
]

In [41]:
# Run the Porter stemmer over every sample word, keeping the original order.
stems = list(map(pr.stem, words))

In [42]:
stems

['hous',
 'train',
 'pen',
 'car',
 'eaten',
 'sick',
 'bought',
 'sell',
 'size',
 'speech',
 'roll',
 'march',
 'identif',
 'univers',
 'beauti',
 'refer']

#### Lemmatization: Find the dictionary form (lemma)

In [43]:
from nltk.stem import WordNetLemmatizer

In [44]:
lm = WordNetLemmatizer()

In [48]:
# No part of speech is passed here, and the word comes back unchanged ('working').
# NOTE(review): WordNetLemmatizer treats its input as a noun unless told otherwise;
# lm.lemmatize('working', pos='v') should give 'work' - confirm.
lm.lemmatize('working') # Needs nltk.download('wordnet')

'working'

In [51]:
# Chains stemmer -> lemmatizer: pr.stem('happiness') yields 'happi', and the lemmatizer
# returns that string unchanged - presumably because 'happi' is not a WordNet entry.
lm.lemmatize(pr.stem('happiness'))

'happi'