# NLP Preparation Lesson

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample sentence used throughout the lesson. It mixes capitals, punctuation
# and accented letters (ő, ó, ö) so that each cleaning step below has
# something visible to change.
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [4]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

## 1. Lowercase everything

In [9]:
# str.lower() returns a new string; rebinding `original` makes the lowercased
# text the input for every later step.
# NOTE(review): after this cell the untouched sentence is no longer available
# under this name — use a separate variable if the raw text is needed later.
original = original.lower()
original

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

## 2. Remove accented characters and non-ASCII characters

In [10]:
import unicodedata

# Step 1: NFKD normalization, so an accented letter becomes a base letter
# followed by separate combining marks.
decomposed = unicodedata.normalize('NFKD', original)
# Step 2: encode to ASCII, silently dropping anything that cannot be
# represented (the combining marks and any other non-ASCII character).
ascii_bytes = decomposed.encode('ascii', 'ignore')
# Step 3: turn the remaining bytes back into a str.
original = ascii_bytes.decode('utf-8')
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 3. Remove special characters

In [11]:
import re

In [17]:
# Keep only lowercase letters, digits, apostrophes and whitespace; every other
# character (periods, commas, parentheses, ...) is deleted.
# The pattern is written as a raw string so that `\s` reaches the regex engine
# as-is instead of being parsed as an (invalid) Python string escape, which
# emits a warning on current Python versions.
original = re.sub(r"[^a-z0-9'\s]", '', original)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 4. Tokenize

In [18]:
import nltk

In [19]:
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x13f60e5b0>

In [30]:
# With return_str=True the tokens come back as one space-separated string
# (see the output: the possessive is split into "erdos ' s").
original = tokenize.tokenize(original, return_str=True)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 5. Stemming or Lemmatizing

### Stemming

In [31]:
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [32]:
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [33]:
ps.stem('house'), ps.stem('housing')

('hous', 'hous')

In [34]:
# Passing the whole sentence treats it as a single "word": in the output only
# the very end changes ("necessity" -> "necess"). The sentence has to be split
# into words and stemmed one word at a time instead.
ps.stem(original)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [35]:
ps.stem('contributed')

'contribut'

In [38]:
# Stem word by word: split on whitespace, run each piece through the Porter
# stemmer, then stitch the results back together with single spaces.
stems = list(map(ps.stem, original.split()))
' '.join(stems)

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### Lemmatize

In [43]:
#Run the first time
#nltk.download('all')

In [39]:
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [40]:
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [41]:
# Same four words as in the stemming demo. Here only the plural 'calls' is
# reduced to 'call'; 'calling' and 'called' come back unchanged (see output).
wnl.lemmatize('calling'), wnl.lemmatize('calls'), wnl.lemmatize('called'), wnl.lemmatize('call')

('calling', 'call', 'called', 'call')

In [44]:
wnl.lemmatize('house'), wnl.lemmatize('housing')

('house', 'housing')

In [45]:
ps.stem('mouse'), ps.stem('mice')

('mous', 'mice')

In [46]:
wnl.lemmatize('mouse'), wnl.lemmatize('mice')

('mouse', 'mouse')

In [47]:
# Lemmatize word by word and re-join with single spaces.
# Note the artifact in the output: "as" comes back as "a".
# NOTE(review): presumably the lemmatizer treats every word as a noun unless
# told otherwise, so "as" is handled like a plural — confirm in the NLTK docs.
lemmas = list(map(wnl.lemmatize, original.split()))
' '.join(lemmas)

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

## 6. Remove Stopwords

In [49]:
from nltk.corpus import stopwords

In [51]:
#Do this once
#nltk.download('stopwords')

In [52]:
# Load NLTK's English stopword list into a plain Python list and peek at the
# first ten entries.
stopwords_english = stopwords.words('english')
stopwords_english[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [53]:
len(stopwords_english)

179

In [54]:
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [55]:
# Treat the leftover single-letter token 'o' as a stopword.
# list.append adds unconditionally, so the list grows by one every time this
# cell runs (179 -> 180 in the outputs here).
# NOTE(review): check whether 'o' is already in NLTK's English list; if it is,
# this only adds a duplicate entry.
stopwords_english.append('o')

In [56]:
len(stopwords_english)

180

In [57]:
# Also treat the stray apostrophe token (left over from tokenizing "erdos's"
# and the quoted 'o') as a stopword.
# NOTE(review): append is not idempotent — re-running this cell adds another
# copy and changes the length reported below.
stopwords_english.append("'")

In [58]:
len(stopwords_english)

181

In [59]:
# Drop every token that appears in the (extended) stopword list.
# Membership is tested against a set built once instead of scanning the list
# for each token; the filtered result and its order are unchanged.
stopword_set = set(stopwords_english)
[word for word in original.split() if word not in stopword_set]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematicians',
 'contributed',
 'lot',
 'field',
 'erdos',
 'name',
 'contains',
 'hungarian',
 'letter',
 'double',
 'acute',
 'accent',
 'often',
 'incorrectly',
 'written',
 'erdos',
 'erdos',
 'either',
 'mistake',
 'typographical',
 'necessity']