------------------------
#### Overview on NLTK

- Natural language tool kit
------------------------

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

#### Token
- A token is a piece of a whole, so a word is a token in a sentence, and a sentence is a token in a paragraph.
- Tokenization is the process of splitting a string into a list of tokens.

In [6]:
mystring = "My favorite color is blue"

In [7]:
mystring.split()

['My', 'favorite', 'color', 'is', 'blue']

In [9]:
mystring = "My favorite colors are blue, red, and green."
mystring.split()

['My', 'favorite', 'colors', 'are', 'blue,', 'red,', 'and', 'green.']

In [11]:
mystring = 'https://t.co/9z2J3P33Uc'
mystring.split()

['https://t.co/9z2J3P33Uc']

In [12]:
compare_list = ['https://t.co/9z2J3P33Uc',
               'laugh/cry',
               '😬😭😓🤢🙄😱',
               "world's problems",
               "@datageneral",
                "It's interesting",
               "don't spell my name right",
               'all-nighter',
                "My favorite color is blue",
                "My favorite colors are blue, red, and green."]

#### Method 1 : using word tokenizer of NLTK

In [13]:
from nltk.tokenize import word_tokenize

In [15]:
word_tokenize('My favorite colors are blue, red, and green.')

['My',
 'favorite',
 'colors',
 'are',
 'blue',
 ',',
 'red',
 ',',
 'and',
 'green',
 '.']

In [17]:
word_tokens = []

for sent in compare_list:
    sent_tokens = word_tokenize(sent)

    word_tokens.append(sent_tokens)

In [18]:
word_tokens

[['https', ':', '//t.co/9z2J3P33Uc'],
 ['laugh/cry'],
 ['😬😭😓🤢🙄😱'],
 ['world', "'s", 'problems'],
 ['@', 'datageneral'],
 ['It', "'s", 'interesting'],
 ['do', "n't", 'spell', 'my', 'name', 'right'],
 ['all-nighter'],
 ['My', 'favorite', 'color', 'is', 'blue'],
 ['My',
  'favorite',
  'colors',
  'are',
  'blue',
  ',',
  'red',
  ',',
  'and',
  'green',
  '.']]

#### Method 2 - Word Punct tokenizer

In [19]:
from nltk.tokenize import WordPunctTokenizer

In [20]:
punct_tokenizer = WordPunctTokenizer()

In [21]:
punct_tokens = []

for sent in compare_list:
    
    punct_tokens.append(punct_tokenizer.tokenize(sent))
punct_tokens

[['https', '://', 't', '.', 'co', '/', '9z2J3P33Uc'],
 ['laugh', '/', 'cry'],
 ['😬😭😓🤢🙄😱'],
 ['world', "'", 's', 'problems'],
 ['@', 'datageneral'],
 ['It', "'", 's', 'interesting'],
 ['don', "'", 't', 'spell', 'my', 'name', 'right'],
 ['all', '-', 'nighter'],
 ['My', 'favorite', 'color', 'is', 'blue'],
 ['My',
  'favorite',
  'colors',
  'are',
  'blue',
  ',',
  'red',
  ',',
  'and',
  'green',
  '.']]

#### Method 3 - Tweet tokenizer

In [22]:
from nltk.tokenize import TweetTokenizer

In [23]:
tweet_tokenizer = TweetTokenizer()

In [24]:
tweet_tokens = []

for sent in compare_list:
    
    print(tweet_tokenizer.tokenize(sent))
    
    tweet_tokens.append(tweet_tokenizer.tokenize(sent))

['https://t.co/9z2J3P33Uc']
['laugh', '/', 'cry']
['😬', '😭', '😓', '🤢', '🙄', '😱']
["world's", 'problems']
['@datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all-nighter']
['My', 'favorite', 'color', 'is', 'blue']
['My', 'favorite', 'colors', 'are', 'blue', ',', 'red', ',', 'and', 'green', '.']


#### Stop words

In [25]:
from nltk.corpus import stopwords

In [26]:
print(set(stopwords.words('english')))

{'t', 'against', 'under', 'haven', 'couldn', 'will', 'o', "haven't", "wouldn't", 'is', 'be', "you'll", 'that', 'do', 'into', 'any', 'll', 'itself', 'mightn', 'are', 'theirs', 'when', 'so', 's', 'yours', 'doing', 'before', 'own', 'am', "hadn't", "won't", 'me', "she's", "you're", "shan't", 'y', 'no', 'as', 'with', 'his', 'can', 'this', 'those', "didn't", 'hadn', "isn't", 'most', 'shouldn', 'and', 'these', 'very', 'don', 'at', 've', 'been', 'both', 'other', 'm', "wasn't", 'its', "doesn't", 'hers', 'won', 'yourself', 'the', 'ourselves', 'her', 'needn', 'were', 'for', 'what', 'same', 'wasn', 'few', 'ours', 'ain', 'there', 'below', 'some', 'from', 'between', 'whom', 'then', 'about', 're', 'while', 'a', "hasn't", 'my', 'wouldn', 'because', 'off', "needn't", 'to', 'such', 'have', 'didn', "mustn't", "weren't", 'or', 'of', 'in', 'having', 'up', 'themselves', 'on', 'you', 'ma', "should've", "you've", 'it', 'being', 'which', 'over', 'shan', "it's", 'them', 'should', 'doesn', 'after', 'aren', "shou

In [39]:
example_sent = "This is a sample sentence, showing off the stop words filtration."

In [40]:
word_tokens = word_tokenize(example_sent)

In [41]:
stop_words = set(stopwords.words('english'))

In [42]:
#list(stop_words)

In [43]:
filtered_sentence = [w for w in word_tokens if not w in list(stop_words)]

In [44]:
filtered_sentence

['This',
 'sample',
 'sentence',
 ',',
 'showing',
 'stop',
 'words',
 'filtration',
 '.']

#### Stemming

In [45]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [46]:
porter   = PorterStemmer()
lancaster= LancasterStemmer()
sno      = nltk.stem.SnowballStemmer('english')

In [47]:
word_list = ["run", "ran", "runner", "running"]

print("{0:20} {1:20} {2:20} {3:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))

for word in word_list:
    print("{0:20} {1:20} {2:20} {3:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word)))

Word                 Porter Stemmer       lancaster Stemmer    Snowball Stemmer    
run                  run                  run                  run                 
ran                  ran                  ran                  ran                 
runner               runner               run                  runner              
running              run                  run                  run                 


In [49]:
word_list = ["cats", "trouble", "troubling", "troubled", "troublesome"]

print("{0:20} {1:20} {2:20} {3:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))

for word in word_list:
    print("{0:20} {1:20} {2:20} {3:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word))) 

Word                 Porter Stemmer       lancaster Stemmer    Snowball Stemmer    
cats                 cat                  cat                  cat                 
trouble              troubl               troubl               troubl              
troubling            troubl               troubl               troubl              
troubled             troubl               troubl               troubl              
troublesome          troublesom           troublesom           troublesom          


In [50]:
#A list of words to be stemmed
word_list = ["friend", "friendship", "friends", "friendships","stabil","destabilize","misunderstanding","railroad","moonlight","football"]

print("{0:20} {1:20} {2:20} {3:20}".format("Word","Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
for word in word_list:
    print("{0:20} {1:20} {2:20} {3:20}".format(word, porter.stem(word), lancaster.stem(word), sno.stem(word)))

Word                 Porter Stemmer       lancaster Stemmer    Snowball Stemmer    
friend               friend               friend               friend              
friendship           friendship           friend               friendship          
friends              friend               friend               friend              
friendships          friendship           friend               friendship          
stabil               stabil               stabl                stabil              
destabilize          destabil             dest                 destabil            
misunderstanding     misunderstand        misunderstand        misunderstand       
railroad             railroad             railroad             railroad            
moonlight            moonlight            moonlight            moonlight           
football             footbal              footbal              footbal             


#### lemmatize

In [51]:
from nltk.stem import WordNetLemmatizer

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

In [52]:
word_list = ["friend", "friendship", "friends", "friendships","stabilize","destabilize","misunderstanding","railroad","moonlight","football"]

print("{0:20} {1:20}".format("Word","WordNetLemmatizer"))

for word in word_list:
    print("{0:20} {1:20} ".format(word, lemmatizer.lemmatize(word)))

Word                 WordNetLemmatizer   
friend               friend               
friendship           friendship           
friends              friend               
friendships          friendship           
stabilize            stabilize            
destabilize          destabilize          
misunderstanding     misunderstanding     
railroad             railroad             
moonlight            moonlight            
football             football             


#### ngrams

In [53]:
import nltk
from nltk.util import ngrams

In [54]:
text = 'Data science is a wonderful program, \
Data science is a land of opportunities,data science is about machine learning '

In [58]:
list(ngrams(nltk.word_tokenize(text), 3))

[('Data', 'science', 'is'),
 ('science', 'is', 'a'),
 ('is', 'a', 'wonderful'),
 ('a', 'wonderful', 'program'),
 ('wonderful', 'program', ','),
 ('program', ',', 'Data'),
 (',', 'Data', 'science'),
 ('Data', 'science', 'is'),
 ('science', 'is', 'a'),
 ('is', 'a', 'land'),
 ('a', 'land', 'of'),
 ('land', 'of', 'opportunities'),
 ('of', 'opportunities', ','),
 ('opportunities', ',', 'data'),
 (',', 'data', 'science'),
 ('data', 'science', 'is'),
 ('science', 'is', 'about'),
 ('is', 'about', 'machine'),
 ('about', 'machine', 'learning')]