------------------------
#### Overview on NLTK

- Natural language tool kit
------------------------

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

#### Token
- A token is a piece of a whole, so a word is a token in a sentence, and a sentence is a token in a paragraph.
- Tokenization is the process of splitting a string into a list of tokens.

In [6]:
mystring = "My favorite color is blue"

In [7]:
mystring.split()

['My', 'favorite', 'color', 'is', 'blue']

In [9]:
mystring = "My favorite colors are blue, red, and green."
mystring.split()

['My', 'favorite', 'colors', 'are', 'blue,', 'red,', 'and', 'green.']

In [11]:
mystring = 'https://t.co/9z2J3P33Uc'
mystring.split()

['https://t.co/9z2J3P33Uc']

In [12]:
compare_list = ['https://t.co/9z2J3P33Uc',
               'laugh/cry',
               'ðŸ˜¬ðŸ˜­ðŸ˜“ðŸ¤¢ðŸ™„ðŸ˜±',
               "world's problems",
               "@datageneral",
                "It's interesting",
               "don't spell my name right",
               'all-nighter',
                "My favorite color is blue",
                "My favorite colors are blue, red, and green."]

#### Method 1 : using word tokenizer of NLTK

In [13]:
from nltk.tokenize import word_tokenize

In [15]:
word_tokenize('My favorite colors are blue, red, and green.')

['My',
 'favorite',
 'colors',
 'are',
 'blue',
 ',',
 'red',
 ',',
 'and',
 'green',
 '.']

In [17]:
word_tokens = []

for sent in compare_list:
    sent_tokens = word_tokenize(sent)

    word_tokens.append(sent_tokens)

In [18]:
word_tokens

[['https', ':', '//t.co/9z2J3P33Uc'],
 ['laugh/cry'],
 ['ðŸ˜¬ðŸ˜­ðŸ˜“ðŸ¤¢ðŸ™„ðŸ˜±'],
 ['world', "'s", 'problems'],
 ['@', 'datageneral'],
 ['It', "'s", 'interesting'],
 ['do', "n't", 'spell', 'my', 'name', 'right'],
 ['all-nighter'],
 ['My', 'favorite', 'color', 'is', 'blue'],
 ['My',
  'favorite',
  'colors',
  'are',
  'blue',
  ',',
  'red',
  ',',
  'and',
  'green',
  '.']]

#### Method 2 - Word Punct tokenizer

In [19]:
from nltk.tokenize import WordPunctTokenizer

In [20]:
punct_tokenizer = WordPunctTokenizer()

In [21]:
punct_tokens = []

for sent in compare_list:
    
    punct_tokens.append(punct_tokenizer.tokenize(sent))
punct_tokens

[['https', '://', 't', '.', 'co', '/', '9z2J3P33Uc'],
 ['laugh', '/', 'cry'],
 ['ðŸ˜¬ðŸ˜­ðŸ˜“ðŸ¤¢ðŸ™„ðŸ˜±'],
 ['world', "'", 's', 'problems'],
 ['@', 'datageneral'],
 ['It', "'", 's', 'interesting'],
 ['don', "'", 't', 'spell', 'my', 'name', 'right'],
 ['all', '-', 'nighter'],
 ['My', 'favorite', 'color', 'is', 'blue'],
 ['My',
  'favorite',
  'colors',
  'are',
  'blue',
  ',',
  'red',
  ',',
  'and',
  'green',
  '.']]

#### Method 3 - Tweet tokenizer

In [22]:
from nltk.tokenize import TweetTokenizer

In [23]:
tweet_tokenizer = TweetTokenizer()

In [24]:
tweet_tokens = []

for sent in compare_list:
    
    print(tweet_tokenizer.tokenize(sent))
    
    tweet_tokens.append(tweet_tokenizer.tokenize(sent))

['https://t.co/9z2J3P33Uc']
['laugh', '/', 'cry']
['ðŸ˜¬', 'ðŸ˜­', 'ðŸ˜“', 'ðŸ¤¢', 'ðŸ™„', 'ðŸ˜±']
["world's", 'problems']
['@datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all-nighter']
['My', 'favorite', 'color', 'is', 'blue']
['My', 'favorite', 'colors', 'are', 'blue', ',', 'red', ',', 'and', 'green', '.']


#### Stop words

In [25]:
from nltk.corpus import stopwords

In [26]:
print(set(stopwords.words('english')))

{'t', 'against', 'under', 'haven', 'couldn', 'will', 'o', "haven't", "wouldn't", 'is', 'be', "you'll", 'that', 'do', 'into', 'any', 'll', 'itself', 'mightn', 'are', 'theirs', 'when', 'so', 's', 'yours', 'doing', 'before', 'own', 'am', "hadn't", "won't", 'me', "she's", "you're", "shan't", 'y', 'no', 'as', 'with', 'his', 'can', 'this', 'those', "didn't", 'hadn', "isn't", 'most', 'shouldn', 'and', 'these', 'very', 'don', 'at', 've', 'been', 'both', 'other', 'm', "wasn't", 'its', "doesn't", 'hers', 'won', 'yourself', 'the', 'ourselves', 'her', 'needn', 'were', 'for', 'what', 'same', 'wasn', 'few', 'ours', 'ain', 'there', 'below', 'some', 'from', 'between', 'whom', 'then', 'about', 're', 'while', 'a', "hasn't", 'my', 'wouldn', 'because', 'off', "needn't", 'to', 'such', 'have', 'didn', "mustn't", "weren't", 'or', 'of', 'in', 'having', 'up', 'themselves', 'on', 'you', 'ma', "should've", "you've", 'it', 'being', 'which', 'over', 'shan', "it's", 'them', 'should', 'doesn', 'after', 'aren', "shou

In [31]:
example_sent = "This is a sample sentence, showing off the stop words filtration."

In [32]:
stop_words = set(stopwords.words('english'))

In [36]:
#list(stop_words)

In [33]:
filtered_sentence = [w for w in word_tokens if not w in list(stop_words]

TypeError: unhashable type: 'list'