# Tokenization

In [1]:
import nltk
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
tokens

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

# POS Tagging

In [2]:
tagged = nltk.pos_tag(tokens)
tagged[0:6]

[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN')]

# Stop words removal!

In [3]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
print(stop_words)

{'not', 'hasn', 'until', 'herself', 'into', 'here', 'because', 'it', 'itself', 'then', 'such', 'aren', 'very', 'isn', "haven't", 'only', 'whom', 'being', 'when', 'if', 'its', 'won', 'just', 'to', 'both', 'through', 've', 'who', 'weren', 'but', 'ours', 'hers', 'an', 'has', 'couldn', "wasn't", 'll', 'further', 'them', 'can', 'are', 'i', "you'd", "you're", "you'll", "shouldn't", 'theirs', 'against', 'you', "wouldn't", 'myself', 'o', 'their', 'what', 'as', 'haven', "didn't", 'my', 'any', "it's", "should've", 'all', 'this', 'wasn', 'of', 'before', 'where', "couldn't", 'ain', 'did', 'between', 'him', 'themselves', 'y', "you've", 'above', 'he', 'most', "aren't", "hasn't", 'yourselves', 'which', 'the', 'these', 'himself', 'with', 'each', 'a', 're', 'up', 'she', 'yours', 'off', 'after', 'too', "hadn't", 'wouldn', 'no', 'his', 'do', 'nor', 'shouldn', 'about', "mustn't", 'that', 'needn', 'in', "she's", 'few', 'doing', "needn't", 'while', 'we', 'me', 'again', 'on', 'shan', 'was', 'those', 'for', '

In [4]:
filtered_sent = []
for w in tokens:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized Sentence:", tokens)
print("Filtered Sentense:", filtered_sent)

Tokenized Sentence: ['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
Filtered Sentense: ['At', 'eight', "o'clock", 'Thursday', 'morning', 'Arthur', "n't", 'feel', 'good', '.']


# Stemming

In [5]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
py_stem = PorterStemmer()
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
py_word = word_tokenize (sentence)
for word in py_word:
    print (word, " : ", py_stem.stem(word))

At  :  at
eight  :  eight
o'clock  :  o'clock
on  :  on
Thursday  :  thursday
morning  :  morn
Arthur  :  arthur
did  :  did
n't  :  n't
feel  :  feel
very  :  veri
good  :  good
.  :  .


# Lemmatization

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokenization = nltk.word_tokenize(sentence)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))

Lemma for At is At
Lemma for eight is eight
Lemma for o'clock is o'clock
Lemma for on is on
Lemma for Thursday is Thursday
Lemma for morning is morning
Lemma for Arthur is Arthur
Lemma for did is did
Lemma for n't is n't
Lemma for feel is feel
Lemma for very is very
Lemma for good is good
Lemma for . is .


# Term Frequency

In [7]:
import nltk
import re
import heapq
import numpy as np
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."

In [8]:
data = nltk.sent_tokenize(sentence)

In [9]:
up_count = {}
for d in data:
    words = nltk.word_tokenize(d)
    for word in words:
        if word not in up_count.keys():
            up_count[word] = 1
        else:
            up_count[word] += 1

In [10]:
top_features = heapq.nlargest(9,up_count,key=up_count.get)

In [11]:
tf = {}
for word in top_features:
    doc_tf = []
    for d in data:
        frequency = 0
        for w in nltk.word_tokenize(d):
            if word == w:
                frequency += 1
        tf_word = frequency/len(nltk.word_tokenize(d))
        doc_tf.append(tf_word)
    tf[word] = doc_tf

In [12]:
tf

{'At': [0.07692307692307693],
 'eight': [0.07692307692307693],
 "o'clock": [0.07692307692307693],
 'on': [0.07692307692307693],
 'Thursday': [0.07692307692307693],
 'morning': [0.07692307692307693],
 'Arthur': [0.07692307692307693],
 'did': [0.07692307692307693],
 "n't": [0.07692307692307693]}

# Inverse Document Frequency

In [13]:
idf = {}
for word in top_features:
    doc_count = 0
    for d in data:
        if word in nltk.word_tokenize(d):
            doc_count += 1
    idf[word] = np.log(len(data)/(1+doc_count))

In [14]:
idf

{'At': -0.6931471805599453,
 'eight': -0.6931471805599453,
 "o'clock": -0.6931471805599453,
 'on': -0.6931471805599453,
 'Thursday': -0.6931471805599453,
 'morning': -0.6931471805599453,
 'Arthur': -0.6931471805599453,
 'did': -0.6931471805599453,
 "n't": -0.6931471805599453}