***PRE-PROCESSING***

In [None]:
#TOKENIZATION
# Fetch the Punkt sentence-tokenizer data that sent_tokenize / word_tokenize load at call time.
import nltk
# 'punkt' is the resource name older NLTK releases look up.
nltk.download('punkt')
# Recent NLTK releases (3.9 and later, as far as I know) look for 'punkt_tab'
# instead and raise LookupError without it, so fetch it too; the extra download
# is harmless on older versions.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Sample paragraph used by every later cell; written one sentence per line and
# joined by implicit string concatenation, so the value is a single string.
para = (
    "I am lost form the light, who was born from the shadows. "
    "I am the rightful heir of death and bastard son of fate. "
    "Wherever I go, ruin follows. "
    "If you had any sense, you would have run away as soon as you saw me."
)

In [None]:
# Split the paragraph into sentences (used again when building the Word2Vec corpus).
sent_tokens = sent_tokenize(para)
print("Sentence Tokenized : ", sent_tokens)

# Split the same paragraph into word / punctuation tokens for the cleaning steps below.
word_tokens = word_tokenize(para)
print("Word Tokenized : ", word_tokens)

Sentence Tokenized :  ['I am lost form the light, who was born from the shadows.', 'I am the rightful heir of death and bastard son of fate.', 'Wherever I go, ruin follows.', 'If you had any sense, you would have run away as soon as you saw me.']
Word Tokenized :  ['I', 'am', 'lost', 'form', 'the', 'light', ',', 'who', 'was', 'born', 'from', 'the', 'shadows', '.', 'I', 'am', 'the', 'rightful', 'heir', 'of', 'death', 'and', 'bastard', 'son', 'of', 'fate', '.', 'Wherever', 'I', 'go', ',', 'ruin', 'follows', '.', 'If', 'you', 'had', 'any', 'sense', ',', 'you', 'would', 'have', 'run', 'away', 'as', 'soon', 'as', 'you', 'saw', 'me', '.']


***Stopwords removal***

In [None]:
#stopword removal
# Drop English stop words from word_tokens; punctuation tokens are left in place.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print("Stop Words : ", stop_words)

# The stop-word list is all lowercase, so compare the lowercased token;
# otherwise capitalised stop words such as 'I' and 'If' slip through.
# The surviving tokens keep their original casing.
nstop = [w for w in word_tokens if w.lower() not in stop_words]

print("After removing stopwords :", nstop)

Stop Words :  {'into', 'myself', 'such', 'is', 'we', 'through', 'against', 'over', "needn't", 'between', 'an', 'whom', 'had', "you're", 'they', 'below', 'just', 'i', 'hers', 'won', 'because', 'themselves', 'or', 'your', 'does', 'so', 'until', 'aren', 'are', 'to', 'himself', 'other', "you'd", 'been', 'my', 'on', 'who', 'their', 'doesn', 'only', 'o', 'while', 'd', 'ain', "haven't", 'here', 'both', 'weren', 'these', 'off', "don't", 'once', 'do', 'them', 'with', 'those', 'no', "won't", 'all', 'why', 'shouldn', 'his', 'during', 'for', 've', "she's", 'down', 'there', "hasn't", 're', "should've", "isn't", 'isn', 'will', 'was', 'by', "wouldn't", "it's", 'before', 'm', 'any', 'me', 'he', "shan't", 'where', 'in', "doesn't", 'yourself', 'have', 'at', 's', "didn't", 'when', 'can', 'further', 'being', 'now', 'than', 'it', 'few', 'the', 'and', "wasn't", 'own', 'haven', 'after', 'wouldn', 'under', 'each', 'more', 'am', "you've", 'if', 'not', 'theirs', 'don', 'same', 'then', "weren't", 'some', 'yours'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


***Stemming and Lemmatization***

In [None]:
# Stemming: reduce every stop-word-filtered token to its Porter stem,
# printing each token next to its stem as we go.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed = []
for token in nstop:
    root = stemmer.stem(token)
    print(token, " : ", root)
    stemmed.append(root)

print("Stemmed : ", stemmed)

I  :  i
lost  :  lost
form  :  form
light  :  light
,  :  ,
born  :  born
shadows  :  shadow
.  :  .
I  :  i
rightful  :  right
heir  :  heir
death  :  death
bastard  :  bastard
son  :  son
fate  :  fate
.  :  .
Wherever  :  wherev
I  :  i
go  :  go
,  :  ,
ruin  :  ruin
follows  :  follow
.  :  .
If  :  if
sense  :  sens
,  :  ,
would  :  would
run  :  run
away  :  away
soon  :  soon
saw  :  saw
.  :  .
Stemmed :  ['i', 'lost', 'form', 'light', ',', 'born', 'shadow', '.', 'i', 'right', 'heir', 'death', 'bastard', 'son', 'fate', '.', 'wherev', 'i', 'go', ',', 'ruin', 'follow', '.', 'if', 'sens', ',', 'would', 'run', 'away', 'soon', 'saw', '.']


In [None]:
#lemmatization
# Map each stop-word-filtered token to its WordNet lemma.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmetizer = WordNetLemmatizer()
lemm = []

# Lemmatize the filtered tokens (nstop), not the Porter stems: stems such as
# 'wherev' or 'sens' are not dictionary words, so the lemmatizer either leaves
# them untouched or mangles them further ('sens' came out as 'sen').
for w in nstop:
    # Lowercase first so the lemmas stay lowercase, as the stemmed input was.
    lemma = lemmetizer.lemmatize(w.lower())
    print(w, ":", lemma)
    lemm.append(lemma)

print("After lemmatization: ", lemm)

[nltk_data] Downloading package wordnet to /root/nltk_data...


i : i
lost : lost
form : form
light : light
, : ,
born : born
shadow : shadow
. : .
i : i
right : right
heir : heir
death : death
bastard : bastard
son : son
fate : fate
. : .
wherev : wherev
i : i
go : go
, : ,
ruin : ruin
follow : follow
. : .
if : if
sens : sen
, : ,
would : would
run : run
away : away
soon : soon
saw : saw
. : .
After lemmatization:  ['i', 'lost', 'form', 'light', ',', 'born', 'shadow', '.', 'i', 'right', 'heir', 'death', 'bastard', 'son', 'fate', '.', 'wherev', 'i', 'go', ',', 'ruin', 'follow', '.', 'if', 'sen', ',', 'would', 'run', 'away', 'soon', 'saw', '.']


***WORD2VEC***

In [None]:
!pip install gensim
from gensim.models import Word2Vec



In [None]:
# Build the training corpus: one list of lowercased, stop-word-filtered tokens
# per sentence (punctuation tokens are kept, as in the earlier cells).
tokenized_sentences = [
    [word for word in nltk.word_tokenize(sentence.lower()) if word not in stop_words]
    for sentence in sent_tokens
]
print("Word Tokenized: ", tokenized_sentences)

# min_count=1 keeps every token, since each appears only once or twice here.
# workers=1 removes thread-scheduling jitter so the similarity outputs below are
# repeatable between runs; seed=1 is gensim's default, spelled out for clarity.
# (Across interpreter launches, gensim also requires PYTHONHASHSEED to be fixed.)
model = Word2Vec(sentences=tokenized_sentences, min_count=1, seed=1, workers=1)

Word Tokenized:  [['lost', 'form', 'light', ',', 'born', 'shadows', '.'], ['rightful', 'heir', 'death', 'bastard', 'son', 'fate', '.'], ['wherever', 'go', ',', 'ruin', 'follows', '.'], ['sense', ',', 'would', 'run', 'away', 'soon', 'saw', '.']]


In [None]:
# Show the vocabulary entries ranked as most similar to 'death'; the cell
# displays a list of (word, score) pairs.
# NOTE(review): the model was trained on only four short sentences, so these
# scores are presumably close to noise — treat this as an API demo.
model.wv.similar_by_word('death')

[('soon', 0.12812119722366333),
 ('son', 0.10947464406490326),
 ('shadows', 0.10898242145776749),
 ('lost', 0.09931963682174683),
 ('follows', 0.09614686667919159),
 ('would', 0.0863887295126915),
 ('.', 0.06285606324672699),
 ('born', 0.05057644471526146),
 ('rightful', 0.026872780174016953),
 ('light', 0.020000817254185677)]

In [None]:
# List every vocabulary entry that is closer to 'shadows' than 'light' is.
# `closer_than` is the current name for this query; `words_closer_than` is a
# deprecated alias in gensim 4 and emits a warning when called.
model.wv.closer_than('shadows', 'light')

  model.wv.words_closer_than('shadows', 'light')


['.',
 'saw',
 'bastard',
 'form',
 'born',
 'death',
 'son',
 'soon',
 'fate',
 'wherever',
 'go',
 'ruin',
 'follows',
 'sense',
 'would',
 'run',
 'lost']