Import the required libraries and download the NLTK resources

In [None]:
import pandas as pd
import nltk
import math

from collections import Counter
from operator import itemgetter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

# Fetch the NLTK data packages this notebook asks for, in the original order.
for nltk_resource in ('wordnet', 'punkt', 'stopwords'):
  nltk.download(nltk_resource)

Download and extract the English dataset

In [None]:
!wget http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
!tar -xf MovieSummaries.tar.gz

In [None]:
!python3 -m nltk.downloader wordnet
!unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/

Tokenize the English dataset with NLTK, build a TF-IDF index, and query it

In [None]:
# read as dataframe: one tab-separated row per movie -> (id, plot summary text)
df_english = pd.read_csv('/content/MovieSummaries/plot_summaries.txt', delimiter = '\t',names=['id','text'])
df_english.info()
# Work on the first 20 summaries only. .copy() makes this an independent frame,
# so the columns added below are not written into a slice of the full dataset
# (which is what triggers pandas' SettingWithCopyWarning).
df_english = df_english.head(20).copy()

# tokenize: split each summary into runs of word characters (punctuation dropped)
tokenizer = RegexpTokenizer(r'\w+')
raw_token_lists = [tokenizer.tokenize(content) for content in df_english['text']]

# remove stop words
# A new list is built per document instead of calling list.remove() while
# iterating the same list: removing during iteration skips the element that
# follows every removed one, so consecutive stop words were left behind.
# The comparison is done on the lower-cased token because the stop-word list is
# all lower case; the token itself keeps its original casing.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
token_list_nltk = [
  [word for word in text_tokens if word.lower() not in stop_words]
  for text_tokens in raw_token_lists
]

# add to data frame
df_english['tokenizedTextNLTK'] = token_list_nltk

# stemming (the Porter stemmer also lower-cases each token)
stemmer = PorterStemmer()
stem_list_nltk = [
  [stemmer.stem(word) for word in each_list]
  for each_list in token_list_nltk
]

# add to data frame
df_english['stemmedTextNLTK'] = stem_list_nltk

# lemmatize the stems; this is the representation the TF-IDF index is built on
lemmatizer = WordNetLemmatizer()
lemma_list_nltk = [
  [lemmatizer.lemmatize(word) for word in each_list]
  for each_list in stem_list_nltk
]

# add to data frame
df_english['lemmatizedTextNLTK'] = lemma_list_nltk

def count_list(lst, x):
    """Return how many elements of ``lst`` contain ``x``.

    Each element is tested with the ``in`` operator, so an element that
    contains ``x`` several times still contributes exactly one to the total.
    With a list of token lists this is the number of documents in which the
    token ``x`` appears.
    """
    return sum(1 for element in lst if x in element)

# Build the TF-IDF index.
#
# Result: tf_dics_list holds one entry per document; each entry is a list with
# one dict per vocabulary token:
#   {'keyword': token, 'tf': ..., 'idf': ..., 'tf-idf': ...}
# tf  = occurrences of the token in the document / document length
# idf = log(number of documents / number of documents containing the token)

# Vocabulary: every distinct lemma that occurs anywhere in the corpus.
all_tokens = set()
for doc_tokens in lemma_list_nltk:
  all_tokens.update(doc_tokens)

# Document frequency: in how many documents each token appears. set() makes a
# token count once per document regardless of how often it repeats there.
doc_freq = Counter()
for doc_tokens in lemma_list_nltk:
  doc_freq.update(set(doc_tokens))

# idf depends only on the token, so it is computed once per token instead of
# being recomputed for every (document, token) pair. Every vocabulary token
# occurs in at least one document, so doc_freq[token] is never zero.
num_docs = len(lemma_list_nltk)
idf_by_token = {
  token: math.log(num_docs / doc_freq[token])
  for token in all_tokens
}

tf_dics_list = []
for doc_tokens in lemma_list_nltk:
  term_counts = Counter(doc_tokens)  # one pass instead of list.count() per token
  doc_length = len(doc_tokens)
  doc_dic_list = []
  for token in all_tokens:
    # A document left with no tokens has tf 0 for every term rather than
    # raising ZeroDivisionError.
    tf = term_counts[token] / doc_length if doc_length else 0.0
    idf = idf_by_token[token]
    doc_dic_list.append(
        {
            'keyword':token,
            'tf':tf,
            'idf':idf,
            'tf-idf':tf * idf
        }
    )
  tf_dics_list.append(doc_dic_list)

# Interactive search: read a query, run it through the same tokenize ->
# stop-word removal -> stem -> lemmatize pipeline as the documents, score each
# document by the sum of the TF-IDF weights of the query terms, and print the
# best-scoring documents.
tokenizer = RegexpTokenizer(r'\w+')  # built once, not on every iteration
top_k = 5  # maximum number of results printed per query

print('Submit an empty query to stop.')
while True:

  query = input('Enter your query: ')
  # An empty query ends the loop so the cell can finish without interrupting
  # the kernel.
  if not query.strip():
    break

  # tokenize query
  query_tokenized = tokenizer.tokenize(query)

  # remove stop words (compared lower-cased; set membership, no list rebuild)
  query_removed_stop_words = [
    token for token in query_tokenized if token.lower() not in stop_words
  ]

  # stem
  query_stemmed = [stemmer.stem(token) for token in query_removed_stop_words]

  # lemma
  query_lemmatized = [lemmatizer.lemmatize(token) for token in query_stemmed]

  # A set gives constant-time lookups; a term repeated in the query still
  # contributes once per document, as before.
  query_terms = set(query_lemmatized)

  docs_scores = []
  for index_counter, doc in enumerate(tf_dics_list):
    doc_score = sum(dic['tf-idf'] for dic in doc if dic['keyword'] in query_terms)
    docs_scores.append({
        'index':index_counter,
        'score':doc_score
    })

  newlist = sorted(docs_scores, key=itemgetter('score'), reverse=True)
  # Slicing copes with a corpus smaller than top_k. 'index' is the position of
  # the document in tf_dics_list, hence the positional .iloc lookup.
  for result in newlist[:top_k]:
    print(df_english.iloc[result['index']])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      42303 non-null  int64 
 1   text    42303 non-null  object
dtypes: int64(1), object(1)
memory usage: 661.1+ KB


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter your query: go
id                                                              4018288
text                  Debbie's favorite band is Dream Street, and he...
tokenizedTextNLTK     [Debbie, favorite, band, Dream, Street, her, f...
stemmedTextNLTK       [debbi, favorit, band, dream, street, her, fav...
lemmatizedTextNLTK    [debbi, favorit, band, dream, street, her, fav...
Name: 17, dtype: object
id                                                              1480747
text                  {{plot}} Following the sudden death of Kid's f...
tokenizedTextNLTK     [plot, Following, sudden, death, Kid, father, ...
stemmedTextNLTK       [plot, follow, sudden, death, kid, father, pop...
lemmatizedTextNLTK    [plot, follow, sudden, death, kid, father, pop...
Name: 14, dtype: object
id                                                              1952976
text                  {{plot}} The film opens in 1974, as a young gi...
tokenizedTextNLTK     [plot, The, film, opens, 1974, young, girl, D