## Importing libraries

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
resources = ['stopwords','wordnet','punkt','averaged_perceptron_tagger']
nltk.download(resources)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
## Spacy pre processing
import spacy
import regex as re
import string

## Loading data

In [18]:
df = pd.read_csv('/content/movies.csv')
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [5]:
df.shape

(100, 5)

## Data Cleaning and Pre-processing

### Using Spacy

In [19]:
from spacy.lang.en.stop_words import STOP_WORDS

spacy_nlp = spacy.load('en_core_web_sm')

#create list of punctuations and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.

def spacy_tokenizer(sentence):

    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    #remove unwanted lines starting from special charcters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)

    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)

    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)

    #creating token object
    tokens = spacy_nlp(sentence)

    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]

    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]

    #return tokens
    return tokens

In [20]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [21]:
df['wiki_plot_tokenized'] = df['wiki_plot'].map(lambda x: spacy_tokenizer(x))

In [9]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,wiki_plot_tokenized
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t...","[day, daughter, wedding, vito, corleone, hear,..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker...","[banker, andy, dufresne, convict, murder, wife..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...,"[germans, polish, jews, kraków, ghetto, world,..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat...","[brief, scene, age, overweight, italian, ameri..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1...","[early, december, american, expatriate, rick, ..."


Unnamed: 0,wiki_plot,wiki_plot_tokenized
0,"On the day of his only daughter's wedding, Vit...","[day, daughter, wedding, vito, corleone, hear,..."
1,"In 1947, banker Andy Dufresne is convicted of ...","[banker, andy, dufresne, convict, murder, wife..."
2,"In 1939, the Germans move Polish Jews into the...","[germans, polish, jews, kraków, ghetto, world,..."
3,"In a brief scene in 1964, an aging, overweight...","[brief, scene, age, overweight, italian, ameri..."
4,It is early December 1941. American expatriate...,"[early, december, american, expatriate, rick, ..."


### Using NLTK


In [52]:
data = df.loc[:, ['wiki_plot', 'wiki_plot_tokenized']]
data.columns = ['plot', 'sp_token']
data.head()

Unnamed: 0,plot,sp_token
0,"On the day of his only daughter's wedding, Vit...","[day, daughter, wedding, vito, corleone, hear,..."
1,"In 1947, banker Andy Dufresne is convicted of ...","[banker, andy, dufresne, convict, murder, wife..."
2,"In 1939, the Germans move Polish Jews into the...","[germans, polish, jews, kraków, ghetto, world,..."
3,"In a brief scene in 1964, an aging, overweight...","[brief, scene, age, overweight, italian, ameri..."
4,It is early December 1941. American expatriate...,"[early, december, american, expatriate, rick, ..."


In [47]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')



from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn

lemmatizer = WordNetLemmatizer()

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J']: wn.ADJ
tag_map['V']: wn.VERB
tag_map['R']: wn.ADV

In [48]:
def lemmatize_sent(sentence):
    sentence = sentence.lower()
    clean_sentence = re.sub('[^a-zA-Z\s]','', sentence)
    words = clean_sentence.split()
    fil_words = [ word for word in words if word not in stop_words]
    new_sentence = ' '.join(fil_words)
    tokens = word_tokenize(new_sentence)
    tagged_tokens = pos_tag(tokens)
    final_words = []
    for i in range(len(tokens)):
        token, tag = tagged_tokens[i]
        word_final = lemmatizer.lemmatize(token, tag_map[tag[0]])
        final_words.append(word_final)
    # final_words = ' '.join(final_words)

    return final_words

In [53]:
data['nl_token'] = data['plot'].map(lambda x: lemmatize_sent(x))
data.head()

Unnamed: 0,plot,sp_token,nl_token
0,"On the day of his only daughter's wedding, Vit...","[day, daughter, wedding, vito, corleone, hear,...","[day, daughter, wedding, vito, corleone, hears..."
1,"In 1947, banker Andy Dufresne is convicted of ...","[banker, andy, dufresne, convict, murder, wife...","[banker, andy, dufresne, convicted, murdering,..."
2,"In 1939, the Germans move Polish Jews into the...","[germans, polish, jews, kraków, ghetto, world,...","[german, move, polish, jew, krakw, ghetto, wor..."
3,"In a brief scene in 1964, an aging, overweight...","[brief, scene, age, overweight, italian, ameri...","[brief, scene, aging, overweight, italian, ame..."
4,It is early December 1941. American expatriate...,"[early, december, american, expatriate, rick, ...","[early, december, american, expatriate, rick, ..."


#### Comparision of methods

In [78]:
def differ(token_n, token_s):
    unique = [word for word in token_n if word not in token_s]

    # unique
    print(len(token_s) - len(token_n))
    print(len(unique))

token_s = data.sp_token[1]
token_n = data.nl_token[1]

set_n = set(token_n)
set_s = set(token_s)

differ(set_n, set_s)


-32
82


In [73]:
def differ(token_n, token_s):
    set_n = set(token_n)
    set_s = set(token_s)

    unique_in_n = set_n - set_s  # Elements unique to token_n
    unique_in_s = set_s - set_n  # Elements unique to token_s

    print("Unique elements in token_n:", len(unique_in_n))
    print("Unique elements in token_s:", len(unique_in_s))

# Example usage


differ(token_n, token_s)

Unique elements in token_n: 2
Unique elements in token_s: 2


In [81]:
set_n = set(token_n)
set_s = set(token_s)

unique_in_n = set_n - set_s  # Elements unique to token_n
unique_in_s = set_s - set_n  # Elements unique to token_s
print(unique_in_n
, '\n', unique_in_s)

{'sister', 'includes', 'several', 'taxed', 'attacked', 'complaining', 'assaulted', 'undercutting', 'living', 'writing', 'move', 'redding', 'buried', 'fifty', 'threatens', 'paroled', 'ever', 'overhears', 'pas', 'brook', 'containing', 'call', 'reunited', 'commits', 'explains', 'asked', 'next', 'remembering', 'sentenced', 'bog', 'ged', 'alias', 'reveals', 'learns', 'implying', 'not', 'decaying', 'two', 'exploiting', 'murdering', 'stephen', 'becomes', 'convicted', 'escaped', 'resulting', 'get', 'violates', 'never', 'another', 'sends', 'laundered', 'worse', 'elli', 'sent', 'admitting', 'asking', 'removing', 'relents', 'procures', 'serving', 'back', 'used', 'befriends', 'claimed', 'using', 'including', 'take', 'can', 'released', 'incarcerated', 'dug', 'hanging', 'receives', 'dismisses', 'reassigns', 'nortons', 'discovers', 'empty', 'receiving', 'six', 'following', 'managing'} 
 {'reassign', 'ask', 'bogs', 'dismisse', 'violate', 'bad', 'follow', 'admit', 'serve', 'claim', 'use', 'discover', '

## Bulding Word dictionary

In [72]:
data.head()

Unnamed: 0,plot,sp_token,nl_token
0,"On the day of his only daughter's wedding, Vit...","[day, daughter, wedding, vito, corleone, hear,...","[day, daughter, wedding, vito, corleone, hears..."
1,"In 1947, banker Andy Dufresne is convicted of ...","[banker, andy, dufresne, convict, murder, wife...","[banker, andy, dufresne, convicted, murdering,..."
2,"In 1939, the Germans move Polish Jews into the...","[germans, polish, jews, kraków, ghetto, world,...","[german, move, polish, jew, krakw, ghetto, wor..."
3,"In a brief scene in 1964, an aging, overweight...","[brief, scene, age, overweight, italian, ameri...","[brief, scene, aging, overweight, italian, ame..."
4,It is early December 1941. American expatriate...,"[early, december, american, expatriate, rick, ...","[early, december, american, expatriate, rick, ..."


In [101]:
from gensim import corpora

In [102]:
movie_plot = data.nl_token

In [106]:
corpus = [dictionary.doc2bow(desc) for desc in movie_plot]
word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in corpus[0:3]]

In [113]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7c5c8a4bac20>

### Bulding a Tfidif model

In [111]:
import gensim

In [112]:
movie_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[corpus], id2word=dictionary, num_topics=300)

#Serialize and Store the corpus locally for easy retrival whenver required.
gensim.corpora.MmCorpus.serialize('movie_tfidf_model_mm', movie_tfidf_model[corpus])
gensim.corpora.MmCorpus.serialize('movie_lsi_model_mm',movie_lsi_model[movie_tfidf_model[corpus]])

#Load the indexed corpus
movie_tfidf_corpus = gensim.corpora.MmCorpus('movie_tfidf_model_mm')
movie_lsi_corpus = gensim.corpora.MmCorpus('movie_lsi_model_mm')

#Load the MatrixSimilarity
from gensim.similarities import MatrixSimilarity
movie_index = MatrixSimilarity(movie_lsi_corpus, num_features = movie_lsi_corpus.num_terms)

### Semantic search

In [117]:
from operator import itemgetter

def search_similar_movies(search_term):

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = movie_tfidf_model[query_bow]
    query_lsi = movie_lsi_model[query_tfidf]

    movie_index.num_best = 5

    movies_list = movie_index[query_lsi]

    movies_list.sort(key=itemgetter(1), reverse=True)
    movie_names = []

    for j, movie in enumerate(movies_list):

        movie_names.append (
            {
                'Relevance': round((movie[1] * 100),2),
                'Movie Title': df['title'][movie[0]],
                'Movie Plot': df['wiki_plot'][movie[0]]
            }

        )
        if j == (movie_index.num_best-1):
            break

    return pd.DataFrame(movie_names, columns=['Relevance','Movie Title','Movie Plot'])

In [118]:
search_similar_movies('violence protest march')

Unnamed: 0,Relevance,Movie Title,Movie Plot
0,73.36,Gandhi,The screenplay of Gandhi is available as a pub...
1,47.85,A Clockwork Orange,"In futuristic London, Alex DeLarge is the lead..."
2,34.47,Amadeus,The story begins in 1823 as the elderly Antoni...
3,24.01,The Best Years of Our Lives,"After World War II, Fred Derry (Dana Andrews),..."
4,15.03,All Quiet on the Western Front,\n\n\n\n\nThis section's plot summary may be t...
