In [77]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
from math import sqrt
from collections import Counter

# from main import plot_summaries, 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/oriol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/oriol/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/oriol/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [78]:
def dot_product(v1, v2):
    print(set(v1) & set(v2))
    return sum(v1[key] * v2[key] for key in set(v1) & set(v2))

def magnitude(vector):
    return sqrt(sum(value**2 for value in vector.values()))

def cosine_similarity(v1, v2):
    return dot_product(v1, v2) / (magnitude(v1) * magnitude(v2))

In [98]:
def is_name(word):
    # Use WordNet to check if the word is a proper noun (Assuming person names are proper nouns)
    synsets = wordnet.synsets(word)

    for synset in synsets:
        if synset.pos() == 'n':
            return True
    
    return False

def is_verb(word):
    # Use WordNet to check if the word is a verb
    synsets = wordnet.synsets(word)
    for synset in synsets:
        if synset.pos() == 'v':
            return True
    return False

def filter_words(word):
    # Sometimes people's name can also be verbs, therefore we should remove them first
    # For instance, to carol: to sing especially in a joyful manner
    # Also it may induce similarities which are not there. There are both Harry's in Harry Potter and Mamma Mia without having anything in common
    # TODO: function not working properly

    return is_verb(word) # or is_name(word)

def get_infinitive_form(verb):
    lemmatizer = WordNetLemmatizer()
    infinitive_form = lemmatizer.lemmatize(verb, pos='v')
    return infinitive_form

def preprocess_text(text):
    # Tokenize the text and while remove all punctuation
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    
    # Lowercase everything and filter out stopwords
    words = [word.lower() for word in words if word.lower() not in stop_words]

    # Sometimes people's name can also be verbs, therefore we should remove them first
    # For instance, to carol: to sing especially in a joyful manner
    # Also it may induce similarities which are not there. There are both Harry's in Harry Potter and Mamma Mia without having anything in common
    # TODO: function not working properly
    # words = list(filter(lambda word: not is_name(word), words))
    
    # Separate verbs and others to obtain the infinitive form and be able to generalizes
    verbs = list(filter(is_verb, words))
    others = list(filter(lambda word: not is_verb(word), words))

    # Transform verbs into their infinitive form
    verbs = [get_infinitive_form(verb) for verb in verbs]

    # Count words appearance and keep the 100 most common
    words = Counter(verbs + others).most_common(100)
    print(words)

    return dict(words)


In [99]:
'''
Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia.
Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.
'''

data_folder = './data/MovieSummaries/'
plot_summaries = pd.read_csv(data_folder + 'plot_summaries.txt', sep='\t', header=None)
plot_summaries.columns = ['wiki_movie_id', 'plot']
plot_summaries.set_index('wiki_movie_id')

plot_summaries['count_words'] = plot_summaries.apply(lambda x: len(str(x['plot']).split()), axis=1)

# Drop plots with less than 200 words
plot_summaries.drop(plot_summaries[plot_summaries['count_words'] < 200].index, inplace= True)

# Preprocess the synopses
# The Hunger Games
preprocessed_synopsis1 = preprocess_text(plot_summaries.query("wiki_movie_id == 31186339").iloc[0]['plot'])

# Fast and Furious: Tokyo Drift
preprocessed_synopsis2 = preprocess_text(plot_summaries.query("wiki_movie_id == 2913859").iloc[0]['plot'])

# Harry Potter: Deathly Hallows Part I
preprocessed_synopsis3 = preprocess_text(plot_summaries.query("wiki_movie_id == 9834441").iloc[0]['plot'])

# Mamma mia
preprocessed_synopsis4 = preprocess_text(plot_summaries.query("wiki_movie_id == 8425661").iloc[0]['plot'])


similarity_1 = cosine_similarity(preprocessed_synopsis1, preprocessed_synopsis2)
similarity_2 = cosine_similarity(preprocessed_synopsis1, preprocessed_synopsis3)
similarity_3 = cosine_similarity(preprocessed_synopsis1, preprocessed_synopsis4)
similarity_4 = cosine_similarity(preprocessed_synopsis3, preprocessed_synopsis4)

[('katniss', 24), ('peeta', 16), ('rue', 11), ('district', 10), ('kill', 6), ('game', 5), ('12', 5), ('crane', 4), ('tributes', 4), ('haymitch', 4), ('cato', 4), ('provide', 3), ('warn', 3), ('medicine', 3), ('run', 3), ('find', 3), ('shoot', 3), ('change', 3), ('cleave', 3), ('tribute', 3), ('nightlock', 3), ('hunger', 2), ('take', 2), ('give', 2), ('career', 2), ('win', 2), ('love', 2), ('gain', 2), ('sponsor', 2), ('gift', 2), ('televise', 2), ('supply', 2), ('call', 2), ('tree', 2), ('draw', 2), ('gather', 2), ('hear', 2), ('spear', 2), ('die', 2), ('riot', 2), ('snow', 2), ('make', 2), ('rule', 2), ('proclaim', 2), ('feast', 2), ('thresh', 2), ('time', 2), ('berry', 2), ('capitol', 2), ('past', 2), ('must', 2), ('boy', 2), ('girl', 2), ('death', 2), ('arena', 2), ('survivor', 2), ('first', 2), ('however', 2), ('half', 2), ('away', 2), ('cornucopia', 2), ('alliance', 2), ('poisonous', 2), ('tracker', 2), ('jacker', 2), ('around', 2), ('instead', 2), ('arrow', 2), ('11', 2), ('presi

### Cosine similarity interpretation and validation
If cosine similarity is equal to 1 means both plots are identical. On the other side, if cosine similarity is equal to 0 means the plots have nothing in common.

Overviewing the below similarities, it makes sense that "The Hunger Games" is more similar to a film such as "Harry Potter: Deathly Hallows Part 1" for more common actions such as "kill", "reveal" and "take" than to Fast&Furious and Mamma Mia. This actions may appear also in the latter movies but with less relevance.

And unusual phenomena is observed when comparing Harry Potter to Mamma Mia. Using the user's experience, one can argue these films are not very much alike. Investigating the issue further, we discover that this high value on similarity is due to one of the main protagonist for both films are called Harry.

Filtering people's name might be necessary to refine the algorithm.

In [100]:
print(f"Cosine Similarity The Hunger Games - F&F:Tokyo Drift: {similarity_1}")
print(f"Cosine Similarity The Hunger Games - HP: Deathly Hallows P1: {similarity_2}")
print(f"Cosine Similarity The Hunger Games - Mamma mia: {similarity_3}")
print(f"Cosine Similarity HP: Death Hollows P1 - Mamma mia: {similarity_4}")

Cosine Similarity The Hunger Games - F&F:Tokyo Drift: 0.026975877067461216
Cosine Similarity The Hunger Games - HP: Deathly Hallows P1: 0.04661951635374365
Cosine Similarity The Hunger Games - Mamma mia: 0.02453380541675334
Cosine Similarity HP: Death Hollows P1 - Mamma mia: 0.08147550915738545


### Similarity computation by year
We are interested on how much similar are movies released on the same year are alike