In [1]:
# Importing the necessary libraries
import pandas as pd
import re
from collections import defaultdict
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import numpy as np
import gensim
import gensim.downloader as api
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Import code from other files
from model import model as m

In [3]:
# Fit the model: load the corpus and the word2vec model, then build the tf-idf matrix.
# Every name bound here is notebook-global state; `tf_idf`, `w2v_model`, `word_to_index`,
# `movie_id_to_index` and `data` are consumed by the movie-vector cell below.

# Get data and model
data, meta_data, valid_words = m.get_data()
w2v_model = m.get_model()
# Preprocess the data
# NOTE: `data` is rebound here, so the object returned by m.get_data() is no longer
# reachable afterwards. `word2vec` is not used by any cell visible in this notebook.
data, word2vec = m.w2v_preprocessing(data, valid_words, w2v_model)
# Get the vectorized wordcount data
# NOTE(review): presumably X_sparse is a movies x words count matrix and the two dicts map
# words / movie ids to its columns / rows -- confirm against model.py.
X_sparse, word_to_index, movie_id_to_index = m.get_vectorized_data(data)
# Get the tf-idf matrix
# NOTE(review): the recorded output shows `np.true_divide(self.todense(), other)`, which
# suggests a sparse matrix is densified somewhere in these helpers -- worth checking the
# memory cost for a matrix of the printed shape.
tf_idf = m.compute_tf_idf(X_sparse)
print(tf_idf.shape)


Number of words in the model:  39291
Number of words not in the model (should be 0 now):  0


  return np.true_divide(self.todense(), other)


(42303, 39290)


In [4]:
# Get the movie vectors
# Combines the tf-idf matrix with the word2vec model through the project helper.
# NOTE(review): presumably each movie vector is a tf-idf-weighted combination of its word
# embeddings -- confirm in model.py. The clustering cell treats the result as a 2-D array
# with one row per movie (it takes norms along axis=1 and uses row positions with
# `data.iloc`), so the row order is assumed to match the row order of `data`.
movie_vectors = m.compute_movie_vectors(tf_idf, w2v_model, word_to_index, movie_id_to_index, data)

In [13]:
# Cluster the movie vectors with k-means, then inspect each cluster through the words and
# the plot summaries that lie closest to its center.

N_CLUSTERS = 10        # number of k-means clusters
N_CLOSEST_WORDS = 10   # vocabulary words printed per cluster center
N_CLOSEST_MOVIES = 5   # plot summaries printed per cluster center

# The plot-summary file is tab-separated and has no header row, so name the columns here.
df_plot = pd.read_csv(
    '../data/MovieSummaries/plot_summaries.txt',
    sep="\t",
    header=None,
    names=["Movie ID", "Plot_summary"],
)
# Lookup table movie id -> plot summary. Repeated ids are collapsed to their first
# occurrence so that the `reindex` below is well defined.
plot_by_id = df_plot.drop_duplicates("Movie ID").set_index("Movie ID")["Plot_summary"]

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(movie_vectors)

for i, center in enumerate(kmeans.cluster_centers_):
    print("Cluster", i+1)
    # Print the words closest to the cluster center. The model is queried once per
    # cluster instead of once per printed word.
    closest_words = [hit[0] for hit in w2v_model.similar_by_vector(center, topn=N_CLOSEST_WORDS)]
    print("Closest words : ", closest_words)
    # Row positions of the movies nearest to the center (Euclidean distance), nearest first.
    # NOTE(review): `data.iloc` assumes row k of `movie_vectors` describes row k of `data`
    # -- confirm against m.compute_movie_vectors.
    distances = np.linalg.norm(movie_vectors - center, axis=1)
    closest_rows = np.argsort(distances)[:N_CLOSEST_MOVIES]
    closest_movies_ids = data.iloc[closest_rows]["Wikipedia_movie_ID"].values
    # Look the summaries up *in distance order*. Filtering `df_plot` with `isin` would
    # return them in file order and fail if one of the ids had no summary at all.
    closest_summaries = plot_by_id.reindex(closest_movies_ids).fillna("<no plot summary available>")
    # For each cluster, print the movie descriptions to see if the clustering makes sense
    for j, summary in enumerate(closest_summaries.values):
        print()
        print("Movie :", j, summary)
        print()

Cluster 1
Closest words :  ['pretending', 'pretend', 'thinks', 'telling', 'knowing', 'two-timing', 'something', 'terrified', 'afraid', 'indeed']

Movie : 0 A colorful family from a small Texas town must come to grips with the accidental death of the elderly family matriarch during a clandestine meeting in a seedy motel room with her much younger, married neighbor. The woman's family must deal with their own demons while preparing for what could be an embarrassing funeral.


Movie : 1 Priya  lost her parents when she was a kid and the will decried that her wealth would go to an orphanage if something happened to her. Ramanathan ( has cleverly positioned himself as an affectionate uncle to his niece Priya and planned to kill her with help of Raja  and Ranjith , a couple of petty thieves. Raja naturally falls in love with Priya. And the rest of story is how he protects her from Ramanathan.


Movie : 2 Ajay Sharma is a very successful entrepreneur who has now transformed into a workaholic 

Overall, we see that some clusters are not meaningful. In particular, cluster 7 is full of movies with no actual description: the plot summary is nothing but a list of the characters and the actors. This is reflected in the words closest to the cluster center: 'role', 'roles', 'rôle'.

Additionally, some movies with very short or missing descriptions appear near multiple cluster centers. This could indicate that a large share of the movies have very short or missing descriptions and thus form one large cluster near the origin, while the remaining movies are dispersed in a vector space too large for them to form actual clusters.

This could be solved with an additional preprocessing step that ignores movies without a plot description, or by using another way of extracting meaning from our movie vectors that does not rely on clustering, such as a query mechanism. We will investigate these options for Milestone 3.