In [1]:
#imports

import pandas as pd
import re
from collections import defaultdict
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import numpy as np
import gensim
import gensim.downloader as api
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
model = model = gensim.models.KeyedVectors.load_word2vec_format('../../data/w2v/6/model.txt', binary=False)
print(model.most_similar(positive=["king", "woman"], negative=["man"]))


[('queen', 0.7168769240379333), ('monarch', 0.6803387403488159), ('princess', 0.6657402515411377), ('kings', 0.6593936085700989), ('regnant', 0.6519632935523987), ('monarchs', 0.6379128098487854), ('consort', 0.6045569777488708), ('prince', 0.6032231450080872), ('throne', 0.5884857773780823), ('royal', 0.5883517861366272)]


In [3]:
# Load data
data = pd.read_csv('../../data/MovieSummaries/plot_summaries_cleaned.csv')
data.columns = ["movie_id", "plot"]
print(data.head())
print(data.shape)

meta_data = pd.read_csv('../../data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
meta_data.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]
print(meta_data.head())

   movie_id                                               plot
0  23890098  Shlykov hardworking taxi driver Lyosha saxopho...
1  31186339  nation Panem consists wealthy Capitol twelve p...
2  20663735  Poovalli Induchoodan sentenced six years priso...
3   2231378  Lemon Drop Kid New York City swindler illegall...
4    595909  Seventhday Adventist Church pastor Michael Cha...
(42303, 2)
   movie_id           1                                         movie_name  \
0    975900   /m/03vyhn                                     Ghosts of Mars   
1   3196793   /m/08yl5d  Getting Away with Murder: The JonBenét Ramsey ...   
2  28463795  /m/0crgdbh                                        Brun bitter   
3   9363483  /m/0285_cd                                   White Of The Eye   
4    261236   /m/01mrr1                                  A Woman in Flames   

            3           4      5                                   6  \
0  2001-08-24  14010832.0   98.0  {"/m/02h40lc": "English Language"}  

In [4]:
# Load the English dictionnary to filter out non-words
valid_words = set()
with open('../../data/words.txt') as word_file:
    valid_words = set(word_file.read().split())


In [5]:
# Drop rows where the plot summary is missing or shorter than 50 words
data = data[data["plot"].apply(lambda x: len(x.split()) > 50)]
print(data.head())
print(data.shape)

# Same for the metadata
meta_data = meta_data[meta_data["movie_id"].isin(data["movie_id"])]
print(meta_data.shape)


   movie_id                                               plot
1  31186339  nation Panem consists wealthy Capitol twelve p...
2  20663735  Poovalli Induchoodan sentenced six years priso...
3   2231378  Lemon Drop Kid New York City swindler illegall...
4    595909  Seventhday Adventist Church pastor Michael Cha...
5   5272176  president way give speech traveling man shows ...
(31137, 2)
(31070, 9)


In [6]:
words2vec = set()

words_not_in_model = set()
# Check if all the words in the descriptions are in the model
for description in data["plot"]:
    for word in description.split():
        # Remove non-words
        if word not in valid_words:
            words_not_in_model.add(word)
            continue
        if word in model:
            words2vec.add(word)
        else :
            # If the word is not in the model, we will try to change the first letter 
                word = word.capitalize()
                if word in model:
                    words2vec.add(word)
                else:
                    words_not_in_model.add(word)
print("Number of words in the model: ", len(words2vec))
print("Number of words not in the model: ", len(words_not_in_model))

print("lengh of a word2vec vector: ", len(model["king"]))

# Average vector for the words not in the model
average_vector = np.mean([model[word] for word in words2vec], axis=0)


Number of words in the model:  56423
Number of words not in the model:  152184
lengh of a word2vec vector:  300


In [7]:
# Filter out the words that are not in the model

def filter_words_not_in_model_helper(word): # We explain a bit later why me choose to filter out words not in the model
    if word in model and word in valid_words:
        return word
    elif word.lower() in model and word.lower() in valid_words:
        return word.lower()
    elif word.capitalize() in model and word.capitalize() in valid_words:
        return word.capitalize()
    else:
        return ""
def filter_words_not_in_model(s):
    return " ".join([filter_words_not_in_model_helper(word) for word in s.split()])

def filter_basic_patterns(s):
    pattern = "|".join([
    "\\d+", # Matches digits.
    r'http?://\S+|www\.\S+', # Matches url links
    ",", "\.", ":", "\(", "\)","_", "\{", "\}", "\?", "!", "&", "/", "\[", "\]", "\|", "#", "%", "\"", "\'", ";", "-", '®', 'à', '>', '<', '=', 'ü', "\*"
    ])
     # Cast uppercase letters that are not at the beginning of the string to lowercase + add a space before them
    s = re.sub(r"(?<!^)[A-Z]", lambda x: " " + x.group(0).lower(), s)
    # Cast uppercase letters that are after a point and a space to lowercase, as they are starting a new sentence
    s = re.sub(r"\. [A-Z]", lambda x: ". " + x.group(0).lower(), s)

    s = re.sub(pattern, " ", s) # replace by spaces to avoid a:b or a,b becoming ab instead of a b. 
    return s

def preprocess(s):
        s = filter_basic_patterns(s)
        s = filter_words_not_in_model(s)
        s = re.sub(r"\s+", " ", s).strip() # removing uncessary spaces
    
        return s

data["plot"] = data["plot"].apply(preprocess)

# Check if all the words in the descriptions are in the model
words2vec.clear()
words_not_in_model.clear()
word_count = [] # count the number of words in each description
for description in data["plot"]:
    for word in description.split():
        if word in model and word in valid_words:
            words2vec.add(word)
        else :
            words_not_in_model.add(word)

print("Number of words in the model: ", len(words2vec))
print("Number of words not in the model: ", len(words_not_in_model))

# find occurences of "_" 
for word in words2vec:
    if "_" in word:
        print(word)




Number of words in the model:  56700
Number of words not in the model:  0


In [8]:
# Save the data
data.to_csv("../../data/MovieSummaries/plot_summaries_cleaned_fit_model.csv", index=False)

In [9]:
# Clustering words

X = np.array([model[word] for word in words2vec])

# Now, we will cluster the words into 10 clusters
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

# Print the 10 words closest from each cluster center
for i in range(20):
    print("Cluster", i+1)
    print([model.similar_by_vector(kmeans.cluster_centers_[i])[j][0] for j in range(10)])

Cluster 1
['rebels', 'reprisals', 'fleeing', 'troops', 'massacred', 'insurrection', 'loyalists', 'deserters', 'retaliation', 'revolted']
Cluster 2
['injury-riddled', 'top-six', '26-game', '.393', 'goal-less', '22-game', 'league-record', '30-game', '21-game', 'homeruns']
Cluster 3
['hillside', 'streambeds', 'Low-lying', 'hillsides', 'dry-stone', 'steep-walled', 'shoreline', 'tree-covered', 'foreshores', 'well-wooded']
Cluster 4
['Julie', 'Rebecca', 'Brenda', 'Katie', 'Maggie', 'Renee', 'Pamela', 'Jenny', 'Amy', 'Kate']
Cluster 5
['xerostomia', 'hypopituitarism', 'overactivity', 'vasospasm', 'cholecystitis', 'bronchiectasis', 'esophagitis', 'pyogenic', 'coagulopathy', 'pyelonephritis']
Cluster 6
['parody', 'Hypnotized', 'Dreamhouse', 'Cahoots', 'Moanin', 'Moonchild', 'Humoresque', 'Troublemakers', 'Charnin', 'parodies']
Cluster 7
['confronts', 'realizes', 'pretending', 'discovers', 'pretends', 'learns', 'thinks', 'kidnaps', 'confesses', 'seduces']
Cluster 8
['liferafts', 'ship', 'ships',

In [10]:
# First, compute the TF-IDF matrix of the preprocessed descriptions
vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())

# Count number of occurence of every word for each document
vectorized = vectorizer.fit_transform(data['plot'])
print(vectorized.shape)
print(vectorizer.get_feature_names_out().shape)

word_counts = vectorized.tocsr()
print(type(word_counts), word_counts.shape)

# Delete vectorized to free memory
del vectorized

# Compute the tf and the idf 
# the tf is the number of occurences of a word in a document divided by the maximum number of occurences of a word in the document
max_occurences = 1 / word_counts.max(axis=1).toarray().flatten()
tf = word_counts.T.multiply(max_occurences).T
# perform log normalization of the tf
tf = csr_matrix(np.log1p(tf))

# the idf is the log of the number of documents divided by the number of documents containing the word
idf = np.log(word_counts.shape[0] / np.bincount(word_counts.nonzero()[1]))
# Convert the idf to a sparse matrix
idf = csr_matrix(idf)



(31137, 56700)
(56700,)
<class 'scipy.sparse._csr.csr_matrix'> (31137, 56700)


In [11]:
# Compute the tf-idf matrix 
X = tf.T.multiply(idf.T)

print(X.shape)

(56700, 31137)


In [12]:
print("Shape of the TF-IDF matrix:", X.shape)

# Build maps from word to index and from movie ID to index
word_to_index = {word: i for i, word in enumerate(vectorizer.get_feature_names_out())}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(data['movie_id'])}

def compute_vector_movie(document, movieID):
    d_words = document.split()
  
    if len(d_words) == 0:
        return average_vector
    else:
        weighted_vec = np.array([np.mean([model[word] * X[word_to_index[word], movie_id_to_index[movieID]] for word in d_words], axis=0)]).flatten()
        return weighted_vec/np.linalg.norm(weighted_vec)

Shape of the TF-IDF matrix: (56700, 31137)


In [13]:
# Compute the vector for each movie description 

vectors = np.array([compute_vector_movie(document, movieID) for document, movieID in zip(data['plot'], data['movie_id'])])
print("Shape of the vectors matrix:", vectors.shape)



Shape of the vectors matrix: (31137, 300)


In [14]:
# Add the vectors to the dataframe
data['vector'] = [v for v in vectors]

In [15]:
# Load unprocessed descriptions
df_plot = pd.read_csv('../../data/MovieSummaries/plot_summaries.txt', sep="\t", header=None)
df_plot.columns = ["movie_id", "plot"]

df_plot[['movie_id', 'plot']]

Unnamed: 0,movie_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [16]:
# clustering the movies
kmeans = KMeans(n_clusters=10, random_state=0).fit(vectors)

GENRES = []
# Get the 10 movies closest from each cluster center
for i in range(10):
    print("Cluster", i+1)
    # Print the 5 closest words to the cluster center
    print("Closest words : ", [model.similar_by_vector(kmeans.cluster_centers_[i])[j][0] for j in range(10)])
    # For each cluster, print the movie descriptions to see if the clustering makes sense
    for j in range(5):
        closest_movies_ids = data.iloc[np.argsort(np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1))[:5]]["movie_id"].values
        print()
        print("Movie :", j, df_plot[df_plot["movie_id"].isin(closest_movies_ids)]["plot"].values[j])
        print()

Cluster 1
Closest words :  ['Wheezer', 'Maggie', 'Tom', 'Porky', 'Mickey', 'Grandpa', 'Jess', 'Jerry', 'Neddie', 'Molly']

Movie : 0 In the town of Derry, Maine, six-year old Georgie Denbrough is lured to a storm drain when his paper boat drops down into it. He encounters a seemingly friendly clown who calls himself "Pennywise the Dancing Clown." Pennywise tempts Georgie closer with promises of candy and balloons, exclaiming that they all float down into the sewer. He says that Georgie can float too  before revealing his true nature, grabbing him and viciously tearing his right arm off. His older brother Bill is overwhelmed with guilt for inadvertently sending Georgie to his death and is terrorized when his picture comes to life, giving him a stutter from the trauma. He befriends several similar "misfits." One is Ben Hanscom, a loyal, determined, overweight and ingenious builder from a home broken after the death of his military father. He is terrorized by visions of his father trying 

In [17]:
# Function to compute the vector of a query
def compute_vector_query(query):
    d_words = query.split()
    for word in d_words:
        if word not in model:
            d_words.remove(word)
            print("Word not in model: ", word)
    if len(d_words) == 0:
        return average_vector
    else:
        vec = np.mean([model[word] for word in d_words], axis=0)
       
        return vec/np.linalg.norm(vec)

In [18]:
# We can now find the most similar movies to a given querry string : 
def document_similarity(query_vector, n=5):
    # Compute the cosine similarity between the query vector and the movies vectors
    similarities = np.dot(vectors, query_vector) / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vector))
    # Find the n most similar movies
    best_movies = np.argsort(similarities)[::-1][:n]
    return best_movies
    


# Example of query
query1 = "climate"
vec1 = compute_vector_query(query1)
best_movies = document_similarity(vec1)
print(best_movies)
print("Best movies for query: ", query1)
for i in document_similarity(vec1):
    # Print the name of the movie
    print(meta_data[meta_data["movie_id"] == data["movie_id"][i]]["movie_name"].values[0])
    # Print the description of the movie
    print(df_plot[df_plot["movie_id"] == data["movie_id"][i]]["plot"].values[0])





[  251 28178 13243 26746  6775]
Best movies for query:  climate
Mela
The only brother ([[Ayub Khan  of a young woman, Roopa , returns to Chandanpur to arrange her marriage. A festival is arranged in the happiness of Roopa's marriage, however Chandanpur's happiness is short-lived, as the village is raided by a group of terrorists. The leader of the terrorists, Gujjar, , murders the politician, and to make matters worse,Roopa's beauty catches the eye of Gujjar. However, as Gujjar attempts to escape with the terrified Roopa, her brother comes to her rescue, only to be slain, much to the horror of the village, especially Roopa. Also, her best friend, Gopal , is killed too, much to the dismay of Gopal's mother . Roopa, enraged by the fact that her brother and Gopal are no more, vows avenge. Gujjar threatens Roopa that she will only be his mistress and will never be able to have a brother nor lover. Enraged, Roopa attempts to commit suicide by jumping into a waterfall as she finds it better 

KeyError: 28178

In [None]:

## Unuseful code

# extract the genres from the genre list json
genre_list = re.findall(r'\"(.*?)\"', genre_list_string)
for genre in genre_list:
    # remove entries containing a / as they are not genres
    if "/" not in genre:
        if genre in GENRES:
            genre_hist[GENRES.index(genre)] += 1
        else:
            GENRES.append(genre)
            genre_hist = np.append(genre_hist, 1)
        



NameError: name 'genre_list_string' is not defined