In [61]:
#imports

import pandas as pd
import re
from collections import defaultdict
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import numpy as np
import gensim
import gensim.downloader as api
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
model = model = gensim.models.KeyedVectors.load_word2vec_format('../../data/w2v/6/model.txt', binary=False)
print(model.most_similar(positive=["king", "woman"], negative=["man"]))


[('queen', 0.7168769240379333), ('monarch', 0.6803387403488159), ('princess', 0.6657402515411377), ('kings', 0.6593936085700989), ('regnant', 0.6519632935523987), ('monarchs', 0.6379128098487854), ('consort', 0.6045569777488708), ('prince', 0.6032231450080872), ('throne', 0.5884857773780823), ('royal', 0.5883517861366272)]


In [63]:
# Load data
data = pd.read_csv('../../data/MovieSummaries/plot_summaries_cleaned.csv')
print(data.head())

meta_data = pd.read_csv('../../data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
meta_data.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]
print(meta_data.head())

   Movie ID                                       Plot summary
0  23890098  Shlykov hardworking taxi driver Lyosha saxopho...
1  31186339  nation Panem consists wealthy Capitol twelve p...
2  20663735  Poovalli Induchoodan sentenced six years priso...
3   2231378  Lemon Drop Kid New York City swindler illegall...
4    595909  Seventhday Adventist Church pastor Michael Cha...
   movie_id           1                                         movie_name  \
0    975900   /m/03vyhn                                     Ghosts of Mars   
1   3196793   /m/08yl5d  Getting Away with Murder: The JonBenét Ramsey ...   
2  28463795  /m/0crgdbh                                        Brun bitter   
3   9363483  /m/0285_cd                                   White Of The Eye   
4    261236   /m/01mrr1                                  A Woman in Flames   

            3           4      5                                   6  \
0  2001-08-24  14010832.0   98.0  {"/m/02h40lc": "English Language"}   
1  2000-0

In [64]:
# Load the English dictionnary to filter out non-words
valid_words = set()
with open('../../data/words.txt') as word_file:
    valid_words = set(word_file.read().split())


In [65]:
words2vec = set()

words_not_in_model = set()
# Check if all the words in the descriptions are in the model
for description in data["Plot summary"]:
    for word in description.split():
        # Remove non-words
        if word not in valid_words:
            words_not_in_model.add(word)
            continue
        if word in model:
            words2vec.add(word)
        else :
            # If the word is not in the model, we will try to change the first letter 
                word = word.capitalize()
                if word in model:
                    words2vec.add(word)
                else:
                    words_not_in_model.add(word)
print("Number of words in the model: ", len(words2vec))
print("Number of words not in the model: ", len(words_not_in_model))

print("lengh of a word2vec vector: ", len(model["king"]))

# Average vector for the words not in the model
average_vector = np.mean([model[word] for word in words2vec], axis=0)


Number of words in the model:  57317
Number of words not in the model:  162903
lengh of a word2vec vector:  300


In [66]:
# Filter out the words that are not in the model

def filter_words_not_in_model_helper(word): # We explain a bit later why me choose to filter out words not in the model
    if word in model and word in valid_words:
        return word
    elif word.lower() in model and word.lower() in valid_words:
        return word.lower()
    elif word.capitalize() in model and word.capitalize() in valid_words:
        return word.capitalize()
    else:
        return ""
def filter_words_not_in_model(s):
    return " ".join([filter_words_not_in_model_helper(word) for word in s.split()])

def filter_basic_patterns(s):
    pattern = "|".join([
    "\\d+", # Matches digits.
    r'http?://\S+|www\.\S+', # Matches url links
    ",", "\.", ":", "\(", "\)","_", "\{", "\}", "\?", "!", "&", "/", "\[", "\]", "\|", "#", "%", "\"", "\'", ";", "-", '®', 'à', '>', '<', '=', 'ü', "\*"
    ])
     # Cast uppercase letters that are not at the beginning of the string to lowercase + add a space before them
    s = re.sub(r"(?<!^)[A-Z]", lambda x: " " + x.group(0).lower(), s)
    # Cast uppercase letters that are after a point and a space to lowercase, as they are starting a new sentence
    s = re.sub(r"\. [A-Z]", lambda x: ". " + x.group(0).lower(), s)

    s = re.sub(pattern, " ", s) # replace by spaces to avoid a:b or a,b becoming ab instead of a b. 
    return s

def preprocess(s):
        s = filter_basic_patterns(s)
        s = filter_words_not_in_model(s)
        s = re.sub(r"\s+", " ", s).strip() # removing uncessary spaces
    
        return s

data["Plot summary"] = data["Plot summary"].apply(preprocess)

# Check if all the words in the descriptions are in the model
words2vec.clear()
words_not_in_model.clear()
for description in data["Plot summary"]:
    for word in description.split():
        if word in model and word in valid_words:
            words2vec.add(word)
        else :
            words_not_in_model.add(word)
print("Number of words in the model: ", len(words2vec))
print("Number of words not in the model: ", len(words_not_in_model))

# find occurences of "_" 
for word in words2vec:
    if "_" in word:
        print(word)




Number of words in the model:  57540
Number of words not in the model:  0


In [67]:
# Save the data
data.to_csv("../../data/MovieSummaries/plot_summaries_cleaned_fit_model.csv", index=False)

In [68]:
# Clustering words

X = np.array([model[word] for word in words2vec])

# Now, we will cluster the words into 10 clusters
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

# Print the 10 words closest from each cluster center
for i in range(20):
    print("Cluster", i+1)
    print([model.similar_by_vector(kmeans.cluster_centers_[i])[j][0] for j in range(10)])

Cluster 1
['troops', 'marines', 'soldiers', 'retreating', 'ambushers', 'garrison', 'enemy', 'reinforcements', 'counterattack', 'commandos']
Cluster 2
['Harrisonville', 'Centerville', 'Ashland', 'Pleasant', 'Perrysville', 'Brownsboro', 'Jimtown', 'Claysville', 'Galesville', 'Middletown']
Cluster 3
['Miller', 'Thompson', 'Bennett', 'Patterson', 'Baker', 'Smith', 'Dunn', 'Moore', 'Richardson', 'Walker']
Cluster 4
['Moonchild', 'Dreamt', 'Maldoror', 'Tzigane', 'amoureux', "d'Ulisse", 'parody', 'Erlkönig', 'Dreamhouse', 'Tenderly']
Cluster 5
['obligee', 'disallowance', 'refunding', 'tax-payers', 'withholding', 'nondiscriminatory', 'fiduciaries', 'overpayments', 'estopped', 'repayment']
Cluster 6
['interrelatedness', 'emphasizing', 'merely', 'notion', 'inherent', 'cause-effect', 'context', 'notions', 'insofar', 'conceptions']
Cluster 7
['oxidization', 'water-insoluble', 'Hydrochloric', 'arsine', 'oxygen-free', 'hydrobromic', 'borates', 'isopropanol', 'hypochlorous', 'dithionite']
Cluster 8
[

In [None]:
# First, compute the TF-IDF matrix of the preprocessed descriptions
vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())

# Count number of occurence of every word for each document
vectorized = vectorizer.fit_transform(data['Plot summary'])
print(vectorized.shape)
print(vectorizer.get_feature_names_out().shape)

word_counts = vectorized.tocsr()
print(type(word_counts), word_counts.shape)

# Delete vectorized to free memory
del vectorized

# Compute the tf and the idf 
# the tf is the number of occurences of a word in a document divided by the maximum number of occurences of a word in the document
max_occurences = 1 / word_counts.max(axis=1).toarray().flatten()
tf = word_counts.T.multiply(max_occurences).T
# perform log normalization of the tf
tf = csr_matrix(np.log1p(tf))

# the idf is the log of the number of documents divided by the number of documents containing the word
idf = np.log(word_counts.shape[0] / np.bincount(word_counts.nonzero()[1]))
# Convert the idf to a sparse matrix
idf = csr_matrix(idf)

(42303, 57540)
(57540,)
<class 'scipy.sparse._csr.csr_matrix'> (42303, 57540)


In [89]:
# Compute the tf-idf matrix 
X = tf.T.multiply(idf.T)

print(X.shape)

(57540, 42303)


In [90]:
print("Shape of the TF-IDF matrix:", X.shape)

# Build maps from word to index and from movie ID to index
word_to_index = {word: i for i, word in enumerate(vectorizer.get_feature_names_out())}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(data['Movie ID'])}

def compute_vector_movie(document, movieID):
    d_words = document.split()
  
    if len(d_words) == 0:
        return average_vector
    else:
        weighted_vec = np.array([np.mean([model[word] * X[word_to_index[word], movie_id_to_index[movieID]] for word in d_words], axis=0)]).flatten()
        return weighted_vec/np.linalg.norm(weighted_vec)

Shape of the TF-IDF matrix: (57540, 42303)


In [91]:
# Compute the vector for each movie description 

vectors = np.array([compute_vector_movie(document, movieID) for document, movieID in zip(data['Plot summary'], data['Movie ID'])])
print("Shape of the vectors matrix:", vectors.shape)



Shape of the vectors matrix: (42303, 300)


In [92]:
# Add the vectors to the dataframe
data["Vector"] = None
for i in range(vectors.shape[0]):
    data.at[i, "Vector"] = vectors[i]

In [93]:
# Load unprocessed descriptions
df_plot = pd.read_csv('../../data/MovieSummaries/plot_summaries.txt', sep="\t", header=None)
df_plot.columns = ["Movie ID", "Plot summary"]

df_plot[['Movie ID', 'Plot summary']]

Unnamed: 0,Movie ID,Plot summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [94]:
# clustering the movies
kmeans = KMeans(n_clusters=10, random_state=0).fit(vectors)

GENRES = []
# Get the 10 movies closest from each cluster center
for i in range(10):
    print("Cluster", i+1)
    # Print the 5 closest words to the cluster center
    print("Closest words : ", [model.similar_by_vector(kmeans.cluster_centers_[i])[j][0] for j in range(10)])
    # For each cluster, print the movie descriptions to see if the clustering makes sense
    for j in range(5):
        closest_movies_ids = data.iloc[np.argsort(np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1))[:5]]["Movie ID"].values
        print()
        print("Movie :", j, df_plot[df_plot["Movie ID"].isin(closest_movies_ids)]["Plot summary"].values[j])
        print()

Cluster 1
Closest words :  ['Eva', 'Rebecca', 'Isabelle', 'Julie', 'Laura', 'Seigner', 'Louise', 'Sophie', 'Luisa', 'Pamela']

Movie : 0 Luis Denard, a former concert musician, Charles Boyer, is a Nationalist, in the midst of the Spanish Civil War. He travels to England to secure supplies, where is threatened by suspicion and Fascist agents. He finds unexpected help when he meets young socialite Rose Cullen, Lauren Bacall, whose Father, Lord Benditch, Holmes Herbert, is one of the men Denard is trying to meet. Everything seems to go wrong, when he’s mugged, and laid out cold. Not knowing who to trust, he enlists the aid of the young maid Else, Wanda Hendrix. Then, he runs into Contreras, Peter Lorre, and Mrs. Melandy, Katina Paxinou, Oscar winner, in 1943's For Whom the Bell Tolls. It’s a convoluted race to the end.http://shop.tcm.com/confidential-agent-dvd/detail.php?p=356570


Movie : 1 The film presents the love stories of six couples, intertwined and developed between Christmas and

In [None]:
# Function to compute the vector of a query
def compute_vector_query(query):
    d_words = query.split()
    for word in d_words:
        if word not in model:
            d_words.remove(word)
            print("Word not in model: ", word)
    if len(d_words) == 0:
        return average_vector
    else:
        # compute tf of the query
        query_tf = defaultdict(int)
        for word in d_words:
            query_tf[word] += 1
        query_tf = {word: len(d_words) / count for word, count in query_tf.items()}
        # log normalization
        query_tf = {word: np.log1p(count) for word, count in query_tf.items()}

        
        # retreive idf of the words in the query
        query_idf =  np.array([idf.T[word_to_index[word]].todense() for word in d_words]).flatten()
        # compute the weighted vector
        weights = np.array([query_tf[word] * query_idf[i] for i, word in enumerate(d_words)])



        print("Weights : ", weights)
        print(model[d_words[0]].shape)
        weighted_vec = np.array([np.mean([model[word] * model[word] for i, word in enumerate(d_words)], axis=0)])
        return weighted_vec.flatten()

In [107]:
# We can now find the most similar movies to a given querry string : 
def document_similarity(query_vector, n=5):
    # Compute the cosine similarity between the query vector and the movies vectors
    similarities = np.array([np.dot(query_vector, data["Vector"][i].T) / (np.linalg.norm(query_vector) * np.linalg.norm(data["Vector"][i])) for i in range(data["Vector"].shape[0])])
    # Find the n most similar movies
    best_movies = np.argsort(similarities)[::-1][:n]
    return best_movies
    


# Example of query
query1 = "werewolf character"
vec1 = compute_vector_query(query1)
print(vec1)
best_movies = document_similarity(vec1)
print(best_movies)
print("Best movies for query: ", query1)
for i in document_similarity(vec1):
    # Print the name of the movie
    print(meta_data[meta_data["movie_id"] == data["Movie ID"][i]]["movie_name"].values[0])
    # Print the description of the movie
    print(df_plot[df_plot["Movie ID"] == data["Movie ID"][i]]["Plot summary"].values[0])





Query TF:  {'werewolf': 2.0, 'character': 2.0}
Weights :  [6.4434946  3.63409325]
(300,)
[6.55939896e-03 1.20342949e-04 9.66016203e-04 8.48920271e-03
 4.71307756e-03 7.04863342e-04 8.00869195e-04 5.57890045e-04
 1.99485733e-03 2.39042006e-03 2.25528609e-03 5.59701351e-03
 2.64238869e-03 4.65249992e-04 4.53990244e-04 1.21604571e-04
 8.91484227e-03 1.05312299e-02 2.27132114e-04 3.20314546e-03
 1.38001703e-03 1.00780034e-03 1.78448646e-03 9.25637875e-03
 5.52359910e-04 4.51600598e-03 2.52302829e-03 1.94830715e-03
 1.06864900e-03 1.57552236e-03 2.00316962e-03 1.93127373e-03
 7.68548169e-04 4.38491814e-03 5.56705613e-03 2.04658369e-03
 1.14275736e-03 1.88288942e-03 3.67032015e-03 1.39123690e-03
 1.46326434e-04 2.83595407e-03 3.59921157e-03 3.50383483e-03
 1.72204268e-03 2.41535730e-04 1.77764334e-03 5.05242031e-03
 1.99029848e-04 5.50948782e-04 2.23658935e-04 9.66711668e-04
 1.18781226e-02 1.15526477e-02 1.96652114e-03 5.65319695e-03
 3.57843284e-03 1.48645748e-04 2.36871000e-03 7.96386320e

In [97]:

## Unuseful code

# extract the genres from the genre list json
genre_list = re.findall(r'\"(.*?)\"', genre_list_string)
for genre in genre_list:
    # remove entries containing a / as they are not genres
    if "/" not in genre:
        if genre in GENRES:
            genre_hist[GENRES.index(genre)] += 1
        else:
            GENRES.append(genre)
            genre_hist = np.append(genre_hist, 1)
        



NameError: name 'genre_list_string' is not defined