In [1]:
#imports

import pandas as pd
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import numpy as np
import gensim
import gensim.downloader as api
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
model = model = gensim.models.KeyedVectors.load_word2vec_format('../../data/w2v/6/model.txt', binary=False)
print(model.most_similar(positive=["king", "woman"], negative=["man"]))



[('queen', 0.7168769240379333), ('monarch', 0.6803387403488159), ('princess', 0.6657402515411377), ('kings', 0.6593936085700989), ('regnant', 0.6519632935523987), ('monarchs', 0.6379128098487854), ('consort', 0.6045569777488708), ('prince', 0.6032231450080872), ('throne', 0.5884857773780823), ('royal', 0.5883517861366272)]


In [3]:
# Load data
data = pd.read_csv('../../data/MovieSummaries/plot_summaries_cleaned.csv')
print(data.head())

meta_data = pd.read_csv('../../data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
meta_data.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]
print(meta_data.head())

   Movie ID                                       Plot summary
0  23890098  Shlykov hardworking taxi driver Lyosha saxopho...
1  31186339  nation Panem consists wealthy Capitol twelve p...
2  20663735  Poovalli Induchoodan sentenced six years priso...
3   2231378  Lemon Drop Kid New York City swindler illegall...
4    595909  Seventhday Adventist Church pastor Michael Cha...
   movie_id           1                                         movie_name  \
0    975900   /m/03vyhn                                     Ghosts of Mars   
1   3196793   /m/08yl5d  Getting Away with Murder: The JonBenét Ramsey ...   
2  28463795  /m/0crgdbh                                        Brun bitter   
3   9363483  /m/0285_cd                                   White Of The Eye   
4    261236   /m/01mrr1                                  A Woman in Flames   

            3           4      5                                   6  \
0  2001-08-24  14010832.0   98.0  {"/m/02h40lc": "English Language"}   
1  2000-0

In [4]:
words2vec = set()

words_not_in_model = set()
# Check if all the words in the descriptions are in the model
for description in data["Plot summary"]:
    for word in description.split():
        if word in model:
            words2vec.add(word)
        else :
            # If the word is not in the model, we will try to change the first letter 
                word = word.capitalize()
                if word in model:
                    words2vec.add(word)
                else:
                    words_not_in_model.add(word)
print("Number of words in the model: ", len(words2vec))
print("Number of words not in the model: ", len(words_not_in_model))

print("lengh of a word2vec vector: ", len(model["king"]))

# Average vector for the words not in the model
average_vector = np.mean([model[word] for word in words2vec], axis=0)


Number of words in the model:  103336
Number of words not in the model:  112603
lengh of a word2vec vector:  300


In [5]:
# Filter out the words that are not in the model


def filter_words_not_in_model_helper(word): # We explain a bit later why me choose to filter out words not in the model
    if word in model:
        return word
    elif word.lower() in model:
        return word.lower()
    elif word.capitalize() in model:
        return word.capitalize()
    else:
        return ""
def filter_words_not_in_model(s):
    return " ".join([filter_words_not_in_model_helper(word) for word in s.split()])

def filter_basic_patterns(s):
    pattern = "|".join([
    "\\d+", # Matches digits.
    r'http?://\S+|www\.\S+', # Matches url links
    ",", "\.", ":", "\(", "\)","_", "\{", "\}", "\?", "!", "&", "/", "\[", "\]", "\|", "#", "%", "\"", "\'", ";", "-", '®', 'à', '>', '<', '=', 'ü', "\*"
    ])
     # Cast uppercase letters that are not at the beginning of the string to lowercase + add a space before them
    s = re.sub(r"(?<!^)[A-Z]", lambda x: " " + x.group(0).lower(), s)
    # Cast uppercase letters that are after a point and a space to lowercase, as they are starting a new sentence
    s = re.sub(r"\. [A-Z]", lambda x: ". " + x.group(0).lower(), s)

    s = re.sub(pattern, " ", s) # replace by spaces to avoid a:b or a,b becoming ab instead of a b. 
    return s

def preprocess(s):
        s = filter_basic_patterns(s)
        s = filter_words_not_in_model(s)
        s = re.sub(r"\s+", " ", s).strip() # removing uncessary spaces
    
        return s

data["Plot summary"] = data["Plot summary"].apply(preprocess)

# Check if all the words in the descriptions are in the model
words2vec.clear()
words_not_in_model.clear()
for description in data["Plot summary"]:
    for word in description.split():
        if word in model:
            words2vec.add(word)
        else :
            words_not_in_model.add(word)
print("Number of words in the model: ", len(words2vec))
print("Number of words not in the model: ", len(words_not_in_model))

# find occurences of "_" 
for word in words2vec:
    if "_" in word:
        print(word)




Number of words in the model:  87637
Number of words not in the model:  0


In [6]:
# Save the data
data.to_csv("../../data/MovieSummaries/plot_summaries_cleaned_fit_model.csv", index=False)

In [7]:
# Clustering words

X = np.array([model[word] for word in words2vec])

# Now, we will cluster the words into 10 clusters
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

# Print the 10 words closest from each cluster center
for i in range(20):
    print("Cluster", i+1)
    print([model.similar_by_vector(kmeans.cluster_centers_[i])[j][0] for j in range(10)])

Cluster 1
['sibyl', 'Liberalis', 'elohim', 'gods', 'Cornutus', 'pious', 'Ēl', 'divine', 'daimon', 'numen']
Cluster 2
['Suleymanov', 'Rovshan', 'Seyid', 'Afandi', 'Aghasi', 'Nurettin', 'Mustafa', 'Ibrahim', 'Ismayil', 'Akhmed']
Cluster 3
['drawstring', 'fastened', 'grommets', 'buttonholes', 'footrest', 'grater', 'unscrewed', 'unscrew', 'crossguard', 'straps']
Cluster 4
['confronts', 'escapes', 'discovers', 'kidnaps', 'realizes', 'flees', 'terrifies', 'minions', 'learns', 'pretending']
Cluster 5
['Marischka', 'Thimig', 'Schönherr', 'Werner', 'Georg', 'Taubert', 'Heesters', 'Hans', 'Stöhr', 'Ludewig']
Cluster 6
['acetosella', 'crowberry', 'hawthorns', 'sapote', 'triandra', 'Ceratophyllus', 'Alternanthera', 'endive', 'Sporobolus', 'Ageratina']
Cluster 7
['Afroman', 'Hypnotized', 'Astro-man', "C'Mon", 'Ultramagnetic', 'Troublemakers', 'Daydreamer', 'Bugged', 'Aint', 'Dreamhouse']
Cluster 8
['Chavarría', 'Luengo', 'José', 'Blázquez', 'Jesús', 'María', 'Seoane', 'Dávalos', 'Feliú', 'Villalón'

In [8]:
# First, compute the TF-IDF matrix of the preprocessed descriptions
vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())

# Count number of occurence of every word for each document
vectorized = vectorizer.fit_transform(data['Plot summary'])
print(vectorized.shape)
print(vectorizer.get_feature_names_out().shape)

word_counts = vectorized.tocsr()
print(type(word_counts), word_counts.shape)

# Delete vectorized to free memory
del vectorized

# Compute the tf and the idf 
# the tf is the number of occurences of a word in a document divided by the total number of words in the document
tf = word_counts / word_counts.sum(axis=1)
# perform log normalization of the tf
tf = csr_matrix(np.log1p(tf))

# the idf is the log of the inverse of the number of documents containing the word

idf = np.log(len(data) / (word_counts != 0).sum(axis=0))
# Convert the idf to a sparse matrix
idf = csr_matrix(idf)




(42303, 87637)
(87637,)
<class 'scipy.sparse._csr.csr_matrix'> (42303, 87637)


In [9]:
# Compute the tf-idf matrix 
print(tf.T.shape, idf.T.shape)
X = tf.T.multiply(idf.T)

print(X.shape)

(87637, 42303) (87637, 1)
(87637, 42303)


In [14]:
print("Shape of the TF-IDF matrix:", X.shape)

# Build maps from word to index and from movie ID to index
word_to_index = {word: i for i, word in enumerate(vectorizer.get_feature_names_out())}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(data['Movie ID'])}

def compute_vector_movie(document, movieID):
    d_words = document.split()
  
    if len(d_words) == 0:
        return average_vector
    else:
        weighted_vec = np.array([np.mean([model[word] * X[word_to_index[word], movie_id_to_index[movieID]] for word in d_words], axis=0)]).flatten()
        return weighted_vec/np.linalg.norm(weighted_vec)

Shape of the TF-IDF matrix: (87637, 42303)


In [15]:
# Compute the vector for each movie description 

vectors = np.array([compute_vector_movie(document, movieID) for document, movieID in zip(data['Plot summary'], data['Movie ID'])])

print("Shape of the vectors matrix:", vectors.shape)


Shape of the vectors matrix: (42303, 300)


In [16]:
# Load unprocessed descriptions
df_plot = pd.read_csv('../../data/MovieSummaries/plot_summaries.txt', sep="\t", header=None)
df_plot.columns = ["Movie ID", "Plot summary"]

df_plot[['Movie ID', 'Plot summary']]

Unnamed: 0,Movie ID,Plot summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [20]:
# clustering the movies
kmeans = KMeans(n_clusters=10, random_state=0).fit(vectors)

GENRES = []
# Get the 10 movies closest from each cluster center
for i in range(10):
    print("Cluster", i+1)
    # Print the 5 closest words to the cluster center
    print("Closest words : ", [model.similar_by_vector(kmeans.cluster_centers_[i])[j][0] for j in range(10)])
    # For each cluster, print the movie descriptions to see if the clustering makes sense
    for j in range(5):
        closest_movies_ids = data.iloc[np.argsort(np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1))[:5]]["Movie ID"].values
        print()
        print("Movie :", j, df_plot[df_plot["Movie ID"].isin(closest_movies_ids)]["Plot summary"].values[j])
        

    

    


Cluster 1
Closest words :  ['Alcoriza', 'Seigner', 'Luisa', 'Estefania', 'Béart', 'Vanina', 'Maricruz', 'Marie-Laure', 'Guiomar', 'Graziella']

Movie : 0 The film is set in the 1950s in a large country residence, as a family and its servants are preparing for Christmas, when the master of the house is discovered dead in his bed, with a dagger stuck into his back. The murderer must be one of the eight women in the house at the time, and in the course of the investigations each has a tale to tell and secrets to hide. The scene opens with Suzon returning from school for Christmas break, finding her mother Gaby, her younger sister Catherine, and her wheelchair-bound grandmother Mamy in the living room, where most of the action of the film takes place. Their conversation drifts to the subject of the patriarch of the family, and Catherine leads the first song of the film, "Papa t'es plus dans le coup" . The singing wakes up Suzon and Catherine's aunt Augustine, who picks arguments with the r

In [None]:
print(len(GENRES))

0


In [None]:

## Unuseful code

# extract the genres from the genre list json
genre_list = re.findall(r'\"(.*?)\"', genre_list_string)
for genre in genre_list:
    # remove entries containing a / as they are not genres
    if "/" not in genre:
        if genre in GENRES:
            genre_hist[GENRES.index(genre)] += 1
        else:
            GENRES.append(genre)
            genre_hist = np.append(genre_hist, 1)
        

