In [59]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine


# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-Large-cased')
model = BertModel.from_pretrained('bert-large-cased')

# Function to tokenize and encode text
def encode(text, max_length=512):
    # Subtract 2 for [CLS] and [SEP] tokens
    if len(text) == 0:
        print("Empty text")  # Debugging
    
    max_length -= 2
    tokens = tokenizer.tokenize(text)
    if len(tokens) == 0:
        print("Empty tokens")  # Debugging
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    if not chunks:  # Check if chunks are empty
        print(f"No chunks for text: {text}")  # Debugging

    # Process each chunk
    chunk_embeddings = []
    for chunk in chunks:
        # Add special tokens
        chunk = ['[CLS]'] + chunk + ['[SEP]']
        input_ids = tokenizer.convert_tokens_to_ids(chunk)
        input_tensor = torch.tensor([input_ids]).to('cuda' if torch.cuda.is_available() else 'cpu')
        with torch.no_grad():
            last_hidden_states = model(input_tensor)[0]  # Get the embeddings
        chunk_embeddings.append(last_hidden_states[0].mean(dim=0))

    # Aggregate the embeddings from each chunk (mean pooling here)
    embeddings = torch.mean(torch.stack(chunk_embeddings), dim=0)
    return embeddings

# Function to calculate cosine similarity manually
def calculate_similarity(embedding1, embedding2):
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity

# Read datasets
events = pd.read_csv('data/events.csv')
#convert the year column to int
events['Year'] = events['Year'].astype(int)

# Tokenize, encode, and get embeddings
events['Embeddings'] = events['Event Description'].apply(lambda x: encode(x).tolist() if pd.notnull(x) else None)



In [60]:
#save the embeddings of events as a csv file
events.to_csv('data/events_embeddings.csv', index=False)

In [61]:
import ast


events_embedded = pd.read_csv('data/events_embeddings.csv')
# Function to convert string representation back to list
def string_to_list(string):
    try:
        return torch.tensor(ast.literal_eval(string))

    except ValueError:
        return None  # or an appropriate default value

# Apply the function to each row
events_embedded['Embeddings'] = events_embedded['Embeddings'].apply(string_to_list)

len(events_embedded['Embeddings'][0])


1024

In [62]:
#read tsv file and add headers
movie_metadata_df = pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, 
                names=['wiki_movie_id', 
                        'freebase_movie_id', 
                        'movie_name', 
                        'movie_release_date', 
                        'movie_box_office_revenue', 
                        'movie_runtime', 
                        'movie_languages', 
                        'movie_countries', 
                        'movie_genres'])

#changing the values of outliers
movie_metadata_df.loc[movie_metadata_df['movie_name'] == 'Zero Tolerance', 'movie_runtime'] = 88
movie_metadata_df.loc[movie_metadata_df['movie_name'] == 'Hunting Season', 'movie_release_date'] = '2010-12-02'

#add realase_year 
movie_metadata_df['startYear']= movie_metadata_df['movie_release_date'].str[:4]

#change movie_release_date to pandas datetime
movie_metadata_df['movie_release_date'] = pd.to_datetime(movie_metadata_df['movie_release_date'], format='%Y-%m-%d', errors='coerce')

#load IMDB reviews
rating_id_df = pd.read_csv('data/rating_id.tsv',  sep='\t')
name_id_df = pd.read_csv('data/name_id.tsv',  sep='\t')
rating_df = pd.merge(rating_id_df, name_id_df, on='tconst')

#drop unnecessary columns 
rating_df.drop(['originalTitle','isAdult','endYear','runtimeMinutes','genres'], axis=1, inplace=True)

#loading the plot summaries dataset and add headers
plot_summaries_df = pd.read_csv('data/plot_summaries.txt', sep='\t', header=None, 
                names=['wiki_movie_id', 
                        'plot_summary'])
#merging the movie metadata with the rating data on movie name and release year
movies_ratings = pd.merge(movie_metadata_df, rating_df,  on=['movie_name', 'startYear'])
movies_ratings.shape

# printing the types of the merged data 
movies_ratings['titleType'].unique()

#remove any {{ }} from the plot summary text
plot_summaries_df['plot_summary'] = plot_summaries_df['plot_summary'].str.replace(r'\{\{.*?\}\}', '', regex=True)

# remove all summaries with length = 0
plot_summaries_df = plot_summaries_df[plot_summaries_df['plot_summary'].str.len() > 0]

33551961
# keeping only movies, delete tv episodes, tv movies, video games, etc.
movies_ratings = movies_ratings[movies_ratings['titleType']=='movie']


# only keep the movies with more than 100 votes on imdb ratings
movies_ratings = movies_ratings[movies_ratings['numVotes']>200]
movies_ratings.shape

#keep movie_metadata_df only with movies that have ratings
movie_metadata_df = movie_metadata_df[movie_metadata_df['freebase_movie_id'].isin(movies_ratings['freebase_movie_id'])]
movie_metadata_df.shape

#keep the summaries of the selected movies 
plot_summaries_df = plot_summaries_df[plot_summaries_df['wiki_movie_id'].isin(movie_metadata_df['wiki_movie_id'])]
plot_summaries_df.shape

#keep movie_metadata_df only with movies that have summaries
movie_metadata_df = movie_metadata_df[movie_metadata_df['wiki_movie_id'].isin(plot_summaries_df['wiki_movie_id'])]
movie_metadata_df.shape

# save the cleaned summary dataset
plot_summaries_df.to_csv('data/plot_summaries_cleaned.csv', index=False)



  name_id_df = pd.read_csv('data/name_id.tsv',  sep='\t')


In [None]:
plot_summaries_df.shape 

(20160, 2)

In [63]:
#take only the first 30 rows of the plot summaries dataset
#plot_summaries_df = plot_summaries_df.head(114)

# add a column to the plot_summaries_df with embedding of the summary
plot_summaries_df['Embeddings'] = plot_summaries_df['plot_summary'].apply(lambda x: encode(x).tolist() if pd.notnull(x) else None)

#save the embeddings of summaries as a csv file
plot_summaries_df.to_csv('data/plot_summaries_embeddings.csv', index=False)





KeyboardInterrupt: 

In [None]:
#merge the movie_metadata_df with the plot_summaries_df
merged_movie_df = pd.merge(plot_summaries_df, movie_metadata_df, on='wiki_movie_id')
merged_movie_df.head()

Unnamed: 0,wiki_movie_id,plot_summary,Embeddings,Event,Similarity,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,startYear
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...","[tensor(-0.1032), tensor(0.3488), tensor(-0.16...",,,/m/076w2lb,Taxi Blues,1990-09-07,,110.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/0f8l9c"": ""France"", ""/m/05vz3zq"": ""Soviet ...","{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci...",1990
1,31186339,The nation of Panem consists of a wealthy Capi...,"[tensor(0.0094), tensor(0.4040), tensor(-0.001...",,,/m/0gkz15s,The Hunger Games,2012-03-12,686533290.0,142.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":...",2012
2,20663735,Poovalli Induchoodan is sentenced for six yea...,"[tensor(-0.1610), tensor(0.4349), tensor(0.109...",,,/m/051zjwb,Narasimham,NaT,,175.0,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""...",2000
3,2231378,"The Lemon Drop Kid , a New York City swindler,...","[tensor(0.0528), tensor(0.4932), tensor(0.0889...",,,/m/06xtz3,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""...",1951
4,595909,Seventh-day Adventist Church pastor Michael Ch...,"[tensor(0.0699), tensor(0.3469), tensor(-0.008...",,,/m/02tqm5,A Cry in the Dark,1988-11-03,6908797.0,121.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",1988


In [None]:
#create a new dataframe with the movie name, movie summary, matched event, similarity score, and event description
linked_movies_and_events_df = pd.DataFrame(columns=['movie_name', 'plot_summary', 'matched_event', 'similarity_score'])

def match_event_from_summary_embeddings(movie_embedding, movie_release_year):
    # Filter events that happened before the movie was released (at least 2 years since it takes time to make a movie)
    filtered_events = events[events['Year'] < (int(movie_release_year)-2)]
    # Calculate the similarity between the movie and all events
    similarities = filtered_events['Embeddings'].apply(lambda x: calculate_similarity(movie_embedding, x))
    # Get the index of the most similar event
    index = similarities.idxmax()
    # Get the similarity score of the most similar event
    similarity = similarities[index]
    # Get the name of the most similar event
    matched_event_name = filtered_events.loc[index]['Event Name']
    return similarity, matched_event_name

# Loop through each movie
for index, movie in merged_movie_df.iterrows():
    # Get the movie embedding
    movie_embedding = movie['Embeddings']
    movie_release_year = movie['startYear']
    similarity,matched_event = match_event_from_summary_embeddings(movie_embedding,movie_release_year)
    # Add the movie name, summary, matched event, and similarity score to the dataframe
    # add to the merged_movie_df the matched event and similarity score
    movie_metadata_df.loc[index, 'matched_event'] = matched_event
    movie_metadata_df.loc[index, 'similarity_score'] = similarity
        

  movie_metadata_df.loc[index, 'matched_event'] = matched_event


In [None]:
movie_metadata_df

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,startYear,matched_event,similarity_score
0,975900.0,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001,"Publication of ""The Great Gatsby""",0.697752
4,261236.0,/m/01mrr1,A Woman in Flames,NaT,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",1983,Sacco and Vanzetti Arrested,0.760739
13,171005.0,/m/016ywb,Henry V,1989-11-08,10161099.0,137.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",1989,Gold Rush in Australia,0.734199
17,77856.0,/m/0kcn7,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",1964,Cuban Missile Crisis,0.720881
22,21926710.0,/m/05p45cv,White on Rice,NaT,,82.0,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""...",2009,"Publication of ""The Great Gatsby""",0.778169
...,...,...,...,...,...,...,...,...,...,...,...,...
91,,,,NaT,,,,,,,Death of Princess Diana,0.780011
94,,,,NaT,,,,,,,The First Feature-Length Motion Picture with S...,0.813890
95,,,,NaT,,,,,,,"Publication of ""The Great Gatsby""",0.767204
96,,,,NaT,,,,,,,Publication of Moby Dick,0.761621
