# LOAD DATA

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
import json

# Read the movie metadata TSV. The file has no header row, so the column names are supplied here.
movie_metadata_df = pd.read_csv(
    'data/CMU_DATA/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=[
        'wiki_movie_id', 'freebase_movie_id', 'movie_name',
        'movie_release_date', 'movie_box_office_revenue', 'movie_runtime',
        'movie_languages', 'movie_countries', 'movie_genres',
    ],
)

# Hand-correct two outlier values; the rows are selected by movie title.
movie_metadata_df.loc[movie_metadata_df['movie_name'].eq('Zero Tolerance'), 'movie_runtime'] = 88
movie_metadata_df.loc[movie_metadata_df['movie_name'].eq('Hunting Season'), 'movie_release_date'] = '2010-12-02'

# Release year = first four characters of the raw date string (kept as text).
movie_metadata_df['startYear'] = movie_metadata_df['movie_release_date'].str.slice(stop=4)

# Parse the release date; values that do not match YYYY-MM-DD become NaT (errors='coerce').
movie_metadata_df['movie_release_date'] = pd.to_datetime(
    movie_metadata_df['movie_release_date'],
    errors='coerce',
    format='%Y-%m-%d',
)
movie_metadata_df.head()

# Read the plot summaries (tab-separated, no header row): a wiki movie id and its plot text per row.
plot_summaries_df = pd.read_csv(
    'data/CMU_DATA/plot_summaries.txt',
    sep='\t',
    header=None,
    names=['wiki_movie_id', 'plot_summary'],
)
plot_summaries_df.head()
# Read name.clusters.txt (tab-separated, no header row): a character name and the
# Freebase character/actor map id it belongs to.
# The file used to be read three times in a row with identical arguments; once is enough.
name_clusters_df = pd.read_csv(
    'data/CMU_DATA/name.clusters.txt',
    sep='\t',
    header=None,
    names=['character_name', 'freebase_character_actor_map_id'],
)
name_clusters_df.head()
# Read the character metadata TSV. The file has no header row, so the column names are supplied here.
character_metadata_df = pd.read_csv(
    'data/CMU_DATA/character.metadata.tsv',
    sep='\t',
    header=None,
    names=[
        'wiki_movie_id', 'freebase_movie_id', 'movie_release_date',
        'character_name',
        'actor_date_of_birth', 'actor_gender', 'actor_height', 'actor_ethnicity',
        'actor_name', 'actor_age_at_movie_release',
        'freebase_character_actor_map_id', 'freebase_character_id', 'freebase_actor_id',
    ],
)
character_metadata_df.head()
# Each row of the TV-tropes file is "<character type>\t<JSON text>"; there is no header row.
tvtropes_clusters_df = pd.read_csv(
    'data/CMU_DATA/tvtropes.clusters.txt',
    sep='\t',
    header=None,
    names=['character_type', 'data_dict'],
)
# Parse the JSON text and spread every parsed object over its own columns,
# placed to the right of the character type.
tvtropes_clusters_df = pd.concat(
    [
        tvtropes_clusters_df[['character_type']],
        tvtropes_clusters_df['data_dict'].apply(json.loads).apply(pd.Series),
    ],
    axis=1,
)
# Positional rename: assumes the JSON objects expand to exactly these four fields,
# in this order -- TODO confirm against the file.
tvtropes_clusters_df.columns = [
    'character_type',
    'character_name',
    'movie_name',
    'freebase_character_actor_map_id',
    'actor_name',
]
tvtropes_clusters_df.head()
# Load the two IMDB tables and join them on the IMDB title id (tconst).
rating_id_df = pd.read_csv('data/IMDB_DATA/rating_id.tsv', sep='\t')
name_id_df = pd.read_csv('data/IMDB_DATA/name_id.tsv', sep='\t')
rating_df = rating_id_df.merge(name_id_df, on='tconst')

# Discard the columns that are not used later on.
rating_df = rating_df.drop(
    columns=['originalTitle', 'isAdult', 'endYear', 'runtimeMinutes', 'genres']
)
rating_df.head()
# Load the generated events table.
events = pd.read_csv('data/GENERATED_DATA/events.csv')
events.head()

# Attach the IMDB rating columns to the movie metadata; a row pairs up when both the
# movie name and the release year agree.
movies_metadata_ratings = movie_metadata_df.merge(rating_df, on=['movie_name', 'startYear'])
movies_metadata_ratings.head()

# Strip every {{...}} marker from the plot summaries (non-greedy, so each marker is removed on its own).
plot_summaries_df['plot_summary'] = plot_summaries_df['plot_summary'].str.replace(
    r'\{\{.*?\}\}', '', regex=True
)

# Keep titles typed as 'movie' only (drops every other titleType).
movies_metadata_ratings = movies_metadata_ratings.loc[
    movies_metadata_ratings['titleType'].eq('movie')
]
movies_metadata_ratings.head()

# Keep movies with strictly more than 200 IMDB votes.
movies_metadata_ratings = movies_metadata_ratings.loc[
    movies_metadata_ratings['numVotes'].gt(200)
]
movies_metadata_ratings.shape
# Movie metadata: only movies that are still present in the rated set.
movie_metadata_df = movie_metadata_df.loc[
    movie_metadata_df['freebase_movie_id'].isin(movies_metadata_ratings['freebase_movie_id'])
]
movie_metadata_df.shape

# Plot summaries: only those belonging to the movies kept above.
plot_summaries_df = plot_summaries_df.loc[
    plot_summaries_df['wiki_movie_id'].isin(movie_metadata_df['wiki_movie_id'])
]
plot_summaries_df.shape

# Movie metadata again: drop movies that turned out to have no summary.
movie_metadata_df = movie_metadata_df.loc[
    movie_metadata_df['wiki_movie_id'].isin(plot_summaries_df['wiki_movie_id'])
]
movie_metadata_df.shape

# Rated movies: likewise restricted to movies that have a summary.
movies_metadata_ratings = movies_metadata_ratings.loc[
    movies_metadata_ratings['wiki_movie_id'].isin(plot_summaries_df['wiki_movie_id'])
]
movies_metadata_ratings.shape

# Characters: only those appearing in the remaining movies.
character_metadata_df = character_metadata_df.loc[
    character_metadata_df['wiki_movie_id'].isin(movie_metadata_df['wiki_movie_id'])
]
character_metadata_df.shape

# TV-tropes clusters: only entries whose character/actor map id is among the remaining characters.
tvtropes_clusters_df = tvtropes_clusters_df.loc[
    tvtropes_clusters_df['freebase_character_actor_map_id'].isin(
        character_metadata_df['freebase_character_actor_map_id']
    )
]
tvtropes_clusters_df.shape

  name_id_df = pd.read_csv('data/IMDB_DATA/name_id.tsv',  sep='\t')


(453, 5)

# LOAD THE EVENT AND PLOT-SUMMARY EMBEDDINGS

In [3]:
import ast
import pandas as pd
import torch
from scipy.spatial.distance import cosine


def string_to_tensor(string):
    """Turn the text form of a (nested) list of numbers back into a torch tensor.

    Parameters
    ----------
    string : str
        Python-literal text such as "[0.1, 0.2]".

    Returns
    -------
    torch.Tensor or None
        The rebuilt tensor, or None when the text cannot be parsed as a Python
        literal or the parsed value cannot be turned into a tensor.
    """
    try:
        return torch.tensor(ast.literal_eval(string))
    except (ValueError, SyntaxError, TypeError):
        # ast.literal_eval raises ValueError for non-literal content and SyntaxError
        # for text that is not valid Python at all (e.g. a truncated "[1, 2");
        # torch.tensor may raise ValueError/TypeError for values it cannot convert.
        # All of them mean "no usable embedding in this cell".
        return None
    
# Event embeddings: the CSV stores each embedding as text, so rebuild the tensors
# after reading (string_to_tensor yields None for cells it cannot parse).
events_embedded = pd.read_csv('data/EMBEDDINGS/events_embeddings.csv').assign(
    Embeddings=lambda frame: frame['Embeddings'].apply(string_to_tensor)
)

# Plot-summary embeddings: same treatment.
summaries_embedded = pd.read_csv('data/EMBEDDINGS/plot_summaries_embeddings.csv').assign(
    Embeddings=lambda frame: frame['Embeddings'].apply(string_to_tensor)
)


# LINK EVERY MOVIE SUMMARY WITH AN EVENT

In [4]:
# Join the embedded summaries with the rated movie metadata; only movies present in both are kept.
merged_movie_df = summaries_embedded.merge(
    movies_metadata_ratings,
    how='inner',
    on='wiki_movie_id',
)

def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    scipy's ``cosine`` computes the cosine *distance*, so the similarity is
    one minus that value.
    """
    return 1 - cosine(embedding1, embedding2)


In [5]:
def match_event_from_summary_embeddings(movie_embedding, movie_release_year, min_years_gap=2):
    """Find the event whose embedding is most similar to a movie's summary embedding.

    Only events from the module-level ``events_embedded`` table whose ``Year`` is
    strictly smaller than ``movie_release_year - min_years_gap`` are considered
    (a movie takes time to make, so very recent events are excluded).

    Parameters
    ----------
    movie_embedding
        Embedding of the movie's plot summary.
    movie_release_year
        Release year of the movie; anything ``int()`` accepts (e.g. "1999").
    min_years_gap : int, default 2
        Offset subtracted from the release year to get the cut-off year.

    Returns
    -------
    tuple
        ``(similarity, matched_event_name)`` for the most similar candidate event.

    Raises
    ------
    ValueError
        If ``movie_release_year`` cannot be converted to an int, or if no event
        is dated before the cut-off year.
    """
    cutoff_year = int(movie_release_year) - min_years_gap
    candidate_events = events_embedded[events_embedded['Year'] < cutoff_year]
    if candidate_events.empty:
        # Without this guard the failure surfaces later as an opaque idxmax error.
        raise ValueError(
            f"no events dated before {cutoff_year} are available to match a movie "
            f"released in {movie_release_year}"
        )

    # Similarity of the movie against every candidate event (index is preserved).
    similarities = candidate_events['Embeddings'].apply(
        lambda event_embedding: calculate_similarity(movie_embedding, event_embedding)
    )
    best_index = similarities.idxmax()
    # Single-step .loc lookups instead of chained indexing.
    return similarities.loc[best_index], candidate_events.loc[best_index, 'Event Name']

# Build the event-name -> description lookup once, instead of filtering the whole
# `events` frame for every movie. drop_duplicates keeps the first row per name,
# which is the same row the former `.values[0]` selection picked.
# An event name missing from `events` now raises KeyError (previously IndexError).
event_descriptions = (
    events.drop_duplicates(subset='Event Name')
    .set_index('Event Name')['Event Description']
    .to_dict()
)

# One record per movie: its best-matching event plus the fields kept in the results table.
linked_movies_events = []
for _, movie in merged_movie_df.iterrows():
    similarity, matched_event = match_event_from_summary_embeddings(
        movie['Embeddings'], movie['startYear']
    )
    linked_movies_events.append({
        'wiki_movie_id': movie['wiki_movie_id'],
        'movie_name': movie['movie_name'],
        'plot_summary': movie['plot_summary'],
        'matched_event': matched_event,
        'event description': event_descriptions[matched_event],
        'similarity_score': similarity,
        'rating': movie['averageRating'],
    })

# Build the DataFrame in one go from the collected records.
linked_movies_and_events_df = pd.DataFrame(linked_movies_events)


In [6]:
# Show the linked movies/events table as this cell's output.
linked_movies_and_events_df

Unnamed: 0,wiki_movie_id,movie_name,plot_summary,matched_event,event description,similarity_score,rating
0,23890098,Taxi Blues,"Shlykov, a hard-working taxi driver and Lyosha...",Fascist Italy and Soviet Russia Establish Dipl...,Mussolini’s Fascist Italy and the Soviet Union...,0.642929,7.3
1,31186339,The Hunger Games,The nation of Panem consists of a wealthy Capi...,Unabomber Arrested,After nearly two decades of terror and a natio...,0.704186,7.2
2,20663735,Narasimham,Poovalli Induchoodan is sentenced for six yea...,Inauguration of Operation Blue Star,Indian Prime Minister Indira Gandhi ordered a ...,0.711646,7.6
3,2231378,The Lemon Drop Kid,"The Lemon Drop Kid , a New York City swindler,...",Siege of Sidney Street,"A notorious gunfight in London’s East End, inv...",0.713506,7.0
4,595909,A Cry in the Dark,Seventh-day Adventist Church pastor Michael Ch...,Trial of Oscar Wilde,Prominent author Oscar Wilde was put on trial ...,0.762600,6.9
...,...,...,...,...,...,...,...
20210,17208834,Dance of the Dead,A high school prom in Georgia is unexpectedly ...,Unabomber Arrested,After nearly two decades of terror and a natio...,0.729695,5.8
20211,30352708,Paju,Twenty-something Eun-mo listens to a taxi driv...,Donghak Peasant Revolution,The Donghak Peasant Revolution began in South ...,0.736914,6.5
20212,1096473,The Last Command,"In 1928 Hollywood, director Leo Andreyev look...",Sergei Diaghilev's Ballet Russes,Russian impresario Sergei Diaghilev’s Ballets ...,0.719882,8.0
20213,8628195,Kabuliwala,"Abdur Rehman Khan , a middle-aged dry fruit se...",Mother Teresa Opened A Home,"In 1952, Mother Teresa opened the first Home f...",0.738110,8.0


In [7]:
# Write the linked movies/events table to CSV, leaving out the row index.
linked_movies_and_events_df.to_csv(
    'data/RESULTS//linked_movies_and_events_df.csv',
    index=False,
)