In [86]:
import ast
import json

import numpy as np
import pandas as pd


In [87]:
# Locations of the raw input files; every file lives in the same data directory
DATA_DIR = "../data"

PLOT_DATA_PATH = f"{DATA_DIR}/plot_summaries.txt"                # one plot summary per movie
MOVIE_DATA_PATH = f"{DATA_DIR}/movie.metadata.tsv"               # movie-level metadata
CLUSTER_NAME_DATA_PATH = f"{DATA_DIR}/name.clusters.txt"         # character name -> character/actor map ID
CHARACTER_DATA_PATH = f"{DATA_DIR}/character.metadata.tsv"       # character/actor-level metadata

We separate the data into two main categories: movie-related data and actor-related data.

# Movie related

## Cleaning movie metadata

In [88]:
def parse_dict_column(column):
    """
    Parse a column of dictionary-like strings and separate keys and values.

    Each cell is expected to hold the string form of a dictionary, e.g.
    '{"/m/02h40lc": "English Language"}'. For every cell the keys and the
    values are each joined into a single ", "-separated string.

    Parameters
    ----------
    column : iterable
        Cells to parse. A cell may be a dictionary-like string, an actual
        dict, a blank string, or a missing value (None / NaN / pd.NA).

    Returns
    -------
    tuple of (list of str, list of str)
        ``(parsed_keys, parsed_values)``, one entry per input cell. Missing
        cells, blank strings and empty dictionaries yield "".

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid literal.
    TypeError
        If a cell is neither a string, a dict, nor a missing value.
    """
    parsed_keys = []
    parsed_values = []

    for item in column:
        if isinstance(item, dict):
            item_dict = item
        elif isinstance(item, str):
            # Convert string representation of dictionary to actual dictionary;
            # a blank cell is treated like an empty dictionary
            item_dict = ast.literal_eval(item) if item.strip() else {}
        elif pd.api.types.is_scalar(item) and pd.isna(item):
            # Missing cell: ast.literal_eval would raise on NaN/None
            item_dict = {}
        else:
            raise TypeError(
                f"Cannot parse dictionary column entry of type {type(item).__name__}"
            )

        # str() guards against non-string keys/values, which str.join rejects
        parsed_keys.append(", ".join(map(str, item_dict.keys())))
        parsed_values.append(", ".join(map(str, item_dict.values())))

    return parsed_keys, parsed_values

In [96]:
def load_and_clean_movie_data(path=None):
    """
    Load and clean the movie metadata, returning a dataframe with cleaned movie data.

    Cleaning steps:
      * the languages / countries / genres columns (Freebase "ID: name"
        dictionaries stored as strings) are reduced to ", "-separated names;
      * the release date is reduced to its year;
      * the box office revenue is coerced to numeric;
      * characters that cannot be encoded as UTF-8 are stripped from strings.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated movie metadata file.
        Defaults to ``MOVIE_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        Columns: Wikipedia_movie_ID, Freebase_movie_ID, Movie_name,
        Movie_release_date (year as float, NaN when unknown),
        Movie_box_office_revenue, Movie_runtime, Movie_languages,
        Movie_countries, Movie_genres.
    """
    if path is None:
        path = MOVIE_DATA_PATH

    # Load the movie metadata
    df_movie_metadata = pd.read_csv(
        path, sep='\t', header=None,
        names=[
            'Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name',
            'Movie_release_date', 'Movie_box_office_revenue', 'Movie_runtime',
            'Movie_languages_(Freebase ID:name tuples)', 'Movie_countries_(Freebase ID:name tuples)',
            'Movie_genres_(Freebase ID:name tuples)'
        ]
    )

    # Parse 'languages', 'countries', and 'genres' columns, keeping only the
    # names: the Freebase IDs are not part of the returned frame
    for name_column in ('Movie_languages', 'Movie_countries', 'Movie_genres'):
        raw_column = f'{name_column}_(Freebase ID:name tuples)'
        _, names = parse_dict_column(df_movie_metadata[raw_column])
        df_movie_metadata[name_column] = names

    # Select the columns to return; .copy() so later assignments do not write
    # into a view of the raw frame
    cleaned_df_movie_metadata = df_movie_metadata[[
        'Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name', 'Movie_release_date',
        'Movie_box_office_revenue', 'Movie_runtime',
        'Movie_languages', 'Movie_countries', 'Movie_genres'
    ]].copy()

    # Extract the release year from the leading four digits of the date.
    # The dates mix "YYYY", "YYYY-MM" and "YYYY-MM-DD"; pd.to_datetime infers a
    # single format and, with errors='coerce', silently turns every value in
    # another format into NaT, which loses the year of those movies.
    # NOTE: unlike pd.to_datetime, this keeps years outside the datetime64[ns]
    # range instead of turning them into NaN.
    release_year = (
        cleaned_df_movie_metadata['Movie_release_date']
        .astype(str)
        .str.extract(r'^\s*(\d{4})', expand=False)
    )
    cleaned_df_movie_metadata['Movie_release_date'] = (
        pd.to_numeric(release_year, errors='coerce').astype('float64')
    )

    # Coerce box office revenue to numeric; unparsable values become NaN
    cleaned_df_movie_metadata['Movie_box_office_revenue'] = pd.to_numeric(
        cleaned_df_movie_metadata['Movie_box_office_revenue'], errors='coerce'
    )

    # Strip characters that cannot be encoded as UTF-8 from string cells
    # (rows are kept, only the offending characters are removed)
    def _drop_unencodable(value):
        if isinstance(value, str):
            return value.encode('utf-8', 'ignore').decode('utf-8')
        return value

    for column in cleaned_df_movie_metadata.columns:
        # Numeric columns cannot contain strings, so skip them
        if not pd.api.types.is_numeric_dtype(cleaned_df_movie_metadata[column]):
            cleaned_df_movie_metadata[column] = cleaned_df_movie_metadata[column].map(_drop_unencodable)

    # Replace any <NA> with np.nan for uniform NaNs
    cleaned_df_movie_metadata = cleaned_df_movie_metadata.replace({pd.NA: np.nan})

    return cleaned_df_movie_metadata

# Build the cleaned movie metadata frame and preview two random rows
# (no random_state is passed, so the preview changes from run to run)
df_movie_metadata = load_and_clean_movie_data()
df_movie_metadata.sample(2)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
48404,16655899,/m/03yk0yn,Sex Hygiene,,,30.0,English Language,United States of America,"Short Film, Black-and-white, Documentary"
22858,4305023,/m/0bw3_j,Global Heresy,2002.0,,106.0,English Language,United States of America,Comedy


In [105]:
# Sanity check: frame dimensions, then the number of distinct Wikipedia movie IDs
# (equal to the row count when every movie appears exactly once)
print(df_movie_metadata.shape)
print(df_movie_metadata["Wikipedia_movie_ID"].nunique())

(81741, 9)
81741


## Cleaning plots

In [101]:
import re

# Patterns removed by clean_plot, applied in this order
_PLOT_CLEANUP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"http\S+|www\.\S+",   # URLs
    r'<.*?>',              # HTML tags
    r'\{\{.*?\}\}',        # {{annotations}}
    r'\(\[\[',             # the ([[ annotation that is never closed
    r'Synopsis from',      # "Synopsis from" attribution text
    r'<ref[^}]*}}',        # <ref...}} tags
))


def clean_plot(txt):
    """
    Helper function to clean plot texts from unwanted annotations and tags.

    Removes, in order: URLs, HTML tags, {{annotations}}, the unclosed "([["
    marker, the literal text "Synopsis from", and "<ref...}}" tags. Each match
    is replaced by the empty string; surrounding whitespace is left untouched.

    Parameters
    ----------
    txt : str
        Raw plot summary. Non-string values (e.g. NaN for a missing summary)
        are returned unchanged instead of raising in the regex engine.

    Returns
    -------
    str
        The cleaned plot text (or ``txt`` itself when it is not a string).
    """
    if not isinstance(txt, str):
        return txt

    for pattern in _PLOT_CLEANUP_PATTERNS:
        txt = pattern.sub('', txt)

    return txt

In [103]:
def load_and_clean_plots_data():
    """
    Load the plot summaries and return them with cleaned summary texts.

    Reads the tab-separated file at PLOT_DATA_PATH (no header row) into the
    columns 'Wikipedia_movie_ID' and 'summary', then runs every summary
    through clean_plot.
    """
    plot_columns = ['Wikipedia_movie_ID', 'summary']
    raw_plots = pd.read_csv(PLOT_DATA_PATH, sep='\t', header=None,  names=plot_columns)

    # Replace each raw summary by its cleaned version
    return raw_plots.assign(summary=lambda frame: frame['summary'].apply(clean_plot))

Merging metadata and plots

In [108]:
# Combine plots and metadata on the Wikipedia movie ID. The outer merge keeps
# movies that have only a plot or only metadata; the missing side becomes NaN.
df_movie_plots = load_and_clean_plots_data()
df_movie_data = df_movie_plots.merge(df_movie_metadata, on='Wikipedia_movie_ID', how='outer')
print(df_movie_data.shape)

(81840, 10)


## Extracting clean data in a csv

In [109]:
# Write the merged movie data without the dataframe index.
# NOTE(review): the inputs are read from "../data/" but this writes to
# "../../data/" — confirm that the output directory is intended.
df_movie_data.to_csv('../../data/our_movie_data.csv', index=False)

# Actor related

In [None]:
def load_and_clean_cluster_data(file_path="../data/tvtropes.clusters.txt"):
    """
    Load the tvtropes cluster data and return it as a cleaned dataframe.

    Each non-blank line of the file has the form
    ``<character type>\\t{"char": ..., "movie": ..., "id": ..., "actor": ...}``.
    The JSON payload is decoded with the ``json`` module, so quotes, braces or
    tabs inside a name are handled correctly and names such as "NA" are kept
    as text. No intermediate file is written to disk.

    Parameters
    ----------
    file_path : str, optional
        Location of the tvtropes clusters file.

    Returns
    -------
    pandas.DataFrame
        Columns: character_types, character, movie,
        Freebase_character/actor_map_ID, Actor name. A key missing from a
        line's JSON payload yields NaN in the corresponding column.

    Raises
    ------
    json.JSONDecodeError
        If the payload of a line is not valid JSON.
    """
    columns = ['character_types', 'character', 'movie','Freebase_character/actor_map_ID','Actor name']
    # JSON key holding the value of each column after 'character_types'
    payload_keys = ('char', 'movie', 'id', 'actor')

    records = []
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.rstrip("\n")
            if not line.strip():
                continue  # skip blank lines
            # Only the first tab separates the character type from the payload
            character_type, _, payload = line.partition("\t")
            info = json.loads(payload)
            records.append([character_type] + [info.get(key, np.nan) for key in payload_keys])

    df_clusters_tvtropes = pd.DataFrame(records, columns=columns)

    # Replace any <NA> with np.nan for uniform NaNs
    df_clusters_tvtropes = df_clusters_tvtropes.replace({pd.NA: np.nan})
    return df_clusters_tvtropes


In [155]:
# Load the tvtropes character-type clusters, then show the frame size and
# three random rows (no random_state, so the preview changes between runs)
df_clusters_tvtropes = load_and_clean_cluster_data()
print(df_clusters_tvtropes.shape)
df_clusters_tvtropes.sample(3)


(501, 5)


Unnamed: 0,character_types,character,movie,Freebase_character/actor_map_ID,Actor name
278,granola_person,Max Dennison,Hocus Pocus,/m/02vd92w,Omri Katz
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
417,romantic_runnerup,Proteus,Sinbad: Legend of the Seven Seas,/m/02vc177,Joseph Fiennes


In [159]:
def load_and_clean_character_data(character_path=None, cluster_name_path=None):
    """
    Load the character metadata, clean it and attach the unique character names.

    Cleaning steps:
      * the actor height and movie release date columns are dropped;
      * the actor date of birth is reduced to the year of birth;
      * the character name clusters are outer-merged on the character/actor map ID;
      * actor ages outside [0, 110] are replaced by NaN.

    Parameters
    ----------
    character_path : str, optional
        Tab-separated character metadata file. Defaults to ``CHARACTER_DATA_PATH``.
    cluster_name_path : str, optional
        Tab-separated character name clusters file. Defaults to
        ``CLUSTER_NAME_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        Character metadata with 'Actor_date_of_birth' holding the birth year
        (float, NaN when unknown) and an added 'unique_character_name' column.
    """
    if character_path is None:
        character_path = CHARACTER_DATA_PATH
    if cluster_name_path is None:
        cluster_name_path = CLUSTER_NAME_DATA_PATH

    # Ages outside this inclusive range are treated as data errors
    min_actor_age, max_actor_age = 0, 110

    # load from csv
    df_clusters_name = pd.read_csv(cluster_name_path, sep='\t', header=None, names=['unique_character_name', 'Freebase_character/actor_map_ID'])
    df_character_metadata = pd.read_csv(character_path, sep='\t', header=None,
                                    names=[
                                        'Wikipedia movie ID','Freebase_movie_ID', 'Movie_release_date',
                                        'Character_name', 'Actor_date_of_birth', 'Actor_gender',
                                        'Actor_height_(in meters)', 'Actor_ethnicity_(Freebase ID)',
                                        'Actor name', 'Actor_age_at_movie_release', 'Freebase_character/actor_map_ID',
                                        'Freebase_character_ID', 'Freebase_actor_ID'
                                        ])
    #drop unwanted columns
    columns_to_drop = ['Actor_height_(in meters)',  'Movie_release_date']
    df_character_metadata = df_character_metadata.drop(columns=columns_to_drop)

    #keep only year of birth, taken from the leading four digits of the date.
    # pd.to_datetime infers a single format and, with errors='coerce', silently
    # turns values in any other format (e.g. a bare "1974") into NaT.
    # NOTE: unlike pd.to_datetime, this keeps years outside the datetime64[ns]
    # range instead of turning them into NaN.
    birth_year = (
        df_character_metadata['Actor_date_of_birth']
        .astype(str)
        .str.extract(r'^\s*(\d{4})', expand=False)
    )
    df_character_metadata['Actor_date_of_birth'] = pd.to_numeric(birth_year, errors='coerce').astype('float64')

    #merge character info with their unique names
    df_character_metadata = df_character_metadata.merge(df_clusters_name, on='Freebase_character/actor_map_ID', how='outer')

    #Check actor age is bigger equal 0 and smaller equal 110, else replace with NaN
    # (NaN ages fail the range test and therefore stay NaN)
    actor_age = df_character_metadata['Actor_age_at_movie_release']
    df_character_metadata['Actor_age_at_movie_release'] = actor_age.where(actor_age.between(min_actor_age, max_actor_age))

    # Replace any <NA> with np.nan for uniform NaNs
    df_character_metadata = df_character_metadata.replace({pd.NA: np.nan})

    return df_character_metadata

In [161]:
# Load the cleaned character metadata, then show the frame size and three
# random rows (no random_state, so the preview changes between runs)
df_character_metadata = load_and_clean_character_data()
print(df_character_metadata.shape)
df_character_metadata.sample(3)

(450674, 12)


Unnamed: 0,Wikipedia movie ID,Freebase_movie_ID,Character_name,Actor_date_of_birth,Actor_gender,Actor_ethnicity_(Freebase ID),Actor name,Actor_age_at_movie_release,Freebase_character/actor_map_ID,Freebase_character_ID,Freebase_actor_ID,unique_character_name
315880,31910060,/m/0h146vw,Micky,1940.0,M,/m/041rx,James Caan,70.0,/m/0h146vk,/m/0h146vm,/m/0252fh,
74502,32014995,/m/0gx0h2g,Randy Dobson,1977.0,M,/m/07hwkr,Eric Christian Olsen,24.0,/m/04j2k6q,/m/0h5jt_g,/m/0959pn,
165942,3843878,/m/02vkdjr,Bob,1945.0,M,,Larry Pine,61.0,/m/0cg4s79,/m/0h5wxmq,/m/099pjr,


Add the tvtropes character types to the character metadata (matched on the character/actor map ID and the actor name), and check that every unique character name is present.

In [173]:
# Attach the tvtropes character types. The outer merge also keeps tvtropes rows
# whose (map ID, actor name) pair has no match in the character metadata.
df_character_data = df_character_metadata.merge(df_clusters_tvtropes, on=['Freebase_character/actor_map_ID', 'Actor name'], how='outer')
# NOTE(review): load_and_clean_character_data already outer-merges this same
# file on the map ID, so re-reading and merging it again looks redundant —
# confirm that it actually adds rows before keeping it.
df_clusters_name = pd.read_csv(CLUSTER_NAME_DATA_PATH, sep='\t', header=None, names=['unique_character_name', 'Freebase_character/actor_map_ID'])
df_character_data = df_character_data.merge(df_clusters_name, on=['Freebase_character/actor_map_ID', "unique_character_name"], how='outer')
print(df_character_data.shape)
df_character_data.sample(6)

(450742, 15)


Unnamed: 0,Wikipedia movie ID,Freebase_movie_ID,Character_name,Actor_date_of_birth,Actor_gender,Actor_ethnicity_(Freebase ID),Actor name,Actor_age_at_movie_release,Freebase_character/actor_map_ID,Freebase_character_ID,Freebase_actor_ID,unique_character_name,character_types,character,movie
317102,29758182.0,/m/0fq1f0q,,1977.0,M,/m/01xhh5,Park Yong-Ha,31.0,/m/0h1l1fb,,/m/03j2spj,,,,
180981,14023727.0,/m/03cr8_w,,1967.0,F,,Icíar Bollaín,18.0,/m/0chcxc2,,/m/026np8p,,,,
282148,12113466.0,/m/02vq6zf,,1925.0,M,,Jan Merlin,30.0,/m/0ggbxmf,,/m/0c3_jl3,,,,
113806,4102452.0,/m/0bj2cm,Chandramukhi,1971.0,F,/m/0bpjh3,Indrani Haldar,31.0,/m/09hz8c9,/m/0h5sy65,/m/02q38w4,,,,
30272,11350840.0,/m/02r8l03,Doctor Watson,1895.0,M,,Nigel Bruce,,/m/02vd8sc,/m/0cgryr0,/m/02l99f,,,,
273550,26554241.0,/m/0bh8pk4,,1918.0,M,,Cameron Mitchell,55.0,/m/0gdgqtm,,/m/07tvwy,,,,


Extract a csv for actor data

In [174]:
# Write the merged character data without the dataframe index.
# NOTE(review): the inputs are read from "../data/" but this writes to
# "../../data/" — confirm that the output directory is intended.
df_character_data.to_csv('../../data/our_character_data.csv', index=False)