In [2]:
import pandas as pd
import numpy as np 
import ast


In [3]:
# Define data paths constants
PLOT_DATA_PATH = "../data/plot_summaries.txt"
MOVIE_DATA_PATH = "../data/movie.metadata.tsv"
CLUSTER_NAME_DATA_PATH = "../data/name.clusters.txt"
CHARACTER_DATA_PATH = "../data/character.metadata.tsv"

We separate the data into two main categories, movie related data and actors related data

# Movie related

## Cleaning movie metada

In [4]:
"""
Function to parse dictionary-like strings in the file and separate keys and values
"""
def parse_dict_column(column):
    parsed_keys = []
    parsed_values = []
    
    for item in column:
        # Convert string representation of dictionary to actual dictionary
        item_dict = ast.literal_eval(item)
        parsed_keys.append(", ".join(item_dict.keys()))
        parsed_values.append(", ".join(item_dict.values()))
    
    return parsed_keys, parsed_values

In [5]:
"""
Function to load and clean movie metadata, returns a dataframe with cleaned movie data
"""
def load_and_clean_movie_data():
    # Load the movie metadata
    df_movie_metadata = pd.read_csv(
        MOVIE_DATA_PATH, sep='\t', header=None, 
        names=[
            'Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name', 
            'Movie_release_date', 'Movie_box_office_revenue', 'Movie_runtime',
            'Movie_languages_(Freebase ID:name tuples)', 'Movie_countries_(Freebase ID:name tuples)',
            'Movie_genres_(Freebase ID:name tuples)'
        ]
    )
    
    # Parse 'languages', 'countries', and 'genres' columns
    df_movie_metadata['id_movie_languages'], df_movie_metadata['Movie_languages'] = parse_dict_column(df_movie_metadata['Movie_languages_(Freebase ID:name tuples)'])
    df_movie_metadata['id_Movie_countries'], df_movie_metadata['Movie_countries'] = parse_dict_column(df_movie_metadata['Movie_countries_(Freebase ID:name tuples)'])
    df_movie_metadata['id_Movie_genres'], df_movie_metadata['Movie_genres'] = parse_dict_column(df_movie_metadata['Movie_genres_(Freebase ID:name tuples)'])

    # Convert dates to datetime and extract the year
    df_movie_metadata['Movie_release_date'] = pd.to_datetime(df_movie_metadata['Movie_release_date'], errors='coerce').dt.year

    # Select and rename the columns as required
    cleaned_df_movie_metadata = df_movie_metadata[[
        'Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name', 'Movie_release_date', 
        'Movie_box_office_revenue', 'Movie_runtime', 'id_movie_languages', 
        'Movie_languages', 'id_Movie_countries', 'Movie_countries', 
        'id_Movie_genres', 'Movie_genres'
    ]]

    # Drop unwanted id columns
    columns_to_drop = ['id_movie_languages', 'id_Movie_genres', 'id_Movie_countries']
    cleaned_df_movie_metadata = cleaned_df_movie_metadata.drop(columns=columns_to_drop)

    # Convert floats of box office and years to nullable integers, keeping NaNs as np.nan
    cleaned_df_movie_metadata['Movie_box_office_revenue'] = pd.to_numeric(cleaned_df_movie_metadata['Movie_box_office_revenue'], errors='coerce')
    cleaned_df_movie_metadata['Movie_release_date'] = pd.to_numeric(cleaned_df_movie_metadata['Movie_release_date'], errors='coerce')

    #need to drop a line that has some weird encodings 
    cleaned_df_movie_metadata = cleaned_df_movie_metadata.map(lambda x: x.encode('utf-8', 'ignore').decode('utf-8') if isinstance(x, str) else x)

    # Replace any <NA> with np.nan for uniform NaNs
    cleaned_df_movie_metadata = cleaned_df_movie_metadata.replace({pd.NA: np.nan})

    return cleaned_df_movie_metadata

df_movie_metadata = load_and_clean_movie_data()
df_movie_metadata.sample(2)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
6170,7006276,/m/0g_ytf,Sami swoi,,,80.0,Polish Language,Poland,"Black comedy, Satire, Comedy, World cinema"
32355,10113483,/m/02q27lg,The Birthday Party,1968.0,,123.0,English Language,"United States of America, United Kingdom",Comedy


In [6]:
print(df_movie_metadata.shape)
print(df_movie_metadata["Wikipedia_movie_ID"].nunique())

(81741, 9)
81741


## Cleaning plots

In [7]:
import re

"""
Helper function to clean plot texts from unwanted annotations and tags
"""
def clean_plot(txt):

    #Remove URLs
    txt = re.sub(r"http\S+|www\.\S+", '', txt)

    #Remove HTML tags
    txt = re.sub(r'<.*?>', '', txt)

    #Remove {{annotations}}
    txt = re.sub(r'\{\{.*?\}\}', '', txt)

    #Remove the ([[ annotation that is never closed
    txt = re.sub(r'\(\[\[', '', txt)

    #Remove the synopsis from context
    txt = re.sub(r'Synopsis from', '', txt)

    #Remove <ref...}} tags
    txt = re.sub(r'<ref[^}]*}}', '', txt)

    return txt

In [8]:
def load_and_clean_plots_data():
    df_plot_summaries = pd.read_csv(PLOT_DATA_PATH, sep='\t', header=None,  names=['Wikipedia_movie_ID', 'summary'])
    df_plot_summaries['summary'] = df_plot_summaries['summary'].apply(clean_plot)
    return df_plot_summaries

Merging metadata and plots

In [9]:
df_movie_plots = load_and_clean_plots_data()
df_movie_data = df_movie_plots.merge(df_movie_metadata, on='Wikipedia_movie_ID', how='outer')
print(df_movie_data.shape)

(81840, 10)


## Extracting clean data in a csv

In [10]:
df_movie_data.to_csv('../../data/our_movie_data.csv', index=False)

# Actor related

In [12]:
"""
Function to load cluster data, returns a dataframe with cleaned cluster data
"""
def load_and_clean_cluster_data():
    #get cluster data
    file_path = "../data/tvtropes.clusters.txt"
    with open(file_path, "r") as file:
        lines = file.readlines()

    # Replace `{"char": ` with a simpler delimiter like a tab
    lines = [line.replace('{"char": ', '').replace(', "movie": ', '\t')
            .replace(', "id": ', '\t').replace('}', '')
            .replace(', "actor": ', '\t').replace('\t\t', '\t') for line in lines]

    with open("../data/pro_tvtropes.clusters.txt", "w") as file:
        file.writelines(lines)
    file_path = "../data/pro_tvtropes.clusters.txt"

    # Load the processed file
    df_clusters_tvtropes = pd.read_csv(file_path, sep='\t', header=None, names=['character_types', 'character', 'movie','Freebase_character/actor_map_ID','Actor_name'])

    # Replace any <NA> with np.nan for uniform NaNs
    df_clusters_tvtropes = df_clusters_tvtropes.replace({pd.NA: np.nan})
    return df_clusters_tvtropes


In [13]:
df_clusters_tvtropes = load_and_clean_cluster_data()
print(df_clusters_tvtropes.shape)
df_clusters_tvtropes.sample(3)


(501, 5)


Unnamed: 0,character_types,character,movie,Freebase_character/actor_map_ID,Actor_name
226,egomaniac_hunter,The Predator,Predator,/m/0h34xg2,Peter Cullen
237,fastest_gun_in_the_west,Ned Nederlander,\u00a1Three Amigos!,/m/0jshd9,Martin Short
162,crazy_survivalist,Eli,The Book of Eli,/m/05nv915,Denzel Washington


In [14]:
def load_and_clean_character_data():
    
    # load from csv
    df_clusters_name = pd.read_csv(CLUSTER_NAME_DATA_PATH, sep='\t', header=None, names=['unique_character_name', 'Freebase_character/actor_map_ID'])
    df_character_metadata = pd.read_csv(CHARACTER_DATA_PATH, sep='\t', header=None, 
                                    names=[
                                        'Wikipedia_movie_ID','Freebase_movie_ID', 'Movie_release_date',
                                        'Character_name', 'Actor_date_of_birth', 'Actor_gender',
                                        'Actor_height_(in meters)', 'Actor_ethnicity_(Freebase ID)',
                                        'Actor_name', 'Actor_age_at_movie_release', 'Freebase_character/actor_map_ID',
                                        'Freebase_character_ID', 'Freebase_actor_ID'
                                        ])


    #keep only year of birth
    df_character_metadata['Actor_date_of_birth'] = pd.to_datetime(df_character_metadata['Actor_date_of_birth'], errors='coerce').dt.year

    #merge character info with their unique names
    df_character_metadata = df_character_metadata.merge(df_clusters_name, on='Freebase_character/actor_map_ID', how='outer')

    #Check actor age is bigger equal 0 and smaller than 110, else replace with NaN
    df_character_metadata['Actor_age_at_movie_release'] = df_character_metadata['Actor_age_at_movie_release'].apply(lambda x: x if 0 <= x <= 110 else np.nan)

    # Replace any <NA> with np.nan for uniform NaNs
    df_character_metadata = df_character_metadata.replace({pd.NA: np.nan})
        
    return df_character_metadata

In [15]:
df_character_metadata = load_and_clean_character_data()
print(df_character_metadata.shape)
df_character_metadata.sample(3)

(450674, 14)


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_(in meters),Actor_ethnicity_(Freebase ID),Actor_name,Actor_age_at_movie_release,Freebase_character/actor_map_ID,Freebase_character_ID,Freebase_actor_ID,unique_character_name
97340,15122431,/m/03hj5wg,1992-12-16,,1910.0,F,1.62,/m/041rx,Sylvia Sidney,82.0,/m/064843_,,/m/041b4j,
42631,6301209,/m/0g00zr,2006,Durga,1975.0,F,1.59,/m/0dryh9k,Amisha Patel,30.0,/m/03jsfy5,/m/0h169_g,/m/040khd,
42613,163451,/m/015qqg,1977-10-02,,1954.0,F,,,Lisa Pelikan,23.0,/m/03jsfpv,,/m/05xbll,


Adding to the metadata on their ids character types and checking that all unique character names are instanciated

In [17]:
df_actor_data = df_character_metadata.merge(df_clusters_tvtropes, on=['Freebase_character/actor_map_ID', 'Actor_name'], how='outer')
df_clusters_name = pd.read_csv(CLUSTER_NAME_DATA_PATH, sep='\t', header=None, names=['unique_character_name', 'Freebase_character/actor_map_ID'])
df_actor_data = df_actor_data.merge(df_clusters_name, on=['Freebase_character/actor_map_ID', "unique_character_name"], how='outer')
columns_to_drop = ['Freebase_movie_ID','Movie_name','Movie_release_date']
df_movie_metadata2 = df_movie_metadata.drop(columns=columns_to_drop)
df_actor_data = df_actor_data.merge(df_movie_metadata2, on=['Wikipedia_movie_ID'], how ='outer')
columns_to_drop = ['Freebase_movie_ID','Freebase_actor_ID','Freebase_character/actor_map_ID','Freebase_character_ID']
df_actor_data2 = df_actor_data.drop(columns=columns_to_drop)
df_actor_data2.head(6)

Unnamed: 0,Wikipedia_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_(in meters),Actor_ethnicity_(Freebase ID),Actor_name,Actor_age_at_movie_release,unique_character_name,character_types,character,movie,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,330.0,1996,,,F,,,Mercè Pons,29.0,,,,,,90.0,"Catalan language, Spanish Language",Spain,"Drama, Comedy-drama"
1,330.0,1996,,1935.0,F,,,Núria Espert,60.0,,,,,,90.0,"Catalan language, Spanish Language",Spain,"Drama, Comedy-drama"
2,330.0,1996,,1941.0,F,,/m/03ttfc,Rosa Maria Sardà,54.0,,,,,,90.0,"Catalan language, Spanish Language",Spain,"Drama, Comedy-drama"
3,330.0,1996,,1944.0,F,,,Anna Lizaran,51.0,,,,,,90.0,"Catalan language, Spanish Language",Spain,"Drama, Comedy-drama"
4,3217.0,1992-10-09,Fake shemp,,M,,,Ivan Raimi,36.0,,,,,21502796.0,81.0,English Language,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti..."
5,3217.0,1992-10-09,Duke Henry the Red,,M,,,Richard Grove,37.0,,,,,21502796.0,81.0,English Language,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti..."


Extract a csv for actor data

In [18]:
df_actor_data2.to_csv('../../data/our_actor_data.csv', index=False)

# With a focus on the character type

In [24]:
df_character_data = df_character_metadata.merge(df_clusters_tvtropes, on=['Freebase_character/actor_map_ID', 'Actor_name'], how='inner')
print(df_character_data.shape)
df_character_data.head(6)

(497, 17)


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_(in meters),Actor_ethnicity_(Freebase ID),Actor_name,Actor_age_at_movie_release,Freebase_character/actor_map_ID,Freebase_character_ID,Freebase_actor_ID,unique_character_name,character_types,character,movie
0,54173,/m/0f4_l,1994-05,Vincent Vega,1954.0,M,1.83,/m/0xnvg,John Travolta,,/m/01xlj4l,/m/030rbl,/m/0f502,,stupid_crooks,Vincent Vega,Pulp Fiction
1,53964,/m/0f3m1,1980-05-21,Han Solo,1942.0,M,1.85,/m/01qhm_,Harrison Ford,37.0,/m/01xpntj,/m/0fjn8,/m/0c0k1,Han Solo,trickster,Han Solo,Star Wars Episode V: The Empire Strikes Back
2,18952889,/m/0k5fg,1963-11-07,Lennie Pike,1925.0,M,1.77,,Jonathan Winters,37.0,/m/01z0cck,/m/0h5n8k6,/m/01gn36,,dumb_muscle,Lennie Pike,"It's a Mad, Mad, Mad, Mad World"
3,7261333,/m/0kv2hv,2007-02-09,Norbit,1961.0,M,1.75,/m/0x67,Eddie Murphy,45.0,/m/01z0rf5,/m/02nwkcr,/m/0f7hc,,henpecked_husband,Norbit,Norbit
4,1670736,/m/05m55b,1994-09-10,Guy,1963.0,M,1.71,,Frank Whaley,31.0,/m/021_h9h,/m/02nwn16,/m/07sjll,,crazy_jealous_guy,Guy,Swimming with Sharks
5,560511,/m/02q2jw,1981-06-26,Sgt. Hulka,1928.0,M,1.82,,Warren Oates,52.0,/m/022108v,/m/07034y7,/m/049tcm,,drill_sargeant_nasty,Sgt. Hulka,Stripes


Reordering the table

In [25]:
new_column_order = ['Actor_name','character_types','character','Freebase_character_ID',
                    'Freebase_actor_ID','Freebase_character/actor_map_ID','Actor_gender','Actor_age_at_movie_release',
                    'Actor_date_of_birth','Actor_height_(in meters)','Actor_ethnicity_(Freebase ID)',
                    'unique_character_name',
                    'movie','Wikipedia_movie_ID','Freebase_movie_ID',
                   ]

df_character_data_ordered = df_character_data[new_column_order]
df_character_data_ordered.head()

Unnamed: 0,Actor_name,character_types,character,Freebase_character_ID,Freebase_actor_ID,Freebase_character/actor_map_ID,Actor_gender,Actor_age_at_movie_release,Actor_date_of_birth,Actor_height_(in meters),Actor_ethnicity_(Freebase ID),unique_character_name,movie,Wikipedia_movie_ID,Freebase_movie_ID
0,John Travolta,stupid_crooks,Vincent Vega,/m/030rbl,/m/0f502,/m/01xlj4l,M,,1954.0,1.83,/m/0xnvg,,Pulp Fiction,54173,/m/0f4_l
1,Harrison Ford,trickster,Han Solo,/m/0fjn8,/m/0c0k1,/m/01xpntj,M,37.0,1942.0,1.85,/m/01qhm_,Han Solo,Star Wars Episode V: The Empire Strikes Back,53964,/m/0f3m1
2,Jonathan Winters,dumb_muscle,Lennie Pike,/m/0h5n8k6,/m/01gn36,/m/01z0cck,M,37.0,1925.0,1.77,,,"It's a Mad, Mad, Mad, Mad World",18952889,/m/0k5fg
3,Eddie Murphy,henpecked_husband,Norbit,/m/02nwkcr,/m/0f7hc,/m/01z0rf5,M,45.0,1961.0,1.75,/m/0x67,,Norbit,7261333,/m/0kv2hv
4,Frank Whaley,crazy_jealous_guy,Guy,/m/02nwn16,/m/07sjll,/m/021_h9h,M,31.0,1963.0,1.71,,,Swimming with Sharks,1670736,/m/05m55b


Extract a csv for actor data

In [26]:
df_character_data_ordered.to_csv('../../data/our_character2_data.csv', index=False)