# Merge Cast Datasets

In this notebook, we merge three distinct cast datasets to create a comprehensive and enriched dataset:

1. **Wikidata Dataset**: Scraped data providing supplementary information about casts.
2. **MovieSummaries**: Provides additional movie-actor-role information.
3. **IMDB Non-Commercial Dataset**: Provides further details about actors and roles, including actors importance order. Obtained from [IMDB Non-Commercial Datasets](https://developer.imdb.com/non-commercial-datasets/).

### Merging Process
- We first merge **Wikidata** with **MovieSummaries**. We use Wikidata as the base source and we supplement it with additional information from MovieSummaries. We ensure no movie-actor-role relationship is duplicated.
- Then, the resulting dataset is merged with **IMDB**.

In [1]:
import pandas as pd
import numpy as np
import json
import unicodedata
from datetime import datetime

from auxiliary_functions_for_merging import *

DATA_PATH = "./../../Data/"

In [None]:
# Import Wikidata dataset

cast_wikidata_cols = ['wikidata_id_movie', 'freebase_id_movie', 'imdb_id_movie', 'title_movie', 'wikidata_id_actor',
       'freebase_id_actor', 'imdb_id_actor', 'name_actor', 'role',
       'character_name']
cast_wikidata = pd.read_csv(DATA_PATH + "wikidata_cast_imdb.csv", header=0, names=cast_wikidata_cols)

In [None]:
# Import MovieSummaries dataset

cast_original_cols = ["wikipedia_id_movie", "freebase_id_movie", "release_date", "character_name", "date_of_birth", "gender", "height",
                           "freebase_id_etnicity", "name_actor", "age_actor", "freebase_id_character_actor", "freebase_id_character",
                           "freebase_id_actor"]
cast_original = pd.read_csv(DATA_PATH + 'character.metadata.tsv', sep='\t', header=None, names=cast_original_cols).dropna(subset=['freebase_id_actor'])
cast_original["role"] = "actor"
cols = ["wikipedia_id_movie", "freebase_id_movie", "freebase_id_actor", "name_actor", "character_name", "role"]
cast_original = cast_original[cols]

In [4]:
# Group both datasets by movie (using freebaseID). Cycle over the movies merging the casts. Merge first by freebaseID and then using actorName.
# Keep the cast of movies that appear in the MovieSummaries dataset but not in the Wikidata dataset.
# Concatenate the resulting dataframes.

movies_complete = pd.read_csv(DATA_PATH + "movies_complete.tsv", sep='\t')
translator = pd.Series(movies_complete.imdb_id_movie.values, index=movies_complete.freebase_id_movie.values)

cast_wikidata_grouped = cast_wikidata.groupby(by='freebase_id_movie')
cast_original_grouped = cast_original.groupby(by='freebase_id_movie')

dfs = []
groups_1 = set(cast_wikidata_grouped.groups)
groups_2 = set(cast_original_grouped.groups).difference(groups_1)
size_groups_1 = len(groups_1)
size_groups_2 = len(groups_2)
for i, freebase_id_movie in enumerate(groups_1):
    if i%100 == 0:
        print(f"group 1: {i}/{size_groups_1}")
    
    wikidata_df = cast_wikidata_grouped.get_group(freebase_id_movie).copy(deep=True)
    original_df = cast_original_grouped.get_group(freebase_id_movie).copy(deep=True) if freebase_id_movie in cast_original_grouped.groups else pd.DataFrame(columns=cast_original.columns)

    wikidata_df["name_key"] = create_key_series(wikidata_df, "name_actor").astype(str)
    duplicate_keys = wikidata_df["name_key"].value_counts()
    duplicate_keys = duplicate_keys[duplicate_keys > 1].index
    wikidata_df["name_key"] = wikidata_df["name_key"].apply(lambda x: pd.NA if x in duplicate_keys else x)

    original_df["name_key"] = create_key_series(original_df, "name_actor").astype(str)
    duplicate_keys = original_df["name_key"].value_counts()
    duplicate_keys = duplicate_keys[duplicate_keys > 1].index
    original_df["name_key"] = original_df["name_key"].apply(lambda x: pd.NA if x in duplicate_keys else x)

    df1 = pd.merge(wikidata_df, original_df, left_on="freebase_id_actor", right_on="freebase_id_actor", how="inner", suffixes=('', '_orig'))
    rest_of_wikidata_df = wikidata_df[~wikidata_df["freebase_id_actor"].isin(df1["freebase_id_actor"])].dropna(subset=["name_key"])
    df2 = pd.merge(rest_of_wikidata_df, original_df, left_on="name_key", right_on="name_key", how="left", suffixes=('', '_orig'))
    df = pd.concat([df1, df2]).drop(columns=['name_key', 'name_key_orig', 'role_orig'])

    df["freebase_id_movie"] = df.apply(lambda row: row["freebase_id_movie_orig"] if pd.isna(row["freebase_id_movie"]) else row["freebase_id_movie"], axis=1)
    df["name_actor"] = df.apply(lambda row: row["name_actor_orig"] if pd.isna(row["name_actor"]) else row["name_actor"], axis=1)
    df["character_name"] = df.apply(lambda row: row["character_name_orig"] if pd.isna(row["character_name"]) else row["character_name"], axis=1)
    df["freebase_id_actor"] = df.apply(lambda row: row["freebase_id_actor_orig"] if pd.isna(row["freebase_id_actor"]) else row["freebase_id_actor"], axis=1)

    df = df.drop(columns=["freebase_id_movie_orig", "name_actor_orig", "character_name_orig", "freebase_id_actor_orig"])
    
    dfs.append(df.copy(deep=True))

for i, freebase_id_movie in enumerate(groups_2):
    if i%100 == 0:
        print(f"group 2: {i}/{size_groups_2}")
    
    original_df = cast_original_grouped.get_group(freebase_id_movie).copy(deep=True)
    original_df["imdb_id_movie"] = original_df.freebase_id_movie.apply(lambda x: translator[x] if x in translator.index else pd.NA)
    
    dfs.append(original_df.copy(deep=True))

movie_actor_complete = pd.concat(dfs)

group 1: 0/64170
group 1: 100/64170
group 1: 200/64170
group 1: 300/64170
group 1: 400/64170
group 1: 500/64170
group 1: 600/64170
group 1: 700/64170
group 1: 800/64170
group 1: 900/64170
group 1: 1000/64170
group 1: 1100/64170
group 1: 1200/64170
group 1: 1300/64170
group 1: 1400/64170
group 1: 1500/64170
group 1: 1600/64170
group 1: 1700/64170
group 1: 1800/64170
group 1: 1900/64170
group 1: 2000/64170
group 1: 2100/64170
group 1: 2200/64170
group 1: 2300/64170
group 1: 2400/64170
group 1: 2500/64170
group 1: 2600/64170
group 1: 2700/64170
group 1: 2800/64170
group 1: 2900/64170
group 1: 3000/64170
group 1: 3100/64170
group 1: 3200/64170
group 1: 3300/64170
group 1: 3400/64170
group 1: 3500/64170
group 1: 3600/64170
group 1: 3700/64170
group 1: 3800/64170
group 1: 3900/64170
group 1: 4000/64170
group 1: 4100/64170
group 1: 4200/64170
group 1: 4300/64170
group 1: 4400/64170
group 1: 4500/64170
group 1: 4600/64170
group 1: 4700/64170
group 1: 4800/64170
group 1: 4900/64170
group 1: 500

  movie_actor_complete = pd.concat(dfs)


In [5]:
# Import and clean IMDB dataset. There are only three instances where in the same movie one actor plays two different roles, we select one manually.

title_principals = pd.read_csv(DATA_PATH + "title.principals.onlymovies.tsv", sep='\t')

title_principals.loc[9010173, "characters"] = '["Ulisses"]'
title_principals.loc[9010175, "characters"] = '["Heitor"]'
title_principals.loc[9010178, "characters"] = '["Hercules"]'

def extract_character(x):
    if pd.isna(x):
        return pd.NA
    return x.strip()[2:-2].strip()
title_principals["characters"] = title_principals.characters.apply(extract_character)
title_principals["ordering"] = title_principals["ordering"].astype("Int64")

In [6]:
# Merge Wikidata_MovieSummaries datasets by (imdbID_movie, imdbID_actor)

movie_actor_complete = pd.merge(movie_actor_complete, title_principals, left_on=['imdb_id_movie', 'imdb_id_actor'], right_on=["tconst", "nconst"], how="left")
movie_actor_complete = movie_actor_complete.drop(columns=['tconst', 'nconst', 'category', 'job'])

movie_actor_complete["character_name"] = movie_actor_complete.apply(lambda row: row["characters"] if pd.isna(row["character_name"]) else row["character_name"], axis=1)
movie_actor_complete = movie_actor_complete.drop(columns=['characters'])

In [7]:
# Export the combined dataset

cols = ['freebase_id_movie', 'wikidata_id_movie', 'wikipedia_id_movie', 'imdb_id_movie', 'title_movie', 'freebase_id_actor', 'wikidata_id_actor', 
        'imdb_id_actor', 'name_actor', 'role', 'character_name', 'ordering']
movie_actor_complete = movie_actor_complete[cols]

movie_actor_complete.to_csv(DATA_PATH + "movie_actor_complete.tsv", sep='\t', index=False)

In [4]:
# Run after creating people_complete. Assign each actor to its univocalID.

movie_actor_complete = pd.read_csv(DATA_PATH + "movie_actor_complete.tsv", sep='\t')

people_complete = pd.read_csv(DATA_PATH + "people_complete.tsv", sep='\t')
translator1 = pd.Series(people_complete.dropna(subset=["freebase_id_actor"]).univocal_id_actor.values, index=people_complete.dropna(subset=["freebase_id_actor"]).freebase_id_actor.values)
translator2 = pd.Series(people_complete.dropna(subset=["imdb_id_actor"]).univocal_id_actor.values, index=people_complete.dropna(subset=["imdb_id_actor"]).imdb_id_actor.values)
def get_univocal_id_actor(row):
    if row["freebase_id_actor"] in translator1.index:
        return translator1[row["freebase_id_actor"]]
    elif row["imdb_id_actor"] in translator2.index:
        return translator2[row["imdb_id_actor"]]
    else:
        return pd.NA

movie_actor_complete["univocal_id_actor"] = movie_actor_complete.apply(get_univocal_id_actor, axis=1)
movie_actor_complete = movie_actor_complete.dropna(subset=["univocal_id_actor"])

In [5]:
# Export the resulting dataset

cols = ['freebase_id_movie', 'wikidata_id_movie', 'wikipedia_id_movie', 'imdb_id_movie', 'title_movie', 'univocal_id_actor', 'freebase_id_actor', 'wikidata_id_actor', 
        'imdb_id_actor', 'name_actor', 'role', 'character_name', 'ordering']
movie_actor_complete = movie_actor_complete[cols]

movie_actor_complete.to_csv(DATA_PATH + "movie_actor_complete.tsv", sep='\t', index=False)