# Merge People Datasets

In this notebook, we merge two distinct people datasets to create a comprehensive and enriched dataset:

1. **MovieSummaries**: Provides biographical information about actors (name, gender, date of birth, height, ethnicity).
2. **Wikidata Dataset**: Scraped data providing supplementary biographical information about people.

### Merging Process
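- We first perform an outer merge of **MovieSummaries** with **Wikidata** on the Freebase ID.
- Then, we create a unique ("univocal") key to identify each person in our work: the Freebase ID when it is usable, otherwise the IMDb ID.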

In [2]:
import pandas as pd
import numpy as np
import json
import unicodedata
from datetime import datetime

from auxiliary_functions_for_merging import *

# Folder holding the project data, given relative to the notebook's working directory.
# character.metadata.tsv and wikidata_people.csv are read from it, and the merged
# people_complete.tsv is written back into it.
DATA_PATH = "./../../Data/"

In [10]:
# Import and clean MovieSummaries dataset

# character.metadata.tsv is read without a header row, so the column names are supplied here.
people_original_cols = [
    "wikipedia_id_movie",
    "freebase_id_movie",
    "release_date",
    "character_name",
    "date_of_birth",
    "gender",
    "height",
    "freebase_id_etnicity",
    "name_actor",
    "age_actor",
    "freebase_id_character_actor",
    "freebase_id_character",
    "freebase_id_actor",
]
# Person-level columns retained for the merge; the movie/character columns are discarded.
cols = ["freebase_id_actor", "name_actor", "gender", "date_of_birth", "height", "freebase_id_etnicity"]

people_original = (
    pd.read_csv(
        DATA_PATH + 'character.metadata.tsv',
        sep='\t',
        header=None,
        names=people_original_cols,
    )
    # One row per actor: rows without an actor Freebase ID are discarded,
    # then only the first row seen for each ID is kept.
    .dropna(subset=['freebase_id_actor'])
    .drop_duplicates(subset=["freebase_id_actor"])
    .loc[:, cols]
    # assign() evaluates its keyword arguments in order, so the year is taken from the
    # raw date string before dates rejected by is_valid_date are replaced with pd.NA.
    .assign(
        year_of_birth=lambda actors: actors["date_of_birth"].apply(extract_year),
        date_of_birth=lambda actors: actors["date_of_birth"].apply(
            lambda raw_date: raw_date if is_valid_date(raw_date) else pd.NA
        ),
    )
)

In [11]:
# Import and clean Wikidata dataset

# One row per Wikidata entity: the first occurrence of each wikidataID is kept.
wikidata_people = pd.read_csv(DATA_PATH + "wikidata_people.csv").drop_duplicates(subset=["wikidataID"])

# Birth and death dates go through the same three steps:
#   1. select_date normalises the scraped value (presumably picks a single date when
#      several were scraped -- confirm in auxiliary_functions_for_merging),
#   2. the year is extracted from that normalised value,
#   3. the date itself is kept only if is_valid_date accepts it, otherwise it becomes pd.NA.
for date_col, year_col in (("birthDate", "birthYear"), ("deathDate", "deathYear")):
    selected_dates = wikidata_people[date_col].apply(select_date)
    wikidata_people[year_col] = selected_dates.apply(extract_year)
    wikidata_people[date_col] = selected_dates.apply(
        lambda raw_date: raw_date if is_valid_date(raw_date) else pd.NA
    )

wikidata_people["gender"] = wikidata_people["gender"].apply(select_gender)

In [12]:
# Outer merge MovieSummaries and Wikidata datasets by freebaseID

# how="outer" keeps people that appear in only one of the two sources.
# Column names present in both frames keep their name on the Wikidata side and
# receive the "_orig" suffix on the MovieSummaries side (e.g. gender / gender_orig).
people_complete = wikidata_people.merge(
    people_original,
    how="outer",
    left_on="freebaseID",
    right_on="freebase_id_actor",
    suffixes=('', '_orig'),
)

In [13]:
# Combine columns that provide the same information

# Maps each Wikidata column to the MovieSummaries column carrying the same attribute.
# The Wikidata value is preferred; the MovieSummaries value is used only where the
# Wikidata one is missing. "gender_orig" is the MovieSummaries gender column, suffixed
# by the merge because both sources have a "gender" column.
coalesced_columns = {
    "freebaseID": "freebase_id_actor",
    "nameSurname": "name_actor",
    "gender": "gender_orig",
    "birthDate": "date_of_birth",
    "birthYear": "year_of_birth",
}

# Vectorised fill (aligned on the shared row index) instead of a row-wise
# DataFrame.apply: same result per row, without building a Series for every row,
# and it also works when the frame is empty.
for wikidata_col, movie_summaries_col in coalesced_columns.items():
    people_complete[wikidata_col] = people_complete[wikidata_col].fillna(
        people_complete[movie_summaries_col]
    )

# The MovieSummaries copies are now redundant.
people_complete = people_complete.drop(columns=list(coalesced_columns.values()))

In [None]:
# Define univocal key to identify people. Export the resulting dataset.

# Mapping from the merged frame's column names to the names used in the exported file.
renamed_cols = {
    "imdbID": "imdb_id_actor",
    "wikidataID": "wikidata_id_actor",
    "freebaseID": "freebase_id_actor",
    "wikipediaLink": "wikipediaLink_actor",
    "nameSurname": "nameSurname_actor",
    "givenName": "givenName_actor",
    "familyName": "familyName_actor",
    "birthDate": "date_of_birth",
    "gender": "gender",
    "citizenship": "citizenship",
    "placeOfBirth": "place_of_birth",
    "nativeLanguage": "language",
    "deathDate": "date_of_death",
    "birthYear": "year_of_birth",
    "deathYear": "year_of_death",
    "height": "height",
    "freebase_id_etnicity": "freebase_id_etnicity",
}
people_complete = people_complete.rename(columns=renamed_cols)


def _choose_univocal_id(person):
    """Return the row's Freebase ID, or its IMDb ID when the Freebase ID is missing or longer than 20 characters."""
    freebase_id = person["freebase_id_actor"]
    # NOTE(review): values longer than 20 characters are presumably not a single
    # well-formed Freebase ID (e.g. several IDs in one field) -- confirm against the data.
    if pd.isna(freebase_id) or len(freebase_id) > 20:
        return person["imdb_id_actor"]
    return freebase_id


people_complete["univocal_id_actor"] = people_complete.apply(_choose_univocal_id, axis=1)

# Keep the first row for each key.
# NOTE(review): rows whose key is missing (no usable Freebase ID and no IMDb ID) compare
# equal here, so they collapse into a single row with a missing key -- confirm this is intended.
people_complete = people_complete.drop_duplicates(subset=["univocal_id_actor"])

# Additionally keep only the first row for each IMDb ID; rows without an IMDb ID are all retained.
imdb_ids = people_complete["imdb_id_actor"]
people_complete = people_complete[~imdb_ids.duplicated() | imdb_ids.isna()]

# Column order of the exported file, key first.
cols = [
    'univocal_id_actor',
    'freebase_id_actor',
    'wikidata_id_actor',
    'imdb_id_actor',
    'wikipediaLink_actor',
    'nameSurname_actor',
    'givenName_actor',
    'familyName_actor',
    'gender',
    'date_of_birth',
    'year_of_birth',
    'date_of_death',
    'year_of_death',
    'place_of_birth',
    'citizenship',
    'language',
    'height',
    'freebase_id_etnicity',
]
people_complete = people_complete[cols]

people_complete.to_csv(DATA_PATH + "people_complete.tsv", sep='\t', index=False)