# Actors

This notebook cleans the information about the actors. It performs the following steps:
 - remove a few entries corresponding to non-actors
 - delete the actors that have played in fewer than ten movies
 - add the ethnicities in a human-readable format

The final dataset is saved in `Data/actors.csv`

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Column names for character.metadata.tsv. They are passed explicitly to
# read_csv, so the first line of the file is read as data, not as a header.
column_names = [
    "wikipedia_movie_ID",
    "movie_ID",  # Freebase ID
    "movie_release_date",
    "character_name",
    "actor_date_of_birth",
    "actor_gender",
    "actor_height",  # in meters
    "actor_ethnicity",
    "actor_name",
    "actor_age_at_movie_release",
    "character_actor_map_ID",  # Freebase ID
    "character_ID",  # Freebase ID
    "actor_ID",  # Freebase ID
]

characters = pd.read_csv(
    "../Data/MovieSummaries/character.metadata.tsv",
    sep="\t",
    names=column_names,
)

# Remove a few non-actors: lines whose character is the generic
# "Additional Voices" credit are discarded.
is_additional_voices = characters["character_name"] == "Additional Voices"
characters = characters.drop(index=characters.index[is_additional_voices])

characters

In [None]:
# Check that the information about the actors is coherent: the information
# about a given actor is the same in every line.
actors_fields = ["actor_ID", "actor_name", "actor_gender", "actor_date_of_birth", "actor_height", "actor_ethnicity"]

for field in actors_fields:
    # dropna=False makes a missing value count as a value of its own, so an
    # actor whose field is filled on some lines and NaN on others is reported.
    # With the default (dropna=True) such an actor would pass this check and
    # later survive drop_duplicates() as two different rows.
    distinct_per_actor = characters.groupby("actor_ID")[field].nunique(dropna=False)
    inconsistent = distinct_per_actor[distinct_per_actor > 1]
    assert inconsistent.empty, (
        f"{len(inconsistent)} actor(s) have conflicting values for {field!r}, "
        f"e.g. {list(inconsistent.index[:5])}"
    )

In [None]:
# Dataframe containing only the actors: one row per distinct combination of
# the actor fields, indexed by the actor's Freebase ID.
conv = {
    "actor_ID": "ID",
    "actor_name": "name",
    "actor_gender": "gender",
    "actor_date_of_birth": "date_of_birth",
    "actor_height": "height",
    "actor_ethnicity": "ethnicity"
}

# Drop actors that have played in less than ten films.
# `characters` holds one line per credited character, so an actor may appear
# several times for the same movie. Count distinct movies, not lines, so that
# multiple roles in one film do not inflate the total.
MIN_MOVIES = 10
movies_per_actor = characters.groupby("actor_ID")["wikipedia_movie_ID"].nunique()
is_kept = movies_per_actor >= MIN_MOVIES
kept = is_kept[is_kept].keys()

# Built as one chain from `characters`, so re-running the cell always yields
# the same table (lines with a missing actor_ID never match `kept`).
actors = (
    characters.loc[characters["actor_ID"].isin(kept), actors_fields]
    .rename(columns=conv)
    .drop_duplicates()
    .set_index("ID")
)
actors

## Ethnicities

In [None]:
# Collect the distinct ethnicity IDs present in `actors`, skipping the missing
# value (NaN), and rewrite each one as "m." followed by the ID without its
# first three characters -- the same format as the keys of `map_ethnicities`.
ethnicities = [
    "m." + raw_id[3:]
    for raw_id in actors.ethnicity.unique()
    if str(raw_id) != 'nan'
]

# Lookup table from an ethnicity ID, written as "m.<id>", to a human-readable
# label. It is used below to replace the IDs stored in `actors.ethnicity`.
# NOTE(review): a few labels look misspelled or inconsistently capitalised
# (e.g. 'Czesh american', 'viennese', 'mixed') -- confirm before relying on
# the exact strings downstream.
map_ethnicities = {
    'm.0338zd': 'Malaysian Chinese',
    'm.04jq32s': 'Nepali Indian',
    'm.0ffj5g': 'Austrian American',
    'm.01gr8h': 'Sherpa people',
    'm.022fdt': 'Armenians',
    'm.033njm': 'Irish Canadian',
    'm.0747611': 'Garhwali people',
    'm.02m0kh': 'Mandinka people',
    'm.03m9my8': 'Arab Mexican',
    'm.0ffjqy': 'Croatian American',
    'm.0960kn': 'Canadians of Danish descent',
    'm.0987ctr': 'Czesh american',
    'm.0c50f': 'Métis people',
    'm.016f5d': 'Blackfoot Confederacy',
    'm.01qhm_': 'German American',
    'm.051x6yk': 'Arabs in Bulgaria',
    'm.01vr3v': 'Romanians',
    'm.04kdwcx': 'Tahitian',
    'm.0ft9bs': 'Sri Lankan Tamil diaspora',
    'm.01hm_': 'Bohemian',
    'm.042gtr': 'Spanish Americans',
    'm.05l3g_': 'Dutch people',
    'm.0bjbszh': 'South Korean',
    'm.0dryh9k': 'Indian people',
    'm.027vy0s': 'White Latin American',
    'm.06rd7': 'Sami people',
    'm.012f86': 'Ukrainians',
    'm.02p7gyv': 'Brazilian American',
    'm.03bkbh': 'Irish people',
    'm.03h11s3': 'Indian Australian',
    'm.051wcch': 'Bulgarian Canadian',
    'm.0bfrrj': 'Mohyal',
    'm.02vys3l': 'Venezuelan American',
    'm.042199j': 'Spanish immigration to Mexico',
    'm.04mvp8': 'Malayali',
    'm.05ysft4': 'Algerian',
    'm.071drf': 'Saliya',
    'm.01_5cg': 'Apache',
    'm.03cdk7b': 'British Pakistanis',
    'm.04f581': 'Norwegian American',
    'm.013xrm': 'Germans',
    'm.078ds': 'Sinhalese people',
    'm.0738n4': 'Hmong American',
    'm.09lz9zx': 'mixed',
    'm.09m6hr': 'Telugu people',
    'm.017sq0': 'Eurasian',
    'm.0268d21': 'Serbian American',
    'm.02jvpv': 'Kiwi',
    'm.063k3h': 'Scotch-Irish American',
    'm.01c034': 'Lao people',
    'm.03d19xz': 'Canadians in the United Kingdom',
    'm.04dbw3': 'Cuban American',
    'm.04gfy7': 'Indian American',
    'm.086wp0': 'Ilocano people',
    'm.01g0y_': 'Ojibwe',
    'm.01j2qv': 'Korean Americans',
    'm.02rdfpy': 'Salvadoran American',
    'm.02vsw1': 'European American',
    'm.0165md': 'Ryukyuan people',
    'm.0268xtg': 'Serbian Australian',
    'm.07gzw5': 'Kapampangan people',
    'm.07n8wy': 'Agrawal',
    'm.0d2by': 'Chinese American',
    'm.0288fw3': 'Poles in the United Kingdom',
    'm.033qt1': 'Azerbaijanis',
    'm.0cm7w1': 'Biharis',
    'm.0dq1q': 'Cree',
    'm.012fh': 'Afrikaner',
    'm.027n1m6': 'Slovak American',
    'm.02p4q5p': 'Portuguese American',
    'm.03nvq': 'Hazaras',
    'm.04y8_bm': 'Multiracial American',
    'm.05qb937': 'Venezuelans',
    'm.09743': 'Pashtun',
    'm.01xttr': 'Kannada people',
    'm.03ty8_': 'Lithuanian Jews',
    'm.062_25': 'Italian Brazilian',
    'm.09vc4s': 'English American',
    'm.0ffk5n': 'Latvian American',
    'm.06y24j': 'Iranian American',
    'm.07j80c': 'Samoan American',
    'm.01g3rx': 'Aymara people',
    'm.02gx2x': 'Javanese people',
    'm.03lnnd': 'Thai American',
    'm.0828vj': 'Telugu Brahmins',
    'm.08v2k7': 'Italian Australian',
    'm.059_w': 'Native Americans in the United States',
    'm.0ffkb4': 'Lithuanian American',
    'm.0fk1z': 'Hispanic',
    'm.0fq6zlv': 'Australians in the United Kingdom',
    'm.046j25': 'Lumbee',
    'm.04fh1b': 'Afro-Asian',
    'm.05cc9h': 'Somalis',
    'm.062szv5': 'Japanese/latina',
    'm.06w4lv': 'Indo-Guyanese',
    'm.0dj8k3': 'British Jews',
    'm.0463n9y': 'Jordanian People',
    'm.06vb7b': 'Iranian Canadian',
    'm.026zlyd': 'Africans',
    'm.09gp4': 'Belarusians',
    'm.047948f': 'Scottish Australian',
    'm.06j2v': 'Romani people',
    'm.026c9dq': 'Chilean American',
    'm.03gy1h2': 'Panamanian American',
    'm.065z7w_': 'Moroccan American',
    'm.0d8qh0': 'Indonesian American',
    'm.0dn1_0': 'Italian immigration to Mexico',
    'm.03hjx6f': 'Black Hispanic and Latino Americans',
    'm.03sx6v': 'German Canadian',
    'm.04mmhj': 'British Chinese',
    'm.0fpjs3j': 'Uruguayans',
    'm.01kg2v': 'Brahmin',
    'm.02g7sp': 'Irish people in Great Britain',
    'm.03zcwh': 'Ukrainian Canadian',
    'm.04y29': 'Muslim',
    'm.0x67': 'African American',
    'm.026kx7g': 'Colombians',
    'm.039z49': 'Dinka people',
    'm.0dv5vw': 'Pacific Islands American',
    'm.08gzsf': 'Konkani people',
    'm.0bvjpj': 'Nigerian American',
    'm.01kb9y': 'Multiracial',
    'm.032m0b': 'Chettiar',
    'm.041rx': 'Jewish people',
    'm.0ftwg': 'Buryats',
    'm.0283js_': 'Jamaican American',
    'm.04_tz7': 'Slovene Americans',
    'm.0b98sy': 'Anglo-Scot',
    'm.0dm3xpw': 'White Canadian',
    'm.0fqp6zk': 'Bengali Hindus',
    'm.04pnf': 'Latino',
    'm.01tyl3': 'Vietnamese American',
    'm.025x6k1': 'Serbs of the Republic of Macedonia',
    'm.04hqxn': 'Transylvanian Saxons',
    'm.09zyn5': 'Norwegians',
    'm.0432mrk': 'Indigenous peoples of the Americas',
    'm.08c25t': 'Wolof people',
    'm.05vhv7': 'Mudaliar',
    'm.09v5bdn': 'Puerto Ricans',
    'm.0318mh': 'Finns',
    'm.05g0f1': 'Akan people',
    'm.013z8m': 'Manchu people',
    'm.025tvhm': 'Egyptians',
    'm.02ctzb': 'White people',
    'm.059v8': 'Nez Perce people',
    'm.0h1nk0k': 'Latin American',
    'm.01srl7': 'Ossetians',
    'm.02q206y': 'Iranians in the United Kingdom',
    'm.09g34_': 'Malagasy people',
    'm.012c1l': 'French Canadian',
    'm.021pd': 'Choctaw',
    'm.04jtjvt': 'Irish',
    'm.0g48m4': 'Lebanese American',
    'm.013b6_': 'Ashkenazi Jews',
    'm.01hphz': 'Non-resident Indian and person of Indian origin',
    'm.02ry8mk': 'Sierra Leone Creole people',
    'm.092h2qt': 'Dutch Jewish',
    'm.09fqz7': 'Croatian Australian',
    'm.01p7s6': 'Poles',
    'm.01vsch': 'Aromanians',
    'm.03bx0k4': 'Hong Kong people',
    'm.03vghh': 'Portuguese people',
    'm.03yk6g': 'Afro-Cuban',
    'm.0466nw8': 'Lebanese people',
    'm.074w_m': 'Tulu people',
    'm.0c3wsgg': 'Moroccans',
    'm.027936c': 'Gin people',
    'm.03gskx0': 'Australian American',
    'm.03pqwy': 'Finnish American',
    'm.06gbnc': 'Welsh people',
    'm.04k02l': 'Marwari people',
    'm.05bzpzx': 'Czech Australians',
    'm.01trsl': 'First Nations',
    'm.03hf_6z': 'Gibraltarians',
    'm.0665pp': 'Rohilla',
    'm.01d7kx': 'Swedish-speaking population of Finland',
    'm.029q52': 'Georgians',
    'm.04ggbzy': 'White Africans of European ancestry',
    'm.065b6q': 'Swedish American',
    'm.03fk0c': 'Icelanders',
    'm.0bfjm7': 'Welsh American',
    'm.04lfc70': 'Armenians in Italy',
    'm.029f2r': 'Sindhis',
    'm.03fvrb': 'Khatri',
    'm.0cx3p': 'Berbers',
    'm.01xhh5': 'Koreans',
    'm.0dllcfn': 'Afghans in India',
    'm.0462jl6': 'Pathani',
    'm.0dtkkb': 'British African-Caribbean people',
    'm.04q7gbh': 'Americans',
    'm.09tqq8q': 'Ghanaian people',
    'm.03x_lpj': 'Sudanese Arabs',
    'm.05ztd1': 'Dogra',
    'm.0f0gt_': 'Asian people',
    'm.078vc': 'Sikh',
    'm.07hwkr': 'White American',
    'm.08xbxs': 'Pakistani Canadian',
    'm.02y_9mh': 'Kanyakubja Brahmins',
    'm.03r_k': 'Inuit',
    'm.047q05d': 'Lebanese people in the United Kingdom',
    'm.095mw2': 'Soviet people',
    'm.0cnvdq1': 'Scandinavian American',
    'm.02vkw95': 'Ecuadorian American',
    'm.02wcbj_': 'Bolivian American',
    'm.02wz7j': 'Oneida people',
    'm.04dzwby': 'French Chilean',
    'm.0fk55': 'Cheyenne',
    'm.01sq7s': 'Mohawk people',
    'm.01ywdy': 'Estonian',
    'm.04zjjt': 'History of the Jews in India',
    'm.0c29q8': 'Irani',
    'm.0ftlzz': 'Luxembourg American',
    'm.0h8mzsl': 'Native American',
    'm.02cm28': 'English Canadian',
    'm.067lrj': 'Greek Canadians',
    'm.0gf5k1': 'Italians in the United Kingdom',
    'm.038723': 'Greek American',
    'm.0j4w_': 'Croats',
    'm.08hpk0': 'Hungarian American',
    'm.02vkd28': 'Haitian American',
    'm.05sycg': 'British Asian',
    'm.0640_7q': 'Moroccan Jews',
    'm.065577s': 'Italian',
    'm.0bbz66j': 'Australians',
    'm.0g5rkt4': 'Corsicans',
    'm.04kbvpz': 'Serbian',
    'm.05748': 'Māori people',
    'm.04sfz4s': 'viennese',
    'm.07wsyr': 'Thai people',
    'm.03ndvw': 'Vietnamese people',
    'm.0278pqj': 'British Nigerian',
    'm.0bdynxs': 'Sinhala- Tamil',
    'm.0br_9j': 'Romani people in Spain',
    'm.02qv_h_': 'Syrian American',
    'm.06dy2k': 'Serbs of Bosnia and Herzegovina',
    'm.09cqth': 'Romanichal',
    'm.09r2kh': 'Peoples of the Caucasus',
    'm.0301y_': 'Yoruba people',
    'm.034s7b': 'Anglo-Indian',
    'm.026cybk': 'Serbian Canadians',
    'm.06fczy': 'Black Irish',
    'm.0fk3s': 'Sioux',
    'm.05c60ml': 'Chileans',
    'm.0268_k': 'Danes',
    'm.0b0gzf': 'Dravida',
    'm.03cl2pz': 'Serbs in the United Kingdom',
    'm.03x_fq7': 'Aboriginal Australians',
    'm.0912ll': 'Dominican American',
    'm.0b3zsn': 'Greeks in the United Kingdom'
}

def _to_readable_ethnicity(eth):
    """Convert one value of the ethnicity column to its human-readable label.

    Parameters
    ----------
    eth : object
        Cell value: a raw ethnicity ID, an already converted label, or a
        missing value (NaN / None).

    Returns
    -------
    str or None
        The label found in ``map_ethnicities``; None when the value is missing
        or the ID has no entry in the table; the value itself when it is not
        a raw ID.
    """
    if not isinstance(eth, str):
        return None
    # Assumes raw IDs look like "/m/0x67": the lookup key is "m." followed by
    # the ID without its first three characters -- TODO confirm on the data.
    if not eth.startswith("/m/"):
        # Already converted by a previous run of this cell. Looking the label
        # up again would miss and replace it with None, so re-running the cell
        # used to wipe the whole column.
        return eth
    return map_ethnicities.get("m." + eth[3:])


actors["ethnicity"] = actors["ethnicity"].apply(_to_readable_ethnicity)
actors["ethnicity"].unique()

In [None]:
# Number of rows of `actors` per ethnicity label, most frequent first
# (missing values are not counted).
actors["ethnicity"].value_counts()

## Age, date of birth

## Save the final dataset

In [None]:
# Write the cleaned table to disk (the "ID" index becomes the first column of
# the CSV), then display it.
output_path = '../Data/actors.csv'
actors.to_csv(output_path)

actors