# Completing actor data

## 1) Load actor data from CSV

In [1]:
import pandas as pd
from src.utils import actor_index_distribution

# Read the actor table from disk; the first CSV column is used as the index.
actor_csv_path = 'data/actor_data.csv'
actor_df = pd.read_csv(actor_csv_path, index_col=0)

# Display the loaded frame as the cell output.
actor_df

Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
denzel_washington,1954-12-28,M,1.84,/m/0x67,26.0,10.000000
matt_damon,1970-10-08,M,1.78,/m/0cnvdq1,18.0,9.912690
tom_hanks,1956-07-09,M,1.83,/m/02p4q5p,27.0,9.741555
eddie_murphy,1961-04-03,M,1.75,/m/0x67,21.0,9.415060
tom_cruise,1962-07-03,M,1.70,/m/02ctzb,18.0,9.362255
...,...,...,...,...,...,...
j._kenneth_campbell,1947-07-22,M,,,43.0,0.714754
evelyn_keyes,1916-11-20,F,1.63,,72.0,0.700949
james_dixon,,M,,,,0.700949
reilly_murphy,,,,,,0.251820


## 2) Set sample size

In [2]:
# Number of leading rows of `actor_df` to work on in the steps below.
sample_size = 5

# Take an independent copy so later edits to the sample leave `actor_df` untouched.
# Alternative sample (disabled): every actor with a known height
#   test_sample = actor_df.dropna(subset='Actor height').copy()
test_sample = actor_df.head(sample_size).copy()

## 3) Convert the ethnicity IDs to readable names

In [3]:
from src.converter import converter
from tqdm import tqdm

# Register tqdm's pandas integration so that `progress_apply` is available.
tqdm.pandas()

# Pass every value of the ethnicity column through `converter.get_ethnicity`
# (with a progress bar) and write the results back into the same column.
# NOTE(review): the cell overwrites its own input, so running it a second time
# hands the already-converted values to the converter — confirm that is harmless.
ethnicity_ids = test_sample['Actor ethnicity']
test_sample.loc[:, 'Actor ethnicity'] = ethnicity_ids.progress_apply(converter.get_ethnicity)

# Display the updated sample as the cell output.
test_sample

100%|██████████| 5/5 [00:01<00:00,  2.94it/s]


Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
denzel_washington,1954-12-28,M,1.84,African Americans,26.0,10.0
matt_damon,1970-10-08,M,1.78,Scandinavian Americans,18.0,9.91269
tom_hanks,1956-07-09,M,1.83,Portuguese Americans,27.0,9.741555
eddie_murphy,1961-04-03,M,1.75,African Americans,21.0,9.41506
tom_cruise,1962-07-03,M,1.7,White people,18.0,9.362255


## 4) Run the scraper

In [4]:
from src.webscraping import spider

# Run the project scraper on the sample.
# NOTE(review): the return value is discarded, yet the frame displayed below
# has additional columns (Gender, University, ..., Career Start) and
# re-formatted index labels, so `run_scraping` appears to modify `test_sample`
# in place — confirm against `src.webscraping`.
spider.run_scraping(test_sample)
# Display the sample after scraping as the cell output.
test_sample

100%|██████████| 5/5 [00:14<00:00,  2.80s/it]


Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index,Gender,University,Theater,Sports,Birth City,Date of Birth,Citizenship,Number of Children,Career Start
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Denzel Washington,1954-12-28,M,1.84,African Americans,26.0,10.0,Male,Fordham University BAAmerican Conservatory The...,Yes,Basketball,Mount Vernon New York US,1954-12-28,,4,1975
Matt Damon,1970-10-08,M,1.78,Scandinavian Americans,18.0,9.91269,Male,Harvard University dropped out,Yes,,Cambridge Massachusetts US,1970-10-08,,4,1987
Tom Hanks,1956-07-09,M,1.83,Portuguese Americans,27.0,9.741555,Male,Chabot College,Yes,,Concord California US,1956-07-09,"United States, Greece1",4,1977
Eddie Murphy,1961-04-03,M,1.75,African Americans,21.0,9.41506,Male,,,,New York City US,1961-04-03,,10,1976
Tom Cruise,1962-07-03,M,1.7,White people,18.0,9.362255,Male,,,Football,Syracuse New York US,1962-07-03,,3,1980


In [5]:
# Export of the scraped sample is disabled; to enable it, run:
#   test_sample.to_csv('data/actor_data_scraped.csv')

# Load scraped actor data from a CSV file; the first column is used as the index.
# NOTE(review): this path has no `data/` prefix, unlike the other CSV paths in
# this notebook — confirm the file is expected in the working directory.
scraped_csv_path = 'full_actor_data_scraped.csv'
scraped_data = pd.read_csv(scraped_csv_path, index_col=0)

# Display the loaded frame as the cell output.
scraped_data

Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index,Gender,University,Theater,Sports,Birth City,Date of Birth,Citizenship,Number of Children,Career Start
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Denzel Washington,1954-12-28,M,1.840,African Americans,26.0,10.000000,Male,Fordham University BAAmerican Conservatory The...,Yes,Basketball,Mount Vernon New York US,1954-12-28,,4.0,1975.0
Matt Damon,1970-10-08,M,1.780,Scandinavian Americans,18.0,9.912690,Male,Harvard University dropped out,Yes,,Cambridge Massachusetts US,1970-10-08,,4.0,1987.0
Tom Hanks,1956-07-09,M,1.830,Portuguese Americans,27.0,9.741555,Male,Chabot College,Yes,,Concord California US,1956-07-09,"United States, Greece1",4.0,1977.0
Eddie Murphy,1961-04-03,M,1.750,African Americans,21.0,9.415060,Male,,,,New York City US,1961-04-03,,10.0,1976.0
Tom Cruise,1962-07-03,M,1.700,White people,18.0,9.362255,Male,,,Football,Syracuse New York US,1962-07-03,,3.0,1980.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lindsay Hollister,1977-06-03,F,1.750,,30.0,1.103153,Female,Miami University,,,Columbus Ohio United States,1977-06-03,,,2001.0
Terrence Howard,1969-03-11,M,1.840,African Americans,26.0,0.915104,Male,,,Track,Chicago Illinois US,1969-03-11,,,1992.0
Madonna,1958-08-16,F,1.613,French Canadians,26.0,0.838557,Female,,,,Bay City Michigan US,1958-08-16,,6.0,1979.0
Evelyn Keyes,1916-11-20,F,1.630,,72.0,0.700949,Female,,,,Port Arthur Texas US,1916-11-20,,,1938.0
