# Completing actor data

## 1) Load actor data from csv

In [1]:
import pandas as pd
from src.utils import actor_index_distribution

actor_df = pd.read_csv('data/actor_data.csv', index_col=0)
actor_df

Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
denzel_washington,1954-12-28,M,1.84,/m/0x67,26.0,10.000000
matt_damon,1970-10-08,M,1.78,/m/0cnvdq1,18.0,9.912690
tom_hanks,1956-07-09,M,1.83,/m/02p4q5p,27.0,9.741555
eddie_murphy,1961-04-03,M,1.75,/m/0x67,21.0,9.415060
tom_cruise,1962-07-03,M,1.70,/m/02ctzb,18.0,9.362255
...,...,...,...,...,...,...
j._kenneth_campbell,1947-07-22,M,,,43.0,0.714754
evelyn_keyes,1916-11-20,F,1.63,,72.0,0.700949
james_dixon,,M,,,,0.700949
reilly_murphy,,,,,,0.251820


## 2) Set sample size

In [2]:
#test_sample = actor_df.dropna(subset='Actor height').copy()
test_sample = actor_df.iloc[:5].copy()

## 3) Convert the ethnicity IDs

In [3]:
from src.converter import converter
from tqdm import tqdm

tqdm.pandas() # for progress_apply

test_sample.loc[:, 'Actor ethnicity'] = test_sample['Actor ethnicity'].progress_apply(converter.get_ethnicity)
test_sample

100%|██████████| 5/5 [00:01<00:00,  2.94it/s]


Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
denzel_washington,1954-12-28,M,1.84,African Americans,26.0,10.0
matt_damon,1970-10-08,M,1.78,Scandinavian Americans,18.0,9.91269
tom_hanks,1956-07-09,M,1.83,Portuguese Americans,27.0,9.741555
eddie_murphy,1961-04-03,M,1.75,African Americans,21.0,9.41506
tom_cruise,1962-07-03,M,1.7,White people,18.0,9.362255


## 4) Run the scraper

In [4]:
from src.webscraping import spider

spider.run_scraping(test_sample)
test_sample

100%|██████████| 5/5 [00:14<00:00,  2.80s/it]


Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index,Gender,University,Theater,Sports,Birth City,Date of Birth,Citizenship,Number of Children,Career Start
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Denzel Washington,1954-12-28,M,1.84,African Americans,26.0,10.0,Male,Fordham University BAAmerican Conservatory The...,Yes,Basketball,Mount Vernon New York US,1954-12-28,,4,1975
Matt Damon,1970-10-08,M,1.78,Scandinavian Americans,18.0,9.91269,Male,Harvard University dropped out,Yes,,Cambridge Massachusetts US,1970-10-08,,4,1987
Tom Hanks,1956-07-09,M,1.83,Portuguese Americans,27.0,9.741555,Male,Chabot College,Yes,,Concord California US,1956-07-09,"United States, Greece1",4,1977
Eddie Murphy,1961-04-03,M,1.75,African Americans,21.0,9.41506,Male,,,,New York City US,1961-04-03,,10,1976
Tom Cruise,1962-07-03,M,1.7,White people,18.0,9.362255,Male,,,Football,Syracuse New York US,1962-07-03,,3,1980


In [5]:
#test_sample.to_csv('data/actor_data_scraped.csv')

scraped_data = pd.read_csv('full_actor_data_scraped.csv', index_col=0)
scraped_data

Unnamed: 0_level_0,Actor date of birth,Actor gender,Actor height,Actor ethnicity,Actor age at movie release,Actor Score Index,Gender,University,Theater,Sports,Birth City,Date of Birth,Citizenship,Number of Children,Career Start
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Denzel Washington,1954-12-28,M,1.840,African Americans,26.0,10.000000,Male,Fordham University BAAmerican Conservatory The...,Yes,Basketball,Mount Vernon New York US,1954-12-28,,4.0,1975.0
Matt Damon,1970-10-08,M,1.780,Scandinavian Americans,18.0,9.912690,Male,Harvard University dropped out,Yes,,Cambridge Massachusetts US,1970-10-08,,4.0,1987.0
Tom Hanks,1956-07-09,M,1.830,Portuguese Americans,27.0,9.741555,Male,Chabot College,Yes,,Concord California US,1956-07-09,"United States, Greece1",4.0,1977.0
Eddie Murphy,1961-04-03,M,1.750,African Americans,21.0,9.415060,Male,,,,New York City US,1961-04-03,,10.0,1976.0
Tom Cruise,1962-07-03,M,1.700,White people,18.0,9.362255,Male,,,Football,Syracuse New York US,1962-07-03,,3.0,1980.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lindsay Hollister,1977-06-03,F,1.750,,30.0,1.103153,Female,Miami University,,,Columbus Ohio United States,1977-06-03,,,2001.0
Terrence Howard,1969-03-11,M,1.840,African Americans,26.0,0.915104,Male,,,Track,Chicago Illinois US,1969-03-11,,,1992.0
Madonna,1958-08-16,F,1.613,French Canadians,26.0,0.838557,Female,,,,Bay City Michigan US,1958-08-16,,6.0,1979.0
Evelyn Keyes,1916-11-20,F,1.630,,72.0,0.700949,Female,,,,Port Arthur Texas US,1916-11-20,,,1938.0


In [6]:
show_data = pd.read_csv('webscraping/actor_data_scraped.csv')

In [7]:
pd.read_excel('data/2024_QS_World_University_Rankings.xlsx').to_csv('data/2024_QS_World_University_Rankings_csv')

In [None]:
rankings = pd.read_csv('data/2024_QS_World_University_Rankings_csv', header=0)
rankings

Unnamed: 0.1,Unnamed: 0,2024 QS World University Rankings,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,...,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28
0,0,2024,2023,Institution Name,Location,,Classification,,,,...,,International Students,,International Research Network,,Employment Outcomes,,Sustainability,,Overall
1,1,RANK,RANK,,,Location,SIZE,FOCUS,RES.,AGE,...,RANK,SCORE,RANK,SCORE,RANK,SCORE,RANK,Score,Rank,SCORE
2,2,rank display,rank display2,institution,location code,location,size,focus,research,age band,...,ifr rank,isr score,isr rank,irn score,irn rank,ger score,ger rank,SUS SCORE,SUS RANK,Overall Score
3,3,1,1,Massachusetts Institute of Technology (MIT),US,United States,M,CO,VH,5,...,56,88.2,128,94.3,58,100,4,95.2,51,100
4,4,2,2,University of Cambridge,UK,United Kingdom,L,FC,VH,5,...,64,95.8,85,99.9,7,100,6,97.3,33=,99.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1495,1401+,1401+,University of Oradea,RO,Romania,L,FC,MD,4,...,701+,7,701+,1.8,701+,5.9,701+,2.2,701+,-
1496,1496,1401+,,University of Stavanger,NO,Norway,,CO,,2,...,,,,26.3,489,,,,,-
1497,1497,1401+,1201-1400,"University Politehnica of Timisoara, UPT",RO,Romania,M,FO,VH,5,...,701+,2.9,701+,1.3,701+,8.5,701+,1,701+,-
1498,1498,1401+,1201-1400,Western Washington University,US,United States,L,CO,HI,5,...,676,1.4,701+,1,701+,7.3,701+,,,-


In [None]:
from rapidfuzz import process

In [27]:
# Match the names found from webscraping to the names as they are registered in the QS Rankings

def match_universities(uni_name, qs_uni):
    if not isinstance(uni_name, str):  # Check if the value is NaN
        return None
    result = process.extractOne(uni_name, qs_uni)
    if result:
        match, score, _ = result  # Unpack match, score, and index
        return match if score > 88 else None  # Return match only if confidence > 80
    return None

qs_uni_names = rankings['Institution Name'].to_list()
show_data['Matched Uni'] = show_data['University'].apply(lambda x: match_universities(x, qs_uni_names))

In [29]:
show_data['University'] = show_data.apply(
    lambda row: row['Matched Uni'] if pd.notna(row['Matched Uni']) else row['University'], axis = 1
)

show_data['found_match'] = show_data.apply(lambda row: True if pd.notna(row['Matched Uni']) else False, axis=1)

show_data.drop(columns=['Matched Uni'], inplace=True)

In [34]:
# Replace all values containing 'State University of New York at Purchase' with 'Purchase College SUNY'
show_data.loc[
    show_data['University'].str.contains('Purchase', case=False, na=False),
    'University'
] = 'Purchase College SUNY'


In [49]:
# Replace all values containing 'High School' with 'None'
show_data.loc[
    show_data['University'].str.contains('high school', case=False, na=False),
    'University'
] = None

# Replace Academy Award by None

show_data.loc[
    show_data['University'].str.contains('Academy Award', case=False, na=False),
    'University'
] = None

In [61]:
show_data.loc[
    show_data['University'].str.contains('drama', case=False, na=False),
    'University'
] = 'Specialised Drama School'

show_data.loc[
    show_data['University'].str.contains('dramatic', case=False, na=False),
    'University'
] = 'Specialised Drama School'

show_data.loc[
    show_data['University'].str.contains('theatre', case=False, na=False),
    'University'
] = 'Specialised Drama School'

show_data.loc[
    show_data['University'].str.contains('theater', case=False, na=False),
    'University'
] = 'Specialised Drama School'

show_data.loc[
    show_data['University'].str.contains('acting', case=False, na=False),
    'University'
] = 'Specialised Acting School'

show_data.loc[
    show_data['University'].str.contains('film', case=False, na=False),
    'University'
] = 'Specialised Acting School'

show_data.loc[
    show_data['University'].str.contains('performing arts', case=False, na=False),
    'University'
] = 'Specialised Drama School'

In [63]:
show_data.loc[
    show_data['University'].str.contains('oxford', case=False, na=False),
    'University'
] = 'University of Oxford'

show_data.loc[
    show_data['University'].str.contains('college cambridge', case=False, na=False),
    'University'
] = 'University of Cambridge'

show_data.loc[
    show_data['University'].str.contains('trinity hall', case=False, na=False),
    'University'
] = 'University of Cambridge'

show_data.loc[
    show_data['University'].str.contains('University of California Los Angeles', case=False, na=False),
    'University'
] = "University of California, Los Angeles (UCLA)"

show_data.loc[
    show_data['University'].str.contains('UCLA', case=False, na=False),
    'University'
] = "University of California, Los Angeles (UCLA)"

show_data.loc[
    show_data['University'].str.contains('University of California Santa Barbara', case=False, na=False),
    'University'
] = "University of California, Santa Barbara (UCSB)"

In [62]:
show_data.loc[
    show_data['University'].str.contains('University College London', case=False, na=False),
    'University'
] = 'UCL'

show_data.loc[
    show_data['University'].str.contains('Kings College London', case=False, na=False),
    'University'
] = "King's College London"

show_data.loc[
    show_data['University'].str.contains('University of Toronto', case=False, na=False),
    'University'
] = 'University of Toronto'

show_data.loc[
    show_data['University'].str.contains('University of Georgia', case=False, na=False),
    'University'
] = 'The University of Georgia'

show_data.loc[
    show_data['University'].str.contains('Harvard', case=False, na=False),
    'University'
] = 'University of Harvard'

show_data.loc[
    show_data['University'].str.contains('University of Missouri', case=False, na=False),
    'University'
] = 'University of Missouri'

show_data.loc[
    show_data['University'].str.contains('University of North Carolina', case=False, na=False),
    'University'
] = 'University of North Carolina'

show_data.loc[
    show_data['University'].str.contains('San Diego State', case=False, na=False),
    'University'
] = 'San Diego State University'

show_data.loc[
    show_data['University'].str.contains('University of Texas', case=False, na=False),
    'University'
] = 'University of Texas'

show_data.loc[
    show_data['University'].str.contains('University of Alabama', case=False, na=False),
    'University'
] = 'University of Alabama'

show_data.loc[
    show_data['University'].str.contains('University of Michigan', case=False, na=False),
    'University'
] = 'University of Michigan-Ann Arbor'

show_data.loc[
    show_data['University'].str.contains('Texas AM', case=False, na=False),
    'University'
] = 'Texas A&M University'

show_data.loc[
    show_data['University'].str.contains('Fordham', case=False, na=False),
    'University'
] = 'Fordham University'

show_data.loc[
    show_data['University'].str.contains('Rutgers', case=False, na=False),
    'University'
] = 'Rutgers University–New Brunswick'

In [57]:
# Handle Music Schools

show_data.loc[
    show_data['University'].str.contains('Music', case=False, na=False),
    'University'
] = 'Specialised Music School'

show_data.loc[
    show_data['University'].str.contains('Conservatory', case=False, na=False),
    'University'
] = 'Specialised Music School'

show_data.loc[
    show_data['University'].str.contains('Conservatoire', case=False, na=False),
    'University'
] = 'Specialised Music School'

show_data.loc[
    show_data['University'].str.contains('Juilliard', case=False, na=False),
    'University'
] = 'Specialised Music School'

In [59]:
# Ballet schools

show_data.loc[
    show_data['University'].str.contains('ballet', case=False, na=False),
    'University'
] = 'Specialised Dance School'

show_data.loc[
    show_data['University'].str.contains('dance', case=False, na=False),
    'University'
] = 'Specialised Dance School'

In [64]:
# Handle Arts students

show_data.loc[
    show_data['University'].str.contains('art', case=False, na=False),
    'University'
] = 'Specialised Arts School'

show_data.loc[
    show_data['University'].str.contains('arts', case=False, na=False),
    'University'
] = 'Specialised Arts School'

In [65]:
# Match the names found from webscraping to the names as they are registered in the QS Rankings

def match_universities(uni_name, qs_uni):
    if not isinstance(uni_name, str):  # Check if the value is NaN
        return None
    result = process.extractOne(uni_name, qs_uni)
    if result:
        match, score, _ = result  # Unpack match, score, and index
        return match if score > 88 else None  # Return match only if confidence > 80
    return None

qs_uni_names = rankings['Institution Name'].to_list()
show_data['Matched Uni'] = show_data['University'].apply(lambda x: match_universities(x, qs_uni_names))

In [66]:
show_data['University'] = show_data.apply(
    lambda row: row['Matched Uni'] if pd.notna(row['Matched Uni']) else row['University'], axis = 1
)

show_data['found_match'] = show_data.apply(lambda row: True if pd.notna(row['Matched Uni']) else False, axis=1)

show_data.drop(columns=['Matched Uni'], inplace=True)

In [67]:
show_data.loc[
    show_data['University'].str.contains('vassar', case=False, na=False),
    'University'
] = 'Vassar College'

In [None]:
# Update 'University' where conditions are met
show_data.loc[
    (show_data['found_match'] == False) &  # Check if 'found_match' is False
    (~show_data['University'].str.lower().str.startswith('specialised', na=False)) &  # Does NOT start with 'specialized'
    (~show_data['University'].str.lower().str.startswith('vassar', na=False)),  # Does NOT start with 'vassar'
    'University'
] = 'sub 1500 school'
