# Ali's Part

In [1]:
import requests
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import time

print("All libraries imported successfully!")

All libraries imported successfully!


In [11]:
# Load the TSV file
file_path = 'character_metadata_forWS.tsv'  # Replace with the correct path if necessary
data = pd.read_csv(file_path, sep='\t')

# Extract the 'ActorName' column
actor_names = data['ActorName']

# Remove duplicates and handle missing values (optional)
actor_names = actor_names.dropna().unique()



  data = pd.read_csv(file_path, sep='\t')


array(['Wanda De Jesus', 'Natasha Henstridge', 'Ice Cube', ...,
       'Susan Byrkett', 'Hal Cleaveland', 'Roberta Paterson'],
      dtype=object)

In [14]:
# Convert the NumPy array to a DataFrame
actors_df = pd.DataFrame(actor_names, columns=['actor_name'])

# Display the first few rows
actors_df.head()

Unnamed: 0,actor_name
0,Wanda De Jesus
1,Natasha Henstridge
2,Ice Cube
3,Jason Statham
4,Clea DuVall


In [15]:
# Initialize the 'wikidata_id' column with None
actors_df['wikidata_id'] = None

# Display the updated DataFrame
actors_df.head()

Unnamed: 0,actor_name,wikidata_id
0,Wanda De Jesus,
1,Natasha Henstridge,
2,Ice Cube,
3,Jason Statham,
4,Clea DuVall,


In [16]:
import requests
from tqdm.notebook import tqdm
import time

In [17]:
def get_wikidata_id(actor_name):
    """
    Fetches the Wikidata ID (Q number) for a given actor's name.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': actor_name,
        'limit': 1,
        'type': 'item'
    }
    headers = {
        'User-Agent': 'ActorDataProject/1.0 (ali.benchekroun@epfl.ch)'
    }
    try:
        response = requests.get(url, params=params, headers=headers)
        data = response.json()
        if data['search']:
            return data['search'][0]['id']
        else:
            return None
    except Exception as e:
        print(f"Error fetching ID for {actor_name}: {e}")
        return None

In [21]:
# Map Actor Names to Wikidata IDs
# Total number of actors
total_actors = actors_df.shape[0]

# Initialize a progress bar
progress_bar = tqdm(total=total_actors, desc="Mapping Actors to Wikidata IDs")

for idx, row in actors_df.iterrows():
    actor_name = row['actor_name']
    wikidata_id = get_wikidata_id(actor_name)
    actors_df.at[idx, 'wikidata_id'] = wikidata_id
    progress_bar.update(1)

progress_bar.close()

Mapping Actors to Wikidata IDs:   0%|          | 0/134078 [00:00<?, ?it/s]

Error fetching ID for James Brill: ('Connection aborted.', OSError(65, 'No route to host'))
Error fetching ID for Geórgia Reck: HTTPSConnectionPool(host='www.wikidata.org', port=443): Max retries exceeded with url: /w/api.php?action=wbsearchentities&format=json&language=en&search=Ge%C3%B3rgia+Reck&limit=1&type=item (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x177f387a0>, 'Connection to www.wikidata.org timed out. (connect timeout=None)'))
Error fetching ID for Jo Rowbottom: HTTPSConnectionPool(host='www.wikidata.org', port=443): Max retries exceeded with url: /w/api.php?action=wbsearchentities&format=json&language=en&search=Jo+Rowbottom&limit=1&type=item (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x177f85f10>, 'Connection to www.wikidata.org timed out. (connect timeout=None)'))
Error fetching ID for Jean Négroni: HTTPSConnectionPool(host='www.wikidata.org', port=443): Max retries exceeded with url: /w/api.php?action=wb

In [22]:
# Count the number of successfully mapped actors
mapped_count = actors_df['wikidata_id'].notnull().sum()
total_count = actors_df.shape[0]
print(f"Successfully mapped: {mapped_count} out of {total_count} actors.")

# Identify actors without a Wikidata ID
missing_ids_df = actors_df[actors_df['wikidata_id'].isnull()]
missing_count = missing_ids_df.shape[0]
print(f"Actors without a Wikidata ID: {missing_count}")

Successfully mapped: 94882 out of 134078 actors.
Actors without a Wikidata ID: 39196


In [23]:
# save everything in a csv file
actors_df.to_csv('actors_wikiID.csv', index=False)

In [36]:
len(actors_df)

134078

In [24]:
# Focus on Education, Occupation, Religious belief, Place of birth, Languages Spoken
def fetch_actor_attributes(wikidata_ids):
    """
    Fetches selected attributes for a list of Wikidata IDs using SPARQL.
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    
    # Prepare the VALUES clause with the list of Wikidata IDs
    values_clause = " ".join([f"wd:{wd_id}" for wd_id in wikidata_ids])
    
    # SPARQL Query
    query = f"""
    SELECT ?actor ?actorLabel ?educationLabel ?occupationLabel ?religiousBeliefLabel ?placeOfBirthLabel ?languagesSpokenLabel WHERE {{
      VALUES ?actor {{ {values_clause} }}
      
      OPTIONAL {{ ?actor wdt:P69 ?education. }}
      OPTIONAL {{ ?actor wdt:P106 ?occupation. }}
      OPTIONAL {{ ?actor wdt:P140 ?religiousBelief. }}
      OPTIONAL {{ ?actor wdt:P19 ?placeOfBirth. }}
      OPTIONAL {{ ?actor wdt:P1412 ?languagesSpoken. }}
      
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    try:
        results = sparql.query().convert()
        return results
    except Exception as e:
        print(f"Error executing SPARQL query: {e}")
        return None

In [27]:
# Define batch size
BATCH_SIZE = 100

# Prepare the list of Wikidata IDs
wikidata_ids = actors_df['wikidata_id'].dropna().tolist()  # Ensure no NaN values

# Initialize an empty list to store results
all_results = []

# Process in batches
for i in tqdm(range(0, len(wikidata_ids), BATCH_SIZE), desc="Fetching Actor Attributes"):
    batch_ids = wikidata_ids[i:i+BATCH_SIZE]
    results = fetch_actor_attributes(batch_ids)
    
    if results:
        for result in results["results"]["bindings"]:
            actor_data = {
                'wikidata_id': result['actor']['value'].split('/')[-1],
                'actor_name': result['actorLabel']['value'],
                'education': result.get('educationLabel', {}).get('value', None),
                'occupation': result.get('occupationLabel', {}).get('value', None),
                'religious_belief': result.get('religiousBeliefLabel', {}).get('value', None),
                'place_of_birth': result.get('placeOfBirthLabel', {}).get('value', None),
                'languages_spoken': result.get('languagesSpokenLabel', {}).get('value', None)
            }
            all_results.append(actor_data)
    
    # Sleep to respect rate limits
    time.sleep(1)  # Adjust as needed based on performance and API responsiveness

Fetching Actor Attributes:   0%|          | 0/949 [00:00<?, ?it/s]

Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error 429: Too Many Requests
Error executing SPARQL query: HTTP Error

In [38]:
len(wikidata_ids)

94882

In [33]:
# Create a DataFrame from the results
data_df = pd.DataFrame(all_results)

# Display the first few entries
print(len(data_df))
    
data_df.head(30)

349293


Unnamed: 0,wikidata_id,actor_name,education,occupation,religious_belief,place_of_birth,languages_spoken
0,Q11378,Shiva,,,Shaivism,,
1,Q36105,Charles Bronson,,screenwriter,,Ehrenfeld,English
2,Q36105,Charles Bronson,,actor,,Ehrenfeld,English
3,Q36105,Charles Bronson,,character actor,,Ehrenfeld,English
4,Q36105,Charles Bronson,,television actor,,Ehrenfeld,English
5,Q36105,Charles Bronson,,film actor,,Ehrenfeld,English
6,Q36105,Charles Bronson,,screenwriter,,Ehrenfeld,Russian
7,Q36105,Charles Bronson,,actor,,Ehrenfeld,Russian
8,Q36105,Charles Bronson,,character actor,,Ehrenfeld,Russian
9,Q36105,Charles Bronson,,television actor,,Ehrenfeld,Russian


In [35]:
# Group by 'wikidata_id' and aggregate values into a single row per ID
aggregated_df = data_df.groupby('wikidata_id').agg({
    'actor_name': 'first',  # Take the first occurrence of actor_name
    'education': lambda x: ', '.join(filter(None, x.unique())),  # Combine unique non-null values
    'occupation': lambda x: ', '.join(filter(None, x.unique())),
    'religious_belief': lambda x: ', '.join(filter(None, x.unique())),
    'place_of_birth': lambda x: ', '.join(filter(None, x.unique())),
    'languages_spoken': lambda x: ', '.join(filter(None, x.unique()))
}).reset_index()

# Display the resulting DataFrame
len(aggregated_df)


89706

In [52]:
# Replace blank strings and whitespaces with NaN
aggregated_df.replace(r'^\s*$', None, regex=True, inplace=True)

# Calculate the percentage of missing values for each column
missing_percentages = aggregated_df.isnull().sum() / len(aggregated_df) * 100

# Print the percentages with two decimal places
print(missing_percentages.round(2))

wikidata_id          0.00
actor_name           0.00
education           65.38
occupation           5.61
religious_belief    96.88
place_of_birth      26.76
languages_spoken    40.06
dtype: float64


In [47]:
print(f"Percentage of lost actors during process : {(1 - len(aggregated_df)/len(actors_df)) * 100}%")
print(f"So the percentage of actors we have informtions for is : {len(aggregated_df)/len(actors_df) * 100}%")

Percentage of lost actors during process : 33.09416906576769%
So the percentage of actors we have informtions for is : 66.90583093423231%


In [53]:
aggregated_df.head(5)

Unnamed: 0,wikidata_id,actor_name,education,occupation,religious_belief,place_of_birth,languages_spoken
0,Q100001260,Henk Rigters,,"actor, stage actor, director, television actor...",,Den Helder,
1,Q100005,Tadeusz Borowski,University of Warsaw,"writer, poet, journalist, opinion journalist, ...",,Zhytomyr,Polish
2,Q100028,Jörg Hube,,"actor, stage actor, film director, television ...",,Neuruppin,German
3,Q1000433,Bud,,,,,
4,Q1000435,Peter Sweeney,,association football player,,Glasgow,English


In [57]:
# Remove duplicates from actors_df and aggregated_df by actor_name
actors_df = actors_df.drop_duplicates(subset=['actor_name'])
aggregated_df = aggregated_df.drop_duplicates(subset=['actor_name'])

# Print lengths after removing duplicates
print(f"Unique actors in actors_df: {len(actors_df)}")
print(f"Unique actors in aggregated_df: {len(aggregated_df)}")

Unique actors in actors_df: 134078
Unique actors in aggregated_df: 89620


In [58]:
# Find actor names in actors_df that are not in aggregated_df
missing_actors = actors_df[~actors_df['actor_name'].isin(aggregated_df['actor_name'])]

# Create a DataFrame with the missing actors and set all other columns to None
missing_actors_df = missing_actors[['actor_name']].copy()
missing_actors_df['wikidata_id'] = None
missing_actors_df['education'] = None
missing_actors_df['occupation'] = None
missing_actors_df['religious_belief'] = None
missing_actors_df['place_of_birth'] = None
missing_actors_df['languages_spoken'] = None

# Append the missing actors to aggregated_df
final_df = pd.concat([aggregated_df, missing_actors_df], ignore_index=True)

# Verify the result
print(f"Original actors_df length: {len(actors_df)}")
print(f"Original aggregated_df length: {len(aggregated_df)}")
print(f"Missing actors added: {len(missing_actors_df)}")
print(f"Final DataFrame length: {len(final_df)}")

Original actors_df length: 134078
Original aggregated_df length: 89620
Missing actors added: 58796
Final DataFrame length: 148416


In [60]:
# Filter final_df to keep only rows where actor_name exists in actors_df
final_filtered_df = final_df[final_df['actor_name'].isin(actors_df['actor_name'])].reset_index(drop=True)

# Verify the length of the filtered DataFrame
print(f"Filtered final DataFrame length: {len(final_filtered_df)}")

Filtered final DataFrame length: 134078


In [61]:
# Calculate the percentage of missing values for each column
missing_percentages0 = final_filtered_df.isnull().sum() / len(final_filtered_df) * 100

# Print the percentages with two decimal places
print(missing_percentages0.round(2))

wikidata_id         43.85
actor_name           0.00
education           80.63
occupation          46.25
religious_belief    98.24
place_of_birth      56.85
languages_spoken    64.45
dtype: float64


### Let's try to retrieve the biography of each actor from wikipedia

In [63]:
import wikipediaapi
from tqdm.notebook import tqdm

In [65]:
# Initialize Wikipedia API
headers = {
    'User-Agent': 'AppliedDataAnalysis_Project_EPFL/1.0 (ali.benchekroun@epfl.ch)'
}
wiki_wiki = wikipediaapi.Wikipedia('en', headers=headers)

def fetch_biography(actor_name):
    """
    Fetches the biography of an actor from Wikipedia.
    """
    try:
        page = wiki_wiki.page(actor_name)
        if page.exists():
            return page.summary  # Returns the summary (biography) of the page
        else:
            return None  # Actor not found on Wikipedia
    except Exception as e:
        print(f"Error fetching biography for {actor_name}: {e}")
        return None

In [67]:
actorsB_df = actors_df

# Fetch biographies for each actor and update the column
for idx, row in tqdm(actorsB_df.iterrows(), total=len(actorsB_df), desc="Fetching Biographies"):
    actor_name = row['actor_name']
    actorsB_df.at[idx, 'biography'] = fetch_biography(actor_name)

Fetching Biographies:   0%|          | 0/134078 [00:00<?, ?it/s]

Error fetching biography for Indrajith: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for Sonia Todd: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for Douglas Lambert: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for Annie Sorrell: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for Alan White: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for María Miguel: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for Marita Nordberg: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)
Error fetching biography for Craig Robinson: HTTPSConne

In [69]:
# Calculate the percentage of missing values for each column
missing_percentage1 = actorsB_df.isnull().sum() / len(actorsB_df) * 100

# Print the percentages with two decimal places
print(missing_percentage1.round(2))

actor_name     0.00
biography     50.82
dtype: float64


In [70]:
actorsB_df.to_csv('actors_with_biographies.csv', index=False)
print("Data saved to 'actors_with_biographies.csv'.")

Data saved to 'actors_with_biographies.csv'.


In [71]:
actorsB_df.head(10)

Unnamed: 0,actor_name,biography
0,Wanda De Jesus,"Wanda De Jesus (born August 26, 1958) is an Am..."
1,Natasha Henstridge,"Natasha Tonya Henstridge (born August 15, 1974..."
2,Ice Cube,"O'Shea Jackson Sr. (born June 15, 1969), known..."
3,Jason Statham,Jason Statham ( STAY-thəm; born 26 July 1967) ...
4,Clea DuVall,Clea Helen D'Etienne DuVall (born September 25...
5,Pam Grier,"Pamela Suzette Grier (born May 26, 1949) is an..."
6,Joanna Cassidy,Joanna Cassidy (born Joanna Virginia Caskey; A...
7,Richard Cetrone,
8,Liam Waite,
9,Duane Davis,Duane Davis is an American actor who has been ...


In [73]:
len(actorsB_df)

134078

## Now we have two dataframes : 
- final_filtered_df where we'll keep only education and occupation for the actors (because others have too many missing values, or are not that relevant)
- actorsB_df where we have the biographies of the actors

--> We'll use NLP techniques to categorize these THREE features of the actors, make them simpler and more meaningful

### 1. Education and Occupation

In [112]:
final_filtered_df.head(5)

Unnamed: 0,wikidata_id,actor_name,education,occupation,religious_belief,place_of_birth,languages_spoken
0,Q100001260,Henk Rigters,,"actor, stage actor, director, television actor...",,Den Helder,
1,Q100005,Tadeusz Borowski,University of Warsaw,"writer, poet, journalist, opinion journalist, ...",,Zhytomyr,Polish
2,Q100028,Jörg Hube,,"actor, stage actor, film director, television ...",,Neuruppin,German
3,Q1000433,Bud,,,,,
4,Q1000435,Peter Sweeney,,association football player,,Glasgow,English


In [113]:
actors_educ_occ_df = final_filtered_df.drop(['religious_belief', 'place_of_birth', 'languages_spoken'], axis=1)
actors_educ_occ_df.head(5)

Unnamed: 0,wikidata_id,actor_name,education,occupation
0,Q100001260,Henk Rigters,,"actor, stage actor, director, television actor..."
1,Q100005,Tadeusz Borowski,University of Warsaw,"writer, poet, journalist, opinion journalist, ..."
2,Q100028,Jörg Hube,,"actor, stage actor, film director, television ..."
3,Q1000433,Bud,,
4,Q1000435,Peter Sweeney,,association football player


In [114]:
# Let's focus on education
unique_values = actors_educ_occ_df['education'].unique()
print(len(unique_values))
print(unique_values)

15220
[None 'University of Warsaw'
 'Madách Imre High School, Budapest University of Technology and Economics, University of Theatre and Film Arts'
 ... 'Rostov State University' 'Dartmouth College, Choate Rosemary Hall'
 'Rise Park Junior School']


##### From our previous analyses, in only 56.15% of actors we have, we found their wikidata ID, and among these with a wikidata ID, all those with education set to 'None' did NOT HAVE an eduction, while for the actors with NO wikidata ID have 'None' as value in 'education' wwe just didn't find information about them.

Let's first, among those with a wikidata ID, make those with a school education to '1' and those with no school education to '0'. We create a new column first then drop the old after finishing.

In [115]:
# Focus only on rows where 'wikidata_id' is not 'None'
# Create a new column 'education_binary' based on the conditions
actors_educ_occ_df['academic_background'] = actors_educ_occ_df.apply(
    lambda row: 1 if row['wikidata_id'] != None and row['education'] != None else
                (0 if row['wikidata_id'] != None and row['education'] == None else None),
    axis=1
)

In [116]:
actors_educ_occ_df.sample(n=10)

Unnamed: 0,wikidata_id,actor_name,education,occupation,academic_background
92496,,Juliano Cazarre,,,
116124,,Brandon H. Chapman,,,
54750,Q573332,Anthony Heald,Michigan State University,"stage actor, television actor, film actor",1.0
121946,,Charles J. Corrado Jr.,,,
76013,,Kate Reid,,,
101298,,Alan Bardsley,,,
109918,,Carsten Jörgensen,,,
109679,,Charles Knox Robinson III,,,
32167,Q3103631,Georges Wague,,"actor, teacher, mime artist, theatrical director",0.0
30927,Q2992071,Juozas Budraitis,"Vilnius University, Vilnius University Faculty...","actor, television actor",1.0


Now let's do the same for 'occupation', among those with a 'wikidata_id', those with 'None' or one acting type have no other occupation that their unique acting role (whatever type it is), then those with more than one acting type have either diverse occupations in the acting field or diverse occupation in other domains than acting too. So we'll derive 3 different categories among actors with 'wikidata_id' not 'None' :
- No other acting occupation (when 'None' or only one occupation with 'actor' in it) : 'oneActingOcc'
- Diverse acting occupations (many occupations all with 'actor' in it) : 'diverseActingOcc'
- Other occupations than only acting (at least one occupation without 'actor' in it)  : 'otherOcc'

In [117]:
# Function to categorize occupations
def categorize_occupation(row):
    if row['wikidata_id'] == None:
        return None  # Exclude rows with wikidata_id 'None'
    
    if row['wikidata_id'] != None and row['occupation'] == None:
        return 'oneActingOcc'  # Only one acting role or 'None'
    
    # Split the occupation into a list for analysis
    occupations = row['occupation'].split(", ")
    
    # Check conditions
    if len([occ for occ in occupations if 'actor' in occ]) == 1 and all('actor' in occ for occ in occupations):
        return 'oneActingOcc'  # Only one acting role or 'None'
    elif all('actor' in occ for occ in occupations):
        return 'diverseActingOcc'  # All occupations are acting-related
    elif any('actor' not in occ for occ in occupations):
        return 'otherOcc'  # Contains non-acting occupations
    return None

In [118]:
# Apply the function to create a new column
actors_educ_occ_df['occupation_category'] = actors_educ_occ_df.apply(categorize_occupation, axis=1)

In [119]:
actors_educ_occ_df.sample(n=15)

Unnamed: 0,wikidata_id,actor_name,education,occupation,academic_background,occupation_category
128940,,Catherine Peppers,,,,
119373,,Benjamin Jones,,,,
129020,,Stan McReynolds,,,,
9474,Q126327635,George Sperdakos,,actor,0.0,oneActingOcc
60121,Q652171,Brian Lopes,,"writer, sport cyclist",0.0,otherOcc
9538,Q1265420,Duncan Lamont,,"actor, television actor, film actor",0.0,diverseActingOcc
10754,Q13113810,Manavalan Joseph,,"actor, film actor",0.0,diverseActingOcc
28335,Q275958,Tracy Pollan,"Lee Strasberg Theatre and Film Institute, Syos...","actor, stage actor, television actor, film actor",1.0,diverseActingOcc
45172,Q466580,Débora Falabella,,"actor, television actor, film actor",0.0,diverseActingOcc
23416,Q232340,Edie Sedgwick,The Branson School,"fashion model, model, film actor, actor, socia...",1.0,otherOcc


##### Seems all good, let's drop the old columns

In [120]:
actors_educ_occ_df.drop(['occupation', 'education'], axis= 1, inplace= True)

In [121]:
actors_educ_occ_df.sample(n=10)

Unnamed: 0,wikidata_id,actor_name,academic_background,occupation_category
100979,,Rakesh Bapat,,
52930,Q5534242,Geoff Morrell,0.0,diverseActingOcc
64096,Q723101,Mark Ryan,0.0,otherOcc
128679,,Disi Alba,,
114948,,Clément Dupré,,
8364,Q122889716,Fausto Lombardi,0.0,oneActingOcc
72259,Q944366,Matt Hill,0.0,otherOcc
111606,,Yoshiro Umezawa,,
2361,Q107253789,Jim Bray,0.0,otherOcc
20627,Q2087638,Philippe Laudenbach,1.0,diverseActingOcc


In [122]:
# Calculate the percentage of NaN values in each column
nan_percentages = actors_educ_occ_df.isnull().mean() * 100

# Display the percentages
print(nan_percentages)

wikidata_id            43.852086
actor_name              0.000000
academic_background    43.852086
occupation_category    43.852086
dtype: float64


Great, as we expected, now we can drop the wikidata_id too, and choose for a way of filling the Nan values for each of 'academic_background' and 'occupation_category'

In [123]:
actors_educ_occ_df.drop(['wikidata_id'], axis= 1, inplace= True)
actors_educ_occ_df.sample(n=5)

Unnamed: 0,actor_name,academic_background,occupation_category
17220,Manuel Zarzo,0.0,oneActingOcc
75615,Farha Naaz,,
88031,Eugène Bech,,
100191,Gregory Golubeff,,
51038,Carlo Verdone,1.0,otherOcc


#### Focus on 'academic_background'

In [124]:
# Calculate the count of each unique value, including NaN
academic_distribution = actors_educ_occ_df['academic_background'].value_counts(dropna=False)

# Calculate the percentage of each unique value
academic_distribution_percentage = (academic_distribution / len(actors_educ_occ_df)) * 100

# Combine counts and percentages into a single DataFrame for clarity
distribution_df = pd.DataFrame({
    'Count': academic_distribution,
    'Percentage': academic_distribution_percentage
}).round(2)

print(distribution_df)

                     Count  Percentage
academic_background                   
NaN                  58796       43.85
0.0                  49308       36.78
1.0                  25974       19.37


For now, we thought of **filling Nan's with the Most Frequent Value (0)**, because 0 is already dominating and the actors are so more likely to have not studied than to have, so we'll do that for now and maybe use Machine Learning methods later instead (using other features to predict the missing values of this one).

(We also thought of **Proportional Sampling** but while this reflects the observed distribution, it introduces random noise, which can degrade the predictive power of this specific feature)

In [125]:
actors_educ_occ_df['academic_background'] = actors_educ_occ_df['academic_background'].fillna(0)

In [126]:
# Calculate the count of each unique value, including NaN
academic_distribution = actors_educ_occ_df['academic_background'].value_counts(dropna=False)

# Calculate the percentage of each unique value
academic_distribution_percentage = (academic_distribution / len(actors_educ_occ_df)) * 100

# Combine counts and percentages into a single DataFrame for clarity
distribution_df = pd.DataFrame({
    'Count': academic_distribution,
    'Percentage': academic_distribution_percentage
}).round(2)

print(distribution_df)

                      Count  Percentage
academic_background                    
0.0                  108104       80.63
1.0                   25974       19.37


#### Focus on 'occupation_category'

In [127]:
# Calculate the count of each unique value, including NaN
occupation_category = actors_educ_occ_df['occupation_category'].value_counts(dropna=False)

# Calculate the percentage of each unique value
occupation_category_percentage = (occupation_category / len(actors_educ_occ_df)) * 100

# Combine counts and percentages into a single DataFrame for clarity
distribution_occ_df = pd.DataFrame({
    'Count': occupation_category,
    'Percentage': occupation_category_percentage
}).round(2)

print(distribution_occ_df)

                     Count  Percentage
occupation_category                   
None                 58796       43.85
otherOcc             38644       28.82
diverseActingOcc     18888       14.09
oneActingOcc         17750       13.24


To help us choose how to choose the missing values, we can check for the correlation between 'occupation_category' and 'academic_background', see if we can use 'academic_background' as a predictor for 'occupation_category'

In [141]:
# Encode the categorical variables numerically
from sklearn.preprocessing import LabelEncoder

# Encode the categorical variables numerically using .loc to avoid the warning
filtered_data = actors_educ_occ_df[actors_educ_occ_df['occupation_category'].notna()].copy()

le_occupation = LabelEncoder()
le_academic = LabelEncoder()

filtered_data.loc[:, 'occupation_category_encoded'] = le_occupation.fit_transform(filtered_data['occupation_category'])
filtered_data.loc[:, 'academic_background_encoded'] = le_academic.fit_transform(filtered_data['academic_background'])

# Compute correlation
correlation = filtered_data[['occupation_category_encoded', 'academic_background_encoded']].corr()

# Display the correlation matrix
print(correlation)

                             occupation_category_encoded  \
occupation_category_encoded                     1.000000   
academic_background_encoded                     0.118649   

                             academic_background_encoded  
occupation_category_encoded                     0.118649  
academic_background_encoded                     1.000000  


The results indicate that the correlation between academic_background and occupation_category is approximately 0.1186, which suggests a weak positive relationship and means that 'academic_background' provides limited predictive power for 'occupation_category'.

So we'll use the mode as chosen previously :

In [142]:
actors_educ_occ_df['occupation_category'] = actors_educ_occ_df['occupation_category'].fillna('otherOcc')

In [143]:
# Calculate the count of each unique value, including NaN
occupation_category = actors_educ_occ_df['occupation_category'].value_counts(dropna=False)

# Calculate the percentage of each unique value
occupation_category_percentage = (occupation_category / len(actors_educ_occ_df)) * 100

# Combine counts and percentages into a single DataFrame for clarity
distribution_occ_df = pd.DataFrame({
    'Count': occupation_category,
    'Percentage': occupation_category_percentage
}).round(2)

print(distribution_occ_df)

                     Count  Percentage
occupation_category                   
otherOcc             97440       72.67
diverseActingOcc     18888       14.09
oneActingOcc         17750       13.24


### 2. Biographies

In [145]:
print(actorsB_df.head(5))

           actor_name                                          biography
0      Wanda De Jesus  Wanda De Jesus (born August 26, 1958) is an Am...
1  Natasha Henstridge  Natasha Tonya Henstridge (born August 15, 1974...
2            Ice Cube  O'Shea Jackson Sr. (born June 15, 1969), known...
3       Jason Statham  Jason Statham ( STAY-thəm; born 26 July 1967) ...
4         Clea DuVall  Clea Helen D'Etienne DuVall (born September 25...


#### The idea is to use NLP techniques to retrieve insights on actors' social backgrounds from their biographies

Let's first define the features we want to retrieve from that and the specific corresponding possible categories for each :

- new column **'socioeconomic_background'** with 3 possible categories :
    - **'Underprivileged'** :  Mentions of poverty, financial hardship, growing up in disadvantaged neighborhoods, or limited resources.
    - **'MiddleIncome'** : References to holding regular jobs or coming from a typical middle-class environment without extreme hardship or wealth.
    - **'Privileged'** :  Indications of affluence, attending elite institutions, private tutors, or family wealth and connections.

- new column **'education_level'** with 3 possible categories :
    - **'HighlyEducated'** : Mentions of completing university-level education, prestigious schools, or advanced degrees.
    - **'FormallyEducated'** : Mentions of finishing high school or equivalent, possibly some formal training (e.g., drama school).
    - **'InformallyEducated'** : No mention of education, dropped out early, or self-taught skills.

- new column **'early_life_opportunities'** with 3 possible categories :
    - **'FacedSignificantAdversity'** : References to overcoming severe hardship (illness, discrimination, war-torn areas, loss of parents).
    - **'Ordinary'** : No extreme challenges or privileges noted.
    - **'PrivilegedStart'** : Direct access to the industry through family connections, early mentorship from established figures, or other unique advantages.

##### Explanation of our method

1. **Representative Phrases**: For each category, we define several phrases that strongly characterize that category.

2. **Embedding Model**: We use the "all-MiniLM-L6-v2" model from sentence-transformers to convert both the biography text and each category's representative phrases into embeddings (these embeddings capture semantic meaning).

3. **Scoring Method**: We compute the cosine similarity between the biography embedding and each category's representative phrase embeddings. We then average the scores and the category whose representative phrases have the highest average similarity with the biography is selected.

4. **Assigning Categories**: We append our categories (**socioeconomic_background**, **education_level** and **early_life_opportunities**) as new columns in the DataFrame.

In [151]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm  # For progress bar

# use pip install sentence-transformers

# 1. 

# Socioeconomic Background
socio_categories = {
    'Underprivileged': [
        "grew up in poverty",
        "raised in a poor neighborhood",
        "struggled financially during childhood",
        "came from a very poor family",
        "lived with limited resources"
    ],
    'MiddleIncome': [
        "came from a typical middle-class family",
        "grew up in a regular working-class home",
        "had a normal upbringing with no major financial struggles"
    ],
    'Privileged': [
        "born into a wealthy family",
        "affluent upbringing",
        "attended elite schools",
        "had private tutors",
        "grew up with considerable financial support"
    ]
}

# Education Level
edu_categories = {
    'HighlyEducated': [
        "completed university-level education",
        "attended a prestigious university",
        "earned an advanced degree",
        "studied at a top college"
    ],
    'FormallyEducated': [
        "finished high school",
        "attended a drama school",
        "received standard formal education",
        "went to a normal high school"
    ],
    'InformallyEducated': [
        "no mention of formal education",
        "self-taught",
        "no degree",
        "did not complete formal schooling"
    ]
}

# Early Life Opportunities
opps_categories = {
    'FacedSignificantAdversity': [
        "overcame severe hardship",
        "faced major adversity",
        "childhood marked by significant challenges",
        "experienced discrimination or a war-torn environment",
        "had to overcome illness or loss of parents early on"
    ],
    'Ordinary': [
        "ordinary childhood with no extreme challenges",
        "no notable adversity or privilege",
        "upbringing was stable and fairly normal"
    ],
    'PrivilegedStart': [
        "had industry connections from the start",
        "family connections to the entertainment industry",
        "mentored by established figures early in career",
        "had direct access to acting opportunities"
    ]
}

# ------------------------------------------------------------------------------

# Function to create embeddings and classify based on similarity


def classify_with_embeddings(text, category_dict, model):
    """
    Classify the input text into one category based on semantic similarity.
    We take the average similarity of the text with each category's representative phrases.
    The category with the highest average similarity is chosen.
    """
    if text is None:
        return None
    
    # Embed the biography
    text_embedding = model.encode(text, convert_to_tensor=True)

    best_category = None
    best_score = -1.0

    # For each category, compute the average similarity with reference phrases
    for category, phrases in category_dict.items():
        phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
        # Compute cosine similarities
        cos_sim = util.pytorch_cos_sim(text_embedding, phrase_embeddings)
        # Average similarity score
        avg_score = float(cos_sim.mean().item())
        if avg_score > best_score:
            best_score = avg_score
            best_category = category

    return best_category

In [152]:
# We load pre-trained sentence embedding model

model = SentenceTransformer('all-MiniLM-L6-v2')

tqdm.pandas(desc="Processing biographies")  # Initialize tqdm progress bar

# We apply the classification functions to each biography

actorsB_df['socioeconomic_background'] = actorsB_df['biography'].progress_apply(
    lambda x: classify_with_embeddings(x, socio_categories, model))

actorsB_df['education_level'] = actorsB_df['biography'].progress_apply(
    lambda x: classify_with_embeddings(x, edu_categories, model))

actorsB_df['early_life_opportunities'] = actorsB_df['biography'].progress_apply(
    lambda x: classify_with_embeddings(x, opps_categories, model))

Processing biographies: 100%|██████████| 134078/134078 [38:39<00:00, 57.81it/s] 
Processing biographies: 100%|██████████| 134078/134078 [38:16<00:00, 58.38it/s] 
Processing biographies: 100%|██████████| 134078/134078 [40:06<00:00, 55.72it/s] 


In [153]:
actorsB_df.sample(n=10)

Unnamed: 0,actor_name,biography,socioeconomic_background,education_level,early_life_opportunities
29686,Wataru Takagi,"Wataru Takagi (高木 渉, Takagi Wataru, born July ...",MiddleIncome,FormallyEducated,Ordinary
77999,A. Soboleva,,,,
108292,Stone Gossard,"Stone Carpenter Gossard (born July 20, 1966) i...",Privileged,HighlyEducated,PrivilegedStart
84069,Nuccio Siano,,,,
29187,Karan Patel,Karan Patel (born 23 November 1983) is an Indi...,Privileged,HighlyEducated,PrivilegedStart
45712,Michael Starke,Michael Starke (born 13 November 1957) is a Br...,Underprivileged,InformallyEducated,FacedSignificantAdversity
32808,Sweta Dutta,,,,
78516,Rinus de Wilde,,,,
52805,Gregory Sporleder,"Gregory Sporleder (born April 14, 1964) is an ...",Privileged,HighlyEducated,PrivilegedStart
863,Daniel Seltzer,Daniel Seltzer (13 February 1933 — 2 March 198...,Privileged,FormallyEducated,PrivilegedStart


In [154]:
# Calculate the percentage of NaN values in each column
nan_percentage = actorsB_df.isna().mean() * 100

# Display the percentages
print(nan_percentage)

actor_name                   0.000000
biography                   50.824893
socioeconomic_background    50.824893
education_level             50.824893
early_life_opportunities    50.824893
dtype: float64


In [156]:
# Define the columns to analyze
columns_to_analyze = ['socioeconomic_background', 'education_level', 'early_life_opportunities']

# Calculate the percentage of each value for each column
for column in columns_to_analyze:
    print(f"\nPercentage distribution for '{column}':")
    value_counts = actorsB_df[column].value_counts(dropna=False, normalize=True) * 100
    print(value_counts)


Percentage distribution for 'socioeconomic_background':
socioeconomic_background
None               50.824893
Privileged         31.511508
Underprivileged    11.168126
MiddleIncome        6.495473
Name: proportion, dtype: float64

Percentage distribution for 'education_level':
education_level
None                  50.824893
FormallyEducated      21.451692
HighlyEducated        19.334268
InformallyEducated     8.389147
Name: proportion, dtype: float64

Percentage distribution for 'early_life_opportunities':
early_life_opportunities
None                         50.824893
PrivilegedStart              43.553006
Ordinary                      2.970659
FacedSignificantAdversity     2.651442
Name: proportion, dtype: float64


#### For now, we fill Nans with **Proportional Sampling**, which ensures that the filled values align with the observed distributions in the non-missing data, maintaining the overall balance of categories, **BUT** introduces **random noise**. In our taste, following the specific features we're studying, it is better slightly suited than filling with the most common value.

In [158]:
# We define the observed distributions for each column
socio_distribution = {
    'Privileged': 31.511508,
    'Underprivileged': 11.168126,
    'MiddleIncome': 6.495473
}
edu_distribution = {
    'FormallyEducated': 21.451692,
    'HighlyEducated': 19.334268,
    'InformallyEducated': 8.389147
}
opps_distribution = {
    'PrivilegedStart': 43.553006,
    'Ordinary': 2.970659,
    'FacedSignificantAdversity': 2.651442
}

In [159]:
# Function to fill NaN values proportionally
def fill_na_proportionally(column, distribution):
    """
    Fill NaN values in a column using proportional sampling based on the given distribution.
    """
    # Create a list of categories and their corresponding probabilities
    categories = list(distribution.keys())
    probabilities = np.array(list(distribution.values())) / sum(distribution.values())

    # Fill NaN values using proportional sampling
    return column.apply(lambda x: np.random.choice(categories, p=probabilities) if pd.isna(x) else x)

In [160]:
# We pply proportional sampling to fill NaN values in all three columns
actorsB_df['socioeconomic_background'] = fill_na_proportionally(
    actorsB_df['socioeconomic_background'], socio_distribution
)
actorsB_df['education_level'] = fill_na_proportionally(
    actorsB_df['education_level'], edu_distribution
)
actorsB_df['early_life_opportunities'] = fill_na_proportionally(
    actorsB_df['early_life_opportunities'], opps_distribution
)

In [161]:
# Define the columns to analyze
columns_to_analyze = ['socioeconomic_background', 'education_level', 'early_life_opportunities']

# Calculate the percentage of each value for each column
for column in columns_to_analyze:
    print(f"\nPercentage distribution for '{column}':")
    value_counts = actorsB_df[column].value_counts(dropna=False, normalize=True) * 100
    print(value_counts)


Percentage distribution for 'socioeconomic_background':
socioeconomic_background
Privileged         64.185027
Underprivileged    22.563732
MiddleIncome       13.251242
Name: proportion, dtype: float64

Percentage distribution for 'education_level':
education_level
FormallyEducated      43.649965
HighlyEducated        39.409896
InformallyEducated    16.940139
Name: proportion, dtype: float64

Percentage distribution for 'early_life_opportunities':
early_life_opportunities
PrivilegedStart              88.567103
Ordinary                      6.063635
FacedSignificantAdversity     5.369263
Name: proportion, dtype: float64


In [162]:
actorsB_df.sample(n=10)

Unnamed: 0,actor_name,biography,socioeconomic_background,education_level,early_life_opportunities
41861,Véronique Joly,,Privileged,HighlyEducated,PrivilegedStart
129559,Seiichi Morita,,Privileged,FormallyEducated,PrivilegedStart
85119,Lew Gallo,"Lewis D. Gallo (June 12, 1928 – June 11, 2000)...",Privileged,FormallyEducated,PrivilegedStart
117934,Hilary Crane,Hilary Crane (2 February 1933 – 4 June 2009) w...,Privileged,HighlyEducated,PrivilegedStart
114911,Michel Batilliet,,MiddleIncome,InformallyEducated,PrivilegedStart
122381,Charles Duke,"Charles Moss Duke Jr. (born October 3, 1935) i...",Privileged,HighlyEducated,PrivilegedStart
77061,Jimmy Santos,Jimmy Santos may refer to:\n\nJimmy Santos (si...,Privileged,InformallyEducated,PrivilegedStart
24507,Morio Kazama,"Morio Kazama (風間杜夫, Kazama Morio) is a Japanes...",Privileged,HighlyEducated,PrivilegedStart
34720,Barbie Wilde,Hellbound: Hellraiser II is a 1988 supernatura...,Underprivileged,FormallyEducated,PrivilegedStart
130178,Milo Brown,,Underprivileged,FormallyEducated,PrivilegedStart


# Merge and drop non useful columns

In [171]:
merged_df = pd.merge(actors_educ_occ_df, actorsB_df, on='actor_name')
merged_df = merged_df.drop(columns=['occupation_category_imputed', 'biography', 'academic_background'])
merged_df

Unnamed: 0,actor_name,occupation_category,socioeconomic_background,education_level,early_life_opportunities
0,Henk Rigters,otherOcc,Privileged,HighlyEducated,FacedSignificantAdversity
1,Tadeusz Borowski,otherOcc,Underprivileged,InformallyEducated,FacedSignificantAdversity
2,Jörg Hube,otherOcc,Privileged,HighlyEducated,PrivilegedStart
3,Bud,oneActingOcc,Underprivileged,InformallyEducated,Ordinary
4,Peter Sweeney,otherOcc,Underprivileged,FormallyEducated,PrivilegedStart
...,...,...,...,...,...
134073,Violet Bronte,otherOcc,Privileged,FormallyEducated,PrivilegedStart
134074,Sonny Byrkett,otherOcc,MiddleIncome,FormallyEducated,PrivilegedStart
134075,Susan Byrkett,otherOcc,Privileged,HighlyEducated,PrivilegedStart
134076,Hal Cleaveland,otherOcc,Privileged,HighlyEducated,FacedSignificantAdversity


Finally we drop occupation_category as we already have it (more full and more accurate) with ImdB dataset as primaryProfession

In [172]:
merged_df = merged_df.drop(columns=['occupation_category'])
merged_df

Unnamed: 0,actor_name,socioeconomic_background,education_level,early_life_opportunities
0,Henk Rigters,Privileged,HighlyEducated,FacedSignificantAdversity
1,Tadeusz Borowski,Underprivileged,InformallyEducated,FacedSignificantAdversity
2,Jörg Hube,Privileged,HighlyEducated,PrivilegedStart
3,Bud,Underprivileged,InformallyEducated,Ordinary
4,Peter Sweeney,Underprivileged,FormallyEducated,PrivilegedStart
...,...,...,...,...
134073,Violet Bronte,Privileged,FormallyEducated,PrivilegedStart
134074,Sonny Byrkett,MiddleIncome,FormallyEducated,PrivilegedStart
134075,Susan Byrkett,Privileged,HighlyEducated,PrivilegedStart
134076,Hal Cleaveland,Privileged,HighlyEducated,FacedSignificantAdversity


#### Verifying consistency

In [173]:
# Calculate the percentage of NaN values for each column
nan_percentages = merged_df.isna().mean() * 100

# Print the percentages
print(nan_percentages)

actor_name                  0.0
socioeconomic_background    0.0
education_level             0.0
early_life_opportunities    0.0
dtype: float64


In [174]:
# Save the merged DataFrame to a CSV file
merged_df.to_csv('socialBackground_data.csv', index=False)

In [175]:
# Check if actor names are unique
are_names_unique = merged_df['actor_name'].is_unique

# Output result
if are_names_unique:
    print("All actor names are unique.")
else:
    print("There are duplicate actor names.")

All actor names are unique.
