In [5]:
import pandas as pd
import numpy as np
from SPARQLWrapper import SPARQLWrapper, JSON
# Load the character table that the cells below derive the ID-to-label mapping from
characters = pd.read_csv(filepath_or_buffer='../Clean data/character_classification.csv')

In [6]:
def get_ethnicity(freebase_id):
    """Return the English Wikidata label of the item with the given Freebase ID.

    The Wikidata SPARQL endpoint is asked for the item whose Freebase-ID
    property (``wdt:P646``) equals ``freebase_id``; the label of the first
    binding is returned.

    Parameters
    ----------
    freebase_id : str, None or NaN
        Freebase identifier such as ``"/m/0x67"``. Non-string values are
        converted with ``str()``.

    Returns
    -------
    str or float
        * the label of the first matching item;
        * ``"Unknown"`` when the ID is missing/blank or no item matches;
        * ``np.nan`` when executing the query raises, so that failed
          look-ups stay distinguishable from genuine misses.
    """
    # A missing ID can never match anything: answer directly instead of
    # sending the literal 'nan' / 'None' to the endpoint.
    if freebase_id is None or (isinstance(freebase_id, float) and np.isnan(freebase_id)):
        return "Unknown"

    id_text = str(freebase_id)
    if not id_text.strip():
        return "Unknown"

    # The ID is embedded in a single-quoted SPARQL literal; escape backslashes
    # and quotes so an unusual value cannot break or alter the query.
    id_literal = id_text.replace("\\", "\\\\").replace("'", "\\'")

    url = "https://query.wikidata.org/sparql"
    query = ("""
    SELECT ?item ?itemLabel WHERE {
      ?item wdt:P646 '""" + id_literal + """'.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """
    )

    # Initialize SPARQLWrapper with the query and endpoint
    sparql = SPARQLWrapper(url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        # Execute the SPARQL query and retrieve results
        results = sparql.query().convert()
        bindings = results["results"]["bindings"]
        if bindings:
            # Only the first match is used
            return bindings[0]["itemLabel"]["value"]
        # No item carries this Freebase ID
        return "Unknown"
    except Exception:
        # Best effort: any failure while querying or parsing the response
        # yields NaN rather than aborting the caller's whole run.
        return np.nan


In [8]:
# Build the lookup table: one row per distinct 'Actor ethnicity' value (first
# occurrence kept, original row index preserved), with the label returned by
# 'get_ethnicity' stored in a new 'Ethnicity' column.
map_ethnicity = (
    characters[['Actor ethnicity']]
    .drop_duplicates(subset='Actor ethnicity', keep='first')
    .assign(Ethnicity=lambda unique_ids: unique_ids['Actor ethnicity'].apply(get_ethnicity))
)

map_ethnicity


Unnamed: 0,Actor ethnicity,Ethnicity
0,,Unknown
1,/m/09vc4s,English Americans
2,/m/0x67,African Americans
3,/m/02w7gg,English people
4,/m/025rpb0,Hispanic and Latino Americans
...,...,...
31486,/m/0br_8h,Galicians
32736,/m/046j25,Lumbee
32975,/m/026cybk,Serbian Canadians
33341,/m/013z8m,Manchu


In [9]:
# Write the 'map_ethnicity' table to disk as CSV, leaving out the row index
map_ethnicity.to_csv(path_or_buf="../Clean data/map_ethnicity.csv", index=False)