In [1]:
# Import the required libraries. You will most likely need to install: jupyter, pandas, numpy, fuzzywuzzy
import pandas as pd
import numpy as np
import unicodedata
import re
from dictionaries.ethnicity_dictionary import ethnicity_dictionary
from dictionaries.ethnicity_granularity_2 import granularity2_dictionary
from dictionaries.ethnicity_granularity_3 import granularity3_dictionary
from adjust_ethnicites import adjust_ethnicites
from fuzzywuzzy import fuzz
from collections import defaultdict
# Show up to 500 rows when a DataFrame is displayed.
pd.options.display.max_rows = 500

In [2]:
# Load the survey responses into a DataFrame (a tabular data structure).
# If this raises an error, check that the file really is a CSV and that "sep=" matches its
# delimiter, e.g. try a comma "," instead of the semicolon.
df_ethn = pd.read_csv(filepath_or_buffer='ethnicity_data.csv', sep=';')

In [3]:
# Normalize a raw response: upper-case it, replace punctuation with spaces and strip accents.
# Blindly dropping every non-ASCII character would also wipe out non-Latin scripts (Chinese,
# Arabic, ...). Accented Latin letters have code points below 900, so accents are only stripped
# when every character is below that threshold.
def normalize_data(country):
    """Return the upper-cased, punctuation-free form of a response.

    Args:
        country: The raw response string (the parameter name is historical).

    Returns:
        The normalized string with surrounding whitespace removed. When every
        character has a code point below 900 the text is NFD-decomposed and
        reduced to ASCII, which removes accents and silently drops characters
        that have no ASCII decomposition (for example 'Ø'). Otherwise the
        non-ASCII characters are kept as they are.
    """
    upper_cased = country.upper().strip()
    # Raw string: same set of characters as before ('|' included), without invalid escapes.
    without_punctuation = re.sub(r"""[-|_:·.,"“”/+()?'’&!%*]""", " ", upper_cased)
    if any(ord(char) >= 900 for char in without_punctuation):
        # Punctuation at either end has just become a space; strip it here as well so this
        # branch returns text as clean as the accent-stripping branch below.
        return without_punctuation.strip()
    decomposed = unicodedata.normalize('NFD', without_punctuation)
    return decomposed.encode('ascii', 'ignore').decode("utf-8").strip()

In [4]:
# Build a reversed copy of a dictionary at run time to speed up lookups. The reversed table only
# lives in memory while the program runs. Looking up a key in a dict is cheaper than looping
# through every list of values.
def reverse_dict(dict_to_reverse):
    """Map every value, and every key itself, back to its key.

    Args:
        dict_to_reverse: Mapping from a key to an iterable of values.

    Returns:
        A defaultdict in which each value found under a key, and the key
        itself, points to that key. A name that occurs more than once keeps
        the key that was assigned last.
    """
    lookup = defaultdict(list)
    for canonical, aliases in dict_to_reverse.items():
        # Values first, then the key, so the key's own entry is written last for this item.
        for name in (*aliases, canonical):
            lookup[name] = canonical
    return lookup
# The three dictionaries we need, all reversed
ethn_rev, ethn2_rev, ethn3_rev = map(
    reverse_dict,
    (ethnicity_dictionary, granularity2_dictionary, granularity3_dictionary),
)

In [5]:
# Split the response to look up each word individually (to allow for multiple ethnicities)
def find_ethnicity(ethnicity, lookup=None):
    """Return the labels recognised in a normalized response.

    Every whitespace-separated word of the response, followed by the whole
    response, is looked up in `lookup`.

    Args:
        ethnicity: The normalized response string.
        lookup: Mapping from a term to its label. Defaults to the module-level
            `ethn_rev` table.

    Returns:
        An object ndarray of the distinct labels in first-seen order, or
        np.nan when nothing matched or when one of the matches is 'MISSING'.
    """
    if lookup is None:
        lookup = ethn_rev
    candidates = ethnicity.split()
    candidates.append(ethnicity)
    # dict.fromkeys removes duplicates but keeps first-seen order, so the result no longer
    # depends on the per-process string hash seed the way list(set(...)) does.
    matches = list(dict.fromkeys(lookup[term] for term in candidates if term in lookup))
    # `not matches` replaces `len(...) is 0`, an identity test that only worked by accident.
    if not matches or 'MISSING' in matches:
        return np.nan
    return np.array(matches, dtype=object)

In [6]:
def fuzzy_match(ethn_matched, ethn_normalized, threshold=92):
    """Fall back to fuzzy matching for responses that had no exact match.

    Args:
        ethn_matched: Result of the exact lookup. Any float (in practice NaN)
            means "no match"; every other value is returned unchanged.
        ethn_normalized: The normalized response string.
        threshold: Minimum `fuzz.ratio` score for a dictionary key to count
            as a match (default 92, the value previously hard-coded).

    Returns:
        `ethn_matched` itself when it is not a float. Otherwise an object
        ndarray of the distinct labels in first-seen order, or np.nan when no
        key of `ethnicity_dictionary` reaches the threshold.
    """
    # isinstance also recognises numpy float scalars (np.float64 subclasses float), which the
    # exact `type(...) is float` test mistook for an existing match.
    if not isinstance(ethn_matched, float):
        return ethn_matched
    tokens = ethn_normalized.split()
    # For multi-word responses also try the response as a whole.
    if len(tokens) > 1:
        tokens.append(ethn_normalized)
    matches = [
        ethn_rev.get(key)
        for token in tokens
        for key in ethnicity_dictionary
        if fuzz.ratio(token, key) >= threshold
    ]
    if not matches:
        return np.nan
    # De-duplicate in first-seen order so the output is identical from run to run.
    return np.array(list(dict.fromkeys(matches)), dtype=object)

In [7]:
def find_ethnicity_granularity_2(ethnicities_found, lookup=None):
    """Map the labels found for a response to their granularity-2 groups.

    Args:
        ethnicities_found: ndarray of labels, or a float (NaN) when nothing
            was found for the response.
        lookup: Mapping from a label to its group. Defaults to the
            module-level `ethn2_rev` table.

    Returns:
        The input unchanged when it is a float. Otherwise an object ndarray of
        the distinct groups in first-seen order; labels without an entry in
        `lookup` are skipped, so the array can be empty.
    """
    # isinstance also covers numpy float scalars such as np.float64 NaN.
    if isinstance(ethnicities_found, float):
        return ethnicities_found
    if lookup is None:
        lookup = ethn2_rev
    groups = (lookup.get(label) for label in ethnicities_found.tolist())
    # dict.fromkeys de-duplicates in first-seen order; set() order changes between runs.
    distinct_groups = dict.fromkeys(group for group in groups if group is not None)
    return np.array(list(distinct_groups), dtype=object)

In [8]:
def find_ethnicity_granularity_3(ethnicities_found, lookup=None):
    """Map the groups found for a response to their granularity-3 groups.

    Args:
        ethnicities_found: ndarray of labels, or a float (NaN) when nothing
            was found for the response.
        lookup: Mapping from a label to its group. Defaults to the
            module-level `ethn3_rev` table.

    Returns:
        The input unchanged when it is a float. Otherwise an object ndarray of
        the distinct groups in first-seen order; labels without an entry in
        `lookup` are skipped, so the array can be empty.
    """
    # isinstance also covers numpy float scalars such as np.float64 NaN.
    if isinstance(ethnicities_found, float):
        return ethnicities_found
    if lookup is None:
        lookup = ethn3_rev
    groups = (lookup.get(label) for label in ethnicities_found.tolist())
    # dict.fromkeys de-duplicates in first-seen order; set() order changes between runs.
    distinct_groups = dict.fromkeys(group for group in groups if group is not None)
    return np.array(list(distinct_groups), dtype=object)

In [9]:
# Keep only the rows that have a response. `.copy()` makes the result an independent frame, so
# the column assignments in the following cells do not raise SettingWithCopyWarning.
df_ethn = df_ethn[df_ethn['ethn'].notna()].copy()

In [10]:
# Normalize every response. An explicit `otypes` stops np.vectorize from calling the function an
# extra time on the first row to guess the output type, and lets it run on an empty frame.
df_ethn['ethn_normalized'] = np.vectorize(normalize_data, otypes=[object])(df_ethn['ethn'])

In [11]:
# Exact dictionary lookup. `otypes=[object]` fixes the output dtype up front: without it
# np.vectorize infers the dtype from the first row, so a first row that returns NaN (float) makes
# every later array result fail to convert.
df_ethn['ethn_matched'] = np.vectorize(find_ethnicity, otypes=[object])(df_ethn['ethn_normalized'])

In [12]:
# Fuzzy-match the responses that had no exact match. `otypes=[object]` keeps the output dtype
# independent of whether the first row happens to return NaN or an array.
df_ethn['ethn_fuzzy_matched'] = np.vectorize(fuzzy_match, otypes=[object])(
    df_ethn['ethn_matched'], df_ethn['ethn_normalized']
)

In [13]:
# Post-process the matches with the imported adjust_ethnicites helper. `otypes=[object]` keeps
# the output dtype independent of whether the first row happens to return NaN or an array.
df_ethn['ethn_adjusted'] = np.vectorize(adjust_ethnicites, otypes=[object])(
    df_ethn['ethn_fuzzy_matched'], df_ethn['ethn_normalized']
)

In [14]:
# Map the adjusted labels to their granularity-2 groups. `otypes=[object]` keeps the output
# dtype independent of whether the first row happens to be NaN or an array.
df_ethn['gran_2'] = np.vectorize(find_ethnicity_granularity_2, otypes=[object])(df_ethn['ethn_adjusted'])

In [15]:
# Map the granularity-2 groups to their granularity-3 groups. `otypes=[object]` keeps the output
# dtype independent of whether the first row happens to be NaN or an array.
df_ethn['gran_3'] = np.vectorize(find_ethnicity_granularity_3, otypes=[object])(df_ethn['gran_2'])

In [16]:
# Drop the intermediate exact-match column, which is not needed in the final dataset.
df_ethn = df_ethn.drop(columns=['ethn_matched'])

In [17]:
# Rename the adjusted-ethnicity column to 'gran_1', alongside 'gran_2' and 'gran_3'.
df_ethn = df_ethn.rename({'ethn_adjusted': 'gran_1'}, axis='columns')

In [None]:
# Save the dataframe to a csv file (another format such as SAS or Excel would work as well).
# The path is relative, so the file lands in the working directory, next to the code and the
# initial dataset.
# Fixed: this cell referenced `df_country`, which is never defined in this notebook.
df_ethn.to_csv('ethnicity_sorted.csv')

In [27]:
# Preview the first rows (at most 500) of the classified responses.
df_ethn.head(n=500)

Unnamed: 0,startlanguage,wave,id,ethn,country,ethn_normalized,ethn_fuzzy_matched,gran_1,gran_2,gran_3
1,fr,7,1000157642w7S34,Occidental blanc,Canada,OCCIDENTAL BLANC,"[WESTERN_OCCIDENTALE, CAUCASIAN]","[WESTERN_OCCIDENTALE, CAUCASIAN]","[WESTERN_OCCIDENTALE, CAUCASIAN]","[WESTERN_OCCIDENTALE, CAUCASIAN]"
2,en,1,1000177645S1,Canadian,Canada,CANADIAN,[CANADIAN],[CANADIAN],[NORTHERN_AMERICAN],[AMERICAS]
3,es,1,1000216534S3,Ninguno,Colombia,NINGUNO,,,,
4,fr,1,1000237275S5,Québécoise,Canada,QUEBECOISE,[CANADIAN],[CANADIAN],[NORTHERN_AMERICAN],[AMERICAS]
6,en,1,1000312213S26,Caucasian,Canada,CAUCASIAN,[CAUCASIAN],[CAUCASIAN],[CAUCASIAN],[CAUCASIAN]
7,pt-BR,3,1000332646w2S10,amazonense,Brasil,AMAZONENSE,[SOUTH_AMERICA],[SOUTH_AMERICA],[SOUTH_AMERICAN],[AMERICAS]
8,zh-Hant-TW,2,1000453187w2S12,Taiwanese,Taiwan,TAIWANESE,[TAIWANESE],[TAIWANESE],[EASTERN_ASIAN],[ASIAN]
12,fr,3,1000571648w2S16,Métisse Européen Asiatique,France,METISSE EUROPEEN ASIATIQUE,"[ASIAN, MIXED, EUROPEAN]","[ASIAN, MIXED, EUROPEAN]","[ASIAN, MIXED, EUROPEAN]","[ASIAN, MIXED, EUROPEAN]"
14,en,4,10006361583w4S19,Chinese,Malaysia,CHINESE,[CHINESE],[CHINESE],[EASTERN_ASIAN],[ASIAN]
15,es,2,1000636421w2S21,Mestizo,Colombia,MESTIZO,[MESTIZO],[MESTIZO],[MESTIZO],[MESTIZO]


In [26]:
# Report the responses that could not be classified. These are usually gibberish.
# If a response should have been detected, add it to the dictionary under the right ethnicity.
unclassified_ethnicities = df_ethn[df_ethn['gran_1'].isna()]
unclassified_ethnicities_list = unclassified_ethnicities['ethn_normalized'].tolist()
print(
    "There are a total of",
    len(set(unclassified_ethnicities_list)),
    " different unclassified Responses and ",
    len(unclassified_ethnicities_list),
    " unclassified total responses...",
)
print(list(set(unclassified_ethnicities_list)))

There are a total of 596  different unclassified Responses and  1874  unclassified total responses...
['', 'GATAU', 'DUNYA VATANDASI', 'COVEK', 'GRAANIN', 'O  RH', 'مش فاهم', 'ZIADNA', 'CARTESIEN', 'ANCRE DEPUIS 1640', 'NORMAL FOR AN AREA WHERE I LIVE', 'RIEN A SIGNALER', 'BUUU', 'HMONG', 'MAJORITNA', 'DON T UNDERSTAND THE QUESTION', 'MAGHIAR', 'هم ناس بسيطون جيدون متكافلون يعملون لمصلحة المجتمع والبشرية', 'PREFERISCO NON RISPONDERE', 'NORTH', 'FINE', 'لااعلم', 'N', 'NE SAIS PAS', 'NI', 'IMMIGRANT', 'אחר', 'РУСКАЯ', 'NO DIFFERENCE', 'HUIHJKJJK', 'OHNE BEKENNTNIS', 'NINGUNA', 'BRANCQ', 'NIET  IK DOE NIET AAN IDENTITEITSPOLITIEK', 'ΟΙ ΠΡΟΛΈΤΑΡΙΟΙ ΔΕΝ ΈΧΟΥΝΕ ΠΑΤΡΊΔΑ', 'CULTURALE', 'YES', 'GOCMENIM BEN', 'JE NE CONNAIS PAS LES CATEGORIES  JE NE SAIS PAS', 'MIRZA', 'KIRSAL', '不回答', 'NICE', 'GRAGANIN', 'BKKOJJL', 'ISSUE DE LA DIVERSITE', 'AUCUNE IDEE', 'ESPECIE HUMANA', 'PAS DE REPOSNE', 'VANILLA', 'AUXINE', 'HOMO SAPIENS', 'KATH', 'LOCAL', 'COVEK TJ ZENA  GRADJANKA SVETA', 'NA', 'مدني', 'نن