<font size="3">Now I train the previous model on the Facebook data

In [1]:
from glob import glob
import io
import re
import zipfile
import rarfile
import os 
from string import punctuation, ascii_letters

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_confusion_matrix, cohen_kappa_score, balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from joblib import dump, load


#dictionnary mapping non ascii to ascii characters
caract_mapping = pd.read_html('https://docs.oracle.com/cd/E29584_01/webhelp/mdex_basicDev/src/rbdv_chars_mapping.html')
carac_dict = pd.DataFrame(np.concatenate([caract_mapping[i].iloc[:,[1,2]].values for i in (0,1)])).set_index(0).to_dict('dict')[1]

def name_cleaner(name):
    clean = ""
    for letter in name:
        if (letter.lower() in ascii_letters) or (letter.lower() in """ ''`'-  """):
            clean += letter
        elif letter in punctuation:
            return 'error'
        elif letter in carac_dict.keys():
            clean += carac_dict[letter]
        else:
            return 'error'
            
    return clean.upper()


def islatin(string):
    try:
        a = re.search("[A-Za-z]+",string.replace(" ","")).group(0)
        if len(a) == len(string.replace(" ","")):
            isok= 1
        else:
            isok= 2
    except:
        isok = 0
    return isok

def print_metrics(y_test,y_pred):
    bas = balanced_accuracy_score(y_test,y_pred)
    cks = cohen_kappa_score(y_test,y_pred)
    print("Balanced accuracy score : ", round(bas,3))
    print("Cohen Kappa score : ", round(cks, 3))


    report = classification_report(y_test, y_pred, output_dict=True)
    df = pd.DataFrame(report).transpose()
    results = df.sort_values("f1-score",ascending=False)
    return results


# Building the dataset

In [420]:
files = glob("../data/surnames/fb_surnames/*.csv")
files.sort()

df_list = []

for file in files:
    
    country = file.replace("../data/surnames/fb_surnames/","").replace("_surnames.csv","")
    
    
    df = pd.read_csv(file,skiprows=1,header=None)
    df.columns = ["name","count"]
    latin = df["name"].apply(islatin) #removing names from another alphabet
    df = df[latin != 0]
    
    df["name"] = df["name"].apply(name_cleaner).str.upper()
    df = df[df["name"] != "ERROR"]
    df = df.groupby("name",as_index=False)["count"].sum()
    if len(df) < 5000:
        df = df[df["count"] >= 2]
    else:
        df = df[df["count"] >= 7] 
    len_df = len(df)
    
    print(country,":",len_df,end= ", ")

    
    df["country"] = country
    df_list.append(df)
    
df = pd.concat(df_list)

df = df[~df["name"].str.contains(".|,|?",regex=False)]

df = df[df["name"].str.len() > 1] #removing one characters names

df = df.groupby(["country","name"],as_index=False)["count"].sum().dropna()

df.to_csv("../data/surnames/fb_surnames.csv.zip",compression="zip",index=False)
print("")
print("Over. Length of file : ", len(df))


Afghanistan : 4475, Albania : 9064, Algeria : 100610, Angola : 7069, Argentina : 28039, Austria : 27468, Azerbaijan : 1387, Bahrain : 16995, Bangladesh : 21074, Belgium : 61747, Bolivia : 32655, Botswana : 6439, Brazil : 55988, Brunei : 4108, Bulgaria : 4226, Burkina Faso : 587, Burundi : 321, Cambodia : 241, Cameroon : 30700, Canada : 52648, Chile : 80884, China : 3085, Costa Rica : 19957, Croatia : 14477, Cyprus : 1821, Czechia : 29543, Denmark : 10572, Djibouti : 194, Ecuador : 3751, Egypt : 92407, El Salvador : 410, Estonia : 1714, Ethiopia : 210, Fiji : 535, Finland : 20554, France : 280780, Georgia : 2363, Germany : 102151, Ghana : 11894, Greece : 11055, Guatemala : 10504, Haiti : 271, Honduras : 836, Hong Kong : 11358, Hungary : 5828, Iceland : 597, India : 45988, Indonesia : 2588, Iran : 49031, Ireland : 13172, Israel : 38743, Italy : 237300, Jamaica : 3725, Japan : 6473, Jordan : 24301, Kazakhstan : 19526, Lebanon : 17828, Libya : 23357, Lithuania : 5496, Luxembourg : 3548, Ma

# Improving the dataset

The dataset is particularly dirty : Not only do people put fake names and fake locations but also surnames of people from a country don't always originate from that country because of colonialism, migrations or the country being a composite of several ethnicities in the first place.

For example, a lot of names from Haiti are french, from Angola are portuguese and from Philippines are spanish. The US dataset contains a lot of spanish names and the french dataset a lot of african and arabic names. Countries from the arabic peninsula have extremely high rates of immigration from the Indian subcontinent so much that a majority of surnames from their dataset isn't arabic. Belgium surnames are either french or dutch and swiss surnames either french, germanic or italian.

I follow the approach of Mazière and Roth and keep only the names that are highly concentrated in one country. I then add the names from their dataset for countries missing in the facebook data.

## Data cleaning

In [2]:
df = pd.read_csv("../data/surnames/fb_surnames.csv.zip",dtype={"country" : "category", "name" : 'string', "count" : 'int32'})
print("Length of dataframe : ", len(df))

#lots of italian names in the french data that can be easily removed.
italian_end = "INI ANI ONI TTI OLI LLI GHI INO ERA ERO"
df.loc[(df["country"] == "France") & (df["name"].str[-3:].isin(italian_end.split())),"country"] = "Italy"

#Lots of wrong ethnicities in the Italian dataset for some reason, slavic in particular, cleaning it.
df_italy = df[df["country"]== "Italy"]
df_other = df[df["country"] != "Italy"]

mask = (df_italy["count"] > 10) & (df_italy["name"].str[-1:].isin(["A","E","I","O",'N'])) & (~df_italy["name"].str.contains('[YWKXJ]'))
df_italy = df_italy[mask]
df = pd.concat([df_other,df_italy])
print("Italy done")

#Polish names were scattered across the world. Making the assumption that the names in SKI and SKY are polish which is mostly right.
df.loc[df["name"].str[-3:].isin(["SKI",'SKY']),"country"] = "Poland"

#The Iranian data was mixed with a lot of slavic data 
df_iran = df[df["country"].str.lower() == "Iran"]
df_other = df[df["country"].str.lower() != "Iran"]

consonants = "[KLMCNDPKRSTWVSXYZ]"
mask = (df_iran["name"].str.len() > 5) & (~df_iran['name'].isin(df_other["name"])) & (~df_iran["name"].str.contains(consonants*3 + "|W|Z[BCDFGKLMNPKRSTWXYZ]|K$|Y$|OS$|J$|O$|A$|ZK|ZV|OV$|VL|ORF$|CHA$|SV|OVA$|SKI|SKY|EK$|SKA|VA$|IK$|EC$|KA$|NA$|KY$|C$|TA$|AN$|ER$"))
df_iran = df_iran[mask]
df = pd.concat([df_other,df_iran])
print("Iran done")

#removing countries that are ambiguous because of a lot of immigration or mixed ethnicities inside them.
df= df[~df["country"].isin(["Switzerland","Belgium",'United States','Maldives','Philippines','Canada','Israel','Palestine','Papua New Guinea','Luxembourg','Fiji','Qatar','United Arab Emirates','Bahrain'])]
print("Countries removed")

df.to_csv("intermediate_surnames.csv.zip",compression="zip",index=None)
print('Length of dataframe : ', len(df))


Length of dataframe :  3556056
Italy done
Iran done
Countries removed
Length of dataframe :  2848834


## Filtering the names

I modify the Mazière Roth approach by combining the Herfindahl-Hirschman Index with the Gini index to determine the threshold below which to remove names. 

Using only the hhi causes the most common names to be removed from the dataset. The gini index puts more emphasis on the upper tail of the distribution. I make the assumption that if the top 2~3 countries have a much higher prevalence of a name than the rest then the name originates from those countries. I find that using the mean of the hhi and gini yields the best results.

In [3]:
def gini_coefficient(x):
    diffsum = 0
    for i, xi in enumerate(x[:-1], 1):
        diffsum += np.sum(np.abs(xi - x[i:]))
    return diffsum / (len(x)**2 * np.mean(x))


#computing the country level name frequency, removing names that have a frequency lower than 0.0000005
df["freq"] = df["count"]/df.groupby("country")["count"].transform(sum)
df = df[df["freq"] > 0.0000005]

#computing the hhi
standardized_freq = df["freq"]/df.groupby("name")["freq"].transform(sum)
df["hhi"] = (standardized_freq**2).groupby(df['name']).transform(sum)
print("hhi done")

#computing the gini index
gini = df.groupby('name')["freq"].apply(np.array).apply(gini_coefficient)
df = df.merge(gini.rename("gini"),left_on="name",right_index=True)
print("gini done")

df["mean"] = df[["gini","hhi"]].mean(axis=1)


#for some reason names that are most prevalent in France, Angola and Libya combined are all highly common west african names that would be removed otherwise
# reclassifying them in Burkina Fasso
a = df.sort_values(["name",'freq'],ascending=False).groupby('name').head(5) #keeping the top 5 countries for each name
a= a.groupby('name')["country"].apply(np.array).astype(str)
african_names = a[(a.astype(str).str.contains("France")) & (a.astype(str).str.contains("Angola")) & (a.astype(str).str.contains("Libya"))].index

df.loc[(df["name"].isin(african_names)), ["country","gini"]] = ["Burkina Faso",0]
print("West Africa done")

df_unique = df[df["gini"] == 0]
df_other= df[(df["gini"] != 0)]

df_other = df_other[df_other["mean"] > 0.50] #rather arbitrary threshold that I find works well in practice.

df_other = df_other.sort_values(["name","freq"],ascending=False).drop_duplicates("name")
df = pd.concat([df_unique,df_other])

df = df[["country","name"]]
df.to_csv("ready_data.csv.zip",index=False,compression="zip")
print("Done. Length of dataframe : ", len(df))

hhi done
gini done
West Africa done
Done. Length of dataframe :  1359594


# Training the model

## Training on facebook data

In [4]:
df = pd.read_csv("ready_data.csv.zip",dtype={"country" : 'category', 'name' : "string"})
labels = pd.read_csv("../data/other_data/labels2.csv",dtype={"country" : 'category','region' : 'category'})
df = df.merge(labels[["country",'region']]).dropna()


def padding(name):
    padded_text = "^" + name + "$"
    return padded_text

padder = FunctionTransformer(padding)
vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(2,8))
SGD_model = SGDClassifier(class_weight="balanced")


FB_clf = Pipeline(steps=[("padding",padder),
                      ('vectorizer', vectorizer),
                      ('model', SGD_model)])

X = df["name"]
y = df["region"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


FB_clf.fit(X_train,y_train)

y_pred = FB_clf.predict(X_test)
print_metrics(y_test,y_pred)

Balanced accuracy score :  0.778
Cohen Kappa score :  0.775


Unnamed: 0,precision,recall,f1-score,support
CentralSouthEuropean,0.922684,0.845253,0.882273,66528.0
Slavic,0.819367,0.867137,0.842576,15437.0
Arabian,0.841252,0.827542,0.834341,40375.0
weighted avg,0.83313,0.820285,0.825196,203940.0
NorthEuropean,0.804873,0.840544,0.822322,37929.0
accuracy,0.820285,0.820285,0.820285,0.820285
African,0.796608,0.78619,0.791365,27604.0
macro avg,0.719016,0.778038,0.735002,203940.0
Indian,0.628411,0.671129,0.649068,14343.0
Asian,0.219916,0.608469,0.323067,1724.0


## Training on mixed data

Let's add the data from the Pubmed dataset for Asian and Indian regions where the facebook data is lacking and performance is poor.

In [49]:
df = pd.read_csv("ready_data.csv.zip",dtype={"country" : 'category', 'name' : "string"})
labels = pd.read_csv("../data/other_data/labels2.csv",dtype={"country" : 'category','region' : 'category'})
df = df.merge(labels[["country",'region']])

#Lets oversample east asian countries which have both about a couple hundred extremely common names so that they don't get drowned in the data
df_east_asia = resample(df[df["country"].isin(["China",'South Korea',"Taiwan",'Macao'])], n_samples=20000)


df_maz = pd.read_csv('../data/other_data/maziere_roth_dataset.csv' ,dtype={"country" : 'category', 'name' : "string"})
df_maz = df_maz[df_maz["region"].isin(['Asian','Indian'])]

df = pd.concat([df,df_maz,df_east_asia]).dropna()

padder = FunctionTransformer(padding)
vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(2,8))
SGD_model = SGDClassifier(class_weight="balanced")

MX_clf = Pipeline(steps=[("padding",padder),
                      ('vectorizer', vectorizer),
                      ('model', SGD_model)])

X = df["name"]
y = df["region"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


MX_clf.fit(X_train,y_train)

y_pred = MX_clf.predict(X_test)
print_metrics(y_test,y_pred)

Balanced accuracy score :  0.814
Cohen Kappa score :  0.779


Unnamed: 0,precision,recall,f1-score,support
CentralSouthEuropean,0.916199,0.845268,0.879305,66431.0
Slavic,0.80685,0.872742,0.838503,15331.0
Arabian,0.826987,0.818468,0.822705,40676.0
weighted avg,0.822411,0.818507,0.819368,223860.0
accuracy,0.818507,0.818507,0.818507,0.818507
NorthEuropean,0.790631,0.843648,0.816279,37729.0
macro avg,0.790744,0.813649,0.800615,223860.0
African,0.766605,0.788021,0.777166,27564.0
Asian,0.677766,0.819521,0.741933,11475.0
Indian,0.750172,0.707877,0.728411,24654.0


In [50]:
#saving the model
from joblib import dump
dump(MX_clf, 'mixed_model.joblib') 


['mixed_model.joblib']