<font size="3">Here I reproduce the data transformation and machine learning parts of Antoine Mazières and Camille Roth's paper *'Large-scale diversity estimation through surname origin inference'* : https://namograph.antonomase.fr/ .
First doing the same thing on the same data without reusing their code and trying a different model for the sake of fun.</font>

In [1]:
import pandas as pd
import numpy as np
from string import punctuation, ascii_letters
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import pickle
#dictionnary mapping non ascii to ascii characters
caract_mapping = pd.read_html('https://docs.oracle.com/cd/E29584_01/webhelp/mdex_basicDev/src/rbdv_chars_mapping.html')
carac_dict = pd.DataFrame(np.concatenate([caract_mapping[i].iloc[:,[1,2]].values for i in (0,1)])).set_index(0).to_dict('dict')[1]

def name_cleaner(name):
    clean = ""
    for letter in name:
        if (letter.lower() in ascii_letters) or (letter.lower() in """ ''`'-  """):
            clean += letter
        elif letter in punctuation:
            return 'ERROR'
        elif letter in carac_dict.keys():
            clean += carac_dict[letter]
        else:
            return 'ERROR'
            
    return clean.upper()

#reusing exactly the same clusters
clusters = {
    "African": ["Zimbabwe","Rwanda","Zambia","Malawi","Tanzania","Uganda","Kenya","Dem. Rep. Congo","Congo","South Africa","Gambia","Botswana","Mozambique","Mali","Trinidad and Tobago","Gabon","Cameroon","Benin","Côte d'Ivoire","Burkina Faso","Togo","Senegal","Nigeria","Ghana","Ethiopia"],
    "Asian": ["Vietnam","China","Thailand","Cambodia","Taiwan","Korea","Lao PDR","Japan","Indonesia","Philippines"],
    "Indian": ["Nepal","India","Sri Lanka","Mongolia","Pakistan","Malaysia","Bangladesh","Iran"],
    "Arabian": ["Sudan","Libya","Egypt","Tunisia","Morocco","Algeria","United Arab Emirates","Qatar","Lebanon","Syria","Jordan","Palestine","Saudi Arabia","Kuwait","Iraq","Oman","Yemen"],
    "Slavic": ["Poland","Macedonia","Ukraine","Belarus","Russia","Kazakhstan","Bulgaria","Slovakia","Czech Rep.","Croatia","Bosnia and Herz.","Serbia","Montenegro"],
    "NorthEuropean": ["Norway","Jamaica","Denmark","Sweden","Netherlands","Belgium","Germany","Austria","United Kingdom","Australia","Canada","New Zealand","United States","Ireland","Israel","Switzerland","Luxembourg","France","Iceland"],
    "CentralSouthEuropean": ["Slovenia","Hungary","Turkey","Latvia","Estonia","Finland","Italy","Albania","Romania","Lithuania","Greece","Cyprus","Georgia","Venezuela","Puerto Rico","Costa Rica","Spain","Mexico","Cuba","Colombia","Guatemala","Peru","Chile","Ecuador","Bolivia","Uruguay","Argentina","Panama","Portugal","Brazil"],
}

region_dict = pd.Series(clusters).explode().reset_index().iloc[:,[1,0]].set_index(0).to_dict("dict")["index"]

def print_metrics(y_test,y_pred):
    report = classification_report(y_test, y_pred, output_dict=True)
    df = pd.DataFrame(report).transpose()
    results = df.sort_values("f1-score",ascending=False)
    return results


# Creating the dataset

## loading data

In [2]:
#load and clean the dataframe

file_path = "../data/other_data/pubmed_name_country.csv"
length = 0
df_list = []
chunksize=2000000

#processing the csv in chunks because it's too heavy
for chunk in pd.read_csv(file_path,sep=";",header=None, dtype={0 : 'string',1 : "category"},chunksize=chunksize):
    df = chunk
    df.columns = ["name","country"]
    len1 = len(df)
    
    #cleaning the name
    df["name"] = df["name"].fillna("ERROR").apply(name_cleaner).str.strip() 
    df = df[df["name"] != "ERROR"]
    len2 = len(df)
    print(str(len1 - len2) + " incorrect names removed", end=", ")
    
    #handling arabic names starting with EL/AL
    df_arabic = df[df["name"].str[:3].str.contains("^[AE]L[- ]")]  
    df_arabic["name"] = df_arabic['name'].str.replace("-"," ").str.strip()
    print("arabic names split", end=", ")

    df = df[~df.index.isin(df_arabic.index)]
    
    #splitting names with dashes into multiple names
    df["name"] = df["name"].str.split("-") 
    df = df.explode("name")
    df = pd.concat([df_arabic,df])
    print("dashes handled", end=", ")

    df= df[~df["name"].str.contains("^[A-Z] [A-Z]$")] #removing one letter space one letter names
    df = df[(df["name"].str.len() > 1) & (df["name"].str.len() <=30)] #removing too short and too long names
    df_list.append(df)
    
    length +=len(df)
    print("Chunk processed, total rows processed : ", length)
df = pd.concat(df_list)
del df_list
df = df.groupby(["country","name"])["name"].count().rename("count").reset_index()
df
#ps : the paper's code says that names present less than 1000 times are removed but that's not the case. The dataframe would be 68 000 rows long otherwise!

230 incorrect names removed, 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_arabic["name"] = df_arabic['name'].str.replace("-"," ").str.strip()


arabic names split, dashes handled, Chunk processed, total rows processed :  2073455
272 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  4145818
228 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  6217048
214 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  8289071
254 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  10360287
234 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  12431940
223 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  14503179
229 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  16577115
195 incorrect names removed, arabic names split, dashes handled, Chunk processed, total rows processed :  1864908

Unnamed: 0,country,name,count
0,Afghanistan,AAMOON,1
1,Afghanistan,ABDUL,1
2,Afghanistan,ADEGBOYE,2
3,Afghanistan,ADEL,1
4,Afghanistan,AHADI,1
...,...,...,...
1913037,Zimbabwe,ZISHIRI,3
1913038,Zimbabwe,ZVANDASARA,9
1913039,Zimbabwe,ZVAUYA,2
1913040,Zimbabwe,ZVINAVASHE,1


## Cleaning the dataset

As explained in the paper, we assume that names that highly cluster in one country originate from this country. We use the Herfindahl-Hirschman Index to check this.

In [3]:
data  = df.copy()

#computing country level frequency
data["freq"] = data["count"]/data.groupby("country")["count"].transform(sum)

#computing normalized frequency
standardized_freq = data["freq"]/data.groupby("name")["freq"].transform(sum)

#computing the hhi
data["hhi"] = (standardized_freq**2).groupby(data['name']).transform(sum)

data = data[(data["hhi"] > 0.8) & (data['freq'] > 0.000001)]
data = data.sort_values(['name','freq'],ascending=False).drop_duplicates("name")

data["region"] = data["country"].apply(lambda x : region_dict[x] if x in region_dict.keys() else None)
data

Unnamed: 0,country,name,count,freq,hhi,region
1438045,Uganda,ZZIWA,1,0.000087,1.000000,African
965870,Malaysia,ZZ,1,0.000015,1.000000,Indian
1124428,Poland,ZYZYNSKA,3,0.000011,1.000000,Slavic
1088809,Panama,ZYZNIEUSKI,1,0.000367,0.998398,CentralSouthEuropean
1124427,Poland,ZYZELEWICZ,2,0.000008,1.000000,Slavic
...,...,...,...,...,...,...
1218130,Spain,DUENAS,1,0.000001,1.000000,CentralSouthEuropean
1218129,Spain,CEBALLOS,1,0.000001,1.000000,CentralSouthEuropean
1096665,Poland,BLASZCZYK,1,0.000004,1.000000,Slavic
791287,Israel,BEN,2,0.000009,1.000000,NorthEuropean


# Training the model

## Reproducing the original model

In [7]:
#traing the model

def padding(name):
    padded_text = "^" + name + "$"
    return padded_text

padder = FunctionTransformer(padding)
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 4))
NB_model = MultinomialNB(alpha=0.01, fit_prior=True)


clf = Pipeline(steps=[("padding",padder),
                      ('vectorizer', vectorizer),
                      ('model', NB_model)])


data = data.dropna()

X = data["name"]
y = data["region"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
print_metrics(y_test,y_pred)
#checks out!

Unnamed: 0,precision,recall,f1-score,support
CentralSouthEuropean,0.80752,0.715045,0.758474,28415.0
Slavic,0.634583,0.835229,0.721211,9583.0
weighted avg,0.71622,0.69501,0.698376,96590.0
accuracy,0.69501,0.69501,0.69501,0.69501
NorthEuropean,0.774949,0.623629,0.691103,32561.0
Asian,0.610533,0.76618,0.679558,6582.0
Indian,0.633328,0.716803,0.672485,10159.0
macro avg,0.63162,0.712179,0.662521,96590.0
Arabian,0.523134,0.719406,0.605768,4715.0
African,0.437294,0.608962,0.509044,4575.0


## Improving performance

Let's start by switching to a SGDClassifier with a wider ngram range

In [8]:
data = data.dropna()

vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(2,8))
SGD_model = SGDClassifier(class_weight="balanced",max_iter=1000)

clf = Pipeline(steps=[("padding",padder),
                      ('vectorizer', vectorizer),
                      ('model', SGD_model)])

X = data["name"]
y = data["region"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
print_metrics(y_test,y_pred)

Unnamed: 0,precision,recall,f1-score,support
CentralSouthEuropean,0.831892,0.788914,0.809833,28415.0
Asian,0.716462,0.807353,0.759197,6582.0
Slavic,0.674447,0.865387,0.758079,9583.0
weighted avg,0.761115,0.743038,0.74528,96590.0
Indian,0.703357,0.791909,0.745011,10159.0
accuracy,0.743038,0.743038,0.743038,0.743038
NorthEuropean,0.822631,0.648383,0.725187,32561.0
macro avg,0.682047,0.759845,0.712448,96590.0
Arabian,0.542563,0.759703,0.63303,4715.0
African,0.482975,0.657268,0.5568,4575.0


Then by training the model on a dataset where the United States, Canada and Switzerland have removed before computing the hhi because they are too heterogenous and are confusing the model.


In [9]:
# I've simply rerun the above step of cleaning data without those countries. Not shown here.
data = data.dropna()

X = data["name"]
y = data["region"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
print_metrics(y_test,y_pred)

Unnamed: 0,precision,recall,f1-score,support
CentralSouthEuropean,0.829712,0.790146,0.809446,28415.0
Slavic,0.680053,0.860586,0.759742,9583.0
Asian,0.683072,0.820267,0.745409,6582.0
weighted avg,0.76103,0.740284,0.74272,96590.0
Indian,0.696878,0.790924,0.740929,10159.0
accuracy,0.740284,0.740284,0.740284,0.740284
NorthEuropean,0.833059,0.637849,0.722501,32561.0
macro avg,0.676619,0.759977,0.708673,96590.0
Arabian,0.548877,0.751432,0.634378,4715.0
African,0.464682,0.668634,0.548306,4575.0


In [11]:
#saving the model
from joblib import dump
dump(clf, 'MR_model.joblib') 


['MR_model.joblib']

That's all !