In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2022-12-19 11:57:31.115021: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
pd.options.display.max_seq_items = 2000
pd.options.display.max_columns = 500
pd.options.display.max_rows = 800
pd.options.display.max_colwidth = 200
pd.options.display.width = 800

In [45]:
current_dir = %pwd
head, _ = os.path.split(current_dir)
app_path = os.path.join(head, "app")
print(app_path)
# move to parent /app working directory for local imports
os.chdir(app_path)
# this approach looks complicated wrt os.chdir("..") but you don't have to worry to NOT run this cell twice

/Users/francesco/REPOS/nam-nat-CNN-clf/app


In [13]:
from utils.dev_utils import load_raw_data, drop_duplicates
from utils.prod_utils import unidecode_string, remove_digits_punctuation_doublespaces

In [14]:
filepath = os.path.join(app_path, "data", "names-by-nationality.csv")
data = load_raw_data(filepath)
data.rename(columns={"sex":"gender"}, inplace=True)
data.head()

csv loaded into dataframe of shape:  (4695, 3)


Unnamed: 0,name,gender,nationality
0,Aarav,Male,Indian
1,Aaryan,Male,Indian
2,Abha,Female,Indian
3,Abhay,Male,Indian
4,Abhilash,Male,Indian


In [15]:
data.dropna(inplace=True)
# unidecode (remove french accents)
#data["name"] = data["name"].apply(lambda x: unidecode_string(x))
# lower
data["name"] = data["name"].str.lower()
# remove noise
data["name"] = data["name"].apply(lambda x: remove_digits_punctuation_doublespaces(x))

In [16]:
print(data.loc[data["nationality"]=="French"])

               name   gender nationality
2329           aadi   Female      French
2330        abelard     Male      French
2331       abrielle   Female      French
2332         acadia   Female      French
2333        aceline   Female      French
2334         adelie   Female      French
2335      afrodille   Female      French
2336          aimee   Female      French
2337          aimon     Male      French
2338          alain     Male      French
2339         alaina   Female      French
2340      albertine   Female      French
2341      alexandre     Male      French
2342         alezae   Female      French
2343         aliane   Female      French
2344         alizeé   Female      French
2345         allete   Female      French
2346         alodie   Female      French
2347       alphonse     Male      French
2348        alsatia   Female      French
2349       amandine   Female      French
2350        amarine   Female      French
2351          ambre   Female      French
2352       ambro

In [17]:
data = drop_duplicates(data, ["name"])

No duplicates found


In [18]:
print(data.shape)
print(data.dtypes)
data.describe()

(4695, 3)
name           object
gender         object
nationality    object
dtype: object


Unnamed: 0,name,gender,nationality
count,4695,4695,4695
unique,4695,3,7
top,aarav,Female,American
freq,1,2573,1782


In [19]:
print(data["name"].str.len().max())

13



## Gender

### Split

In [20]:
# Split data into train val (85, 15)
X_train_gen, X_val_gen, y_train_gen, y_val_gen = train_test_split(
    data["name"],
    data["gender"],
    test_size=0.15,
    stratify=data["gender"],
)

assert y_train_gen.nunique() == y_val_gen.nunique()
print("training examples: ", len(X_train_gen))
print("validation examples: ", len(X_val_gen))

training examples:  3990
validation examples:  705


### Text Vectorization

In [21]:
# Tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

In [22]:
tk.fit_on_texts(X_train_gen)

In [23]:
tk.word_index

{'UNK': 1,
 'a': 2,
 'e': 3,
 'i': 4,
 'n': 5,
 'r': 6,
 'l': 7,
 'o': 8,
 's': 9,
 't': 10,
 'h': 11,
 'k': 12,
 'm': 13,
 'y': 14,
 'u': 15,
 'd': 16,
 'c': 17,
 'v': 18,
 'b': 19,
 'j': 20,
 'g': 21,
 'f': 22,
 'p': 23,
 'z': 24,
 'w': 25,
 'x': 26,
 'q': 27,
 'é': 28,
 'è': 29,
 'î': 30,
 'ç': 31,
 ' ': 32,
 'ë': 33}

In [25]:
# Save tokenizer
with open(os.path.join(app_path, "artifacts/training_data", "tokenizer.pkl"), "wb") as f:
    pickle.dump(tk, f, protocol=pickle.HIGHEST_PROTOCOL)

### Characters to index

In [26]:
# Convert string to index
train_sequences_gen = tk.texts_to_sequences(X_train_gen)
val_sequences_gen = tk.texts_to_sequences(X_val_gen)

In [27]:
print(type(train_sequences_gen))
train_sequences_gen

<class 'list'>


[[5, 4, 10, 3, 9, 11],
 [10, 4, 14, 2],
 [2, 16, 4, 10, 2],
 [19, 3, 7, 18, 2],
 [20, 2, 12, 2, 14, 7, 2],
 [13, 8, 10, 8, 14, 8],
 [9, 3, 5, 25, 3],
 [13, 4, 16, 5, 4, 21, 11, 10],
 [19, 7, 8, 5, 16, 3, 7, 7, 3],
 [10, 8, 12, 14, 8],
 [6, 4, 9, 11, 4],
 [23, 6, 3, 9, 9],
 [13, 2, 21, 2, 7, 4, 3],
 [7, 4, 7, 4, 14, 2],
 [12, 3, 3, 5, 2],
 [2, 4],
 [12, 2, 7, 12, 4, 5],
 [20, 4, 5],
 [2, 7, 3, 26, 4, 2],
 [12, 2, 14, 7, 2, 5, 2],
 [9, 11, 14, 2, 13],
 [24, 3, 5, 9, 11, 4, 6, 8],
 [2, 13, 2, 6, 2, 5, 10, 3],
 [19, 6, 2, 14, 2, 5],
 [3, 7, 8, 4, 9, 3],
 [16, 4, 23, 2, 12, 2],
 [17, 11, 3, 6, 6, 4, 9],
 [6, 2, 9, 11, 2, 25, 5],
 [14, 2, 10, 4, 5],
 [21, 2, 18, 6, 4, 7, 7, 2],
 [17, 11, 6, 4, 9, 2, 5, 5, 2],
 [2, 14, 8],
 [5, 2, 7, 4, 5],
 [16, 2, 7, 7, 14, 17, 3],
 [8, 5, 4, 12, 2],
 [7, 8, 15, 4, 9],
 [21, 7, 2, 22, 4, 6, 2],
 [11, 4, 5, 2, 12, 8],
 [12, 8, 10, 2, 6, 8],
 [13, 8, 5, 4, 9, 9, 2],
 [17, 8, 6, 3, 10, 10, 2],
 [13, 8, 5, 10, 3, 7],
 [2, 6, 2, 14, 5, 2],
 [20, 2, 5, 4, 9, 8, 5

In [28]:
print(max(len(sequence_train) for sequence_train in train_sequences_gen))
print(max(len(sequence_val) for sequence_val in val_sequences_gen))

13
10


In [29]:
max_len = 16

In [30]:
# Padding with 0
X_train_gen = pad_sequences(train_sequences_gen, maxlen=max_len, padding='post')
X_val_gen = pad_sequences(val_sequences_gen, maxlen=max_len, padding='post')

In [31]:
# Convert to numpy array
X_train_gen = np.array(X_train_gen, dtype='float32')
X_val_gen = np.array(X_val_gen, dtype='float32')
print(X_train_gen[0])

[ 5.  4. 10.  3.  9. 11.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


### One-hot encode classes

In [32]:
# One hot encoding of classes
ohe = OneHotEncoder(sparse=False)
y_train_gen = ohe.fit_transform(y_train_gen.to_numpy().reshape(-1, 1))
y_val_gen = ohe.fit_transform(y_val_gen.to_numpy().reshape(-1, 1))
print("one hot encoded categories: ", ohe.categories_)

one hot encoded categories:  [array(['Female', 'Male', 'Neutral'], dtype=object)]


In [33]:
# Save encoder (for inverse transform on prediction)
with open(os.path.join(app_path, "artifacts/training_data", "ohe_gender.pkl"), "wb") as f:
    pickle.dump(ohe, f)

In [34]:
# Save training data
np.savez_compressed(
    os.path.join(app_path, "artifacts/training_data", "gender_data.npz"),
    x_train=X_train_gen,
    y_train=y_train_gen,
    x_val=X_val_gen,
    y_val=y_val_gen,
)

## Nationality

### Split

In [35]:
# Split data into train val test (85, 15)
X_train_nat, X_val_nat, y_train_nat, y_val_nat = train_test_split(
    data["name"],
    data["nationality"],
    test_size=0.15,
    stratify=data["nationality"],
)

assert y_train_nat.nunique() == y_train_nat.nunique()
print("training examples: ", len(X_train_nat))
print("validation examples: ", len(X_val_nat))

training examples:  3990
validation examples:  705


### Characters to index

In [36]:
# Convert string to index (we use the same dictionary as in training. If there's an unknown character we have UNK)
train_sequences_nat = tk.texts_to_sequences(X_train_nat)
val_sequences_nat = tk.texts_to_sequences(X_val_nat)

In [37]:
print(max(len(sequence_train) for sequence_train in train_sequences_nat))
print(max(len(sequence_val) for sequence_val in val_sequences_nat))

13
10


In [38]:
# Padding with 0
X_train_nat = pad_sequences(train_sequences_nat, maxlen=max_len, padding='post')
X_val_nat = pad_sequences(val_sequences_nat, maxlen=max_len, padding='post')

In [39]:
# Convert to numpy array
X_train_nat = np.array(X_train_nat, dtype='float32')
X_val_nat = np.array(X_val_nat, dtype='float32')

### One-hot encode classes

In [40]:
# One hot encoding of classes
ohe = OneHotEncoder(sparse=False)
y_train_nat = ohe.fit_transform(y_train_nat.to_numpy().reshape(-1, 1))
y_val_nat = ohe.fit_transform(y_val_nat.to_numpy().reshape(-1, 1))
print("one hot encoded categories: ", ohe.categories_)

one hot encoded categories:  [array(['African', 'American', 'French', 'German', 'Indian', 'Japanese',
       'Russian'], dtype=object)]


In [41]:
# Save encoder (for inverse transform on prediction)
with open(os.path.join(app_path, "artifacts/training_data", "ohe_nationality.pkl"), "wb") as f:
    pickle.dump(ohe, f)

In [42]:
# Save training data
np.savez_compressed(
    os.path.join(app_path, "artifacts/training_data", "nationality_data.npz"),
    x_train=X_train_nat,
    y_train=y_train_nat,
    x_val=X_val_nat,
    y_val=y_val_nat,
)