In [2]:
#read data and data clean
import pandas as pd

df=pd.read_csv('name_gender.csv')

print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95025 entries, 0 to 95024
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    95025 non-null  object
 1   gender  95025 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB
None
        name gender
0    Aaban&&      M
1     Aabha*      F
2      Aabid      M
3  Aabriella      F
4      Aada_      F


In [3]:
dfm=df.copy()

dfm['name']=dfm.name.str.strip()
dfm['gender']=dfm.gender.str.strip()
dfm['name']=dfm.name.str.replace('[^a-zA-Z]', '') #only keep the string


print(dfm.isnull().sum()) #check null

print(dfm['name'].value_counts())#check duplication
print(dfm['gender'].value_counts()) # check balance between two group

#dfm.to_csv('dfm.csv')
print(dfm.head())

name      0
gender    0
dtype: int64
Aaban      1
Mural      1
Murdock    1
Murdoch    1
Murdoc     1
          ..
Giya       1
Givonni    1
Givonna    1
Givon      1
Zzyzx      1
Name: name, Length: 95025, dtype: int64
F    60304
M    34721
Name: gender, dtype: int64
        name gender
0      Aaban      M
1      Aabha      F
2      Aabid      M
3  Aabriella      F
4       Aada      F


  dfm['name']=dfm.name.str.replace('[^a-zA-Z]', '') #only keep the string


In [4]:

def data_encode(df,training=True,normalize=True):
    '''
    This function takes in dataframe and returns an datafraem with encoding name and gender
    @param df: dataframe(n,2)
    @param training: bool,default=True
    @param normalize: bool, default=True(sklearn),False(LSTM)
    
    @return: dataframe(m+n, 2)
    '''
    
    df['name'] = df['name'].str.lower()
    df['name'] = [list(name) for name in df['name']]

    name_length = 50
    df['name'] = [
        (name + [' ']*name_length)[:name_length] 
        for name in df['name']
    ]

    if normalize:
        df['name'] = [
            [
                max(0,(ord(char)-ord('a'))/(ord('z')-ord('a'))) 
                for char in name
            ]
            for name in df['name']
        ]
    else:
        df['name'] = [
            [
                max(0.0, ord(char)-96.0)  
                for char in name
            ]
            for name in df['name']
        ]

    if training:
        df['gender'] = [0.0 if gender=='F' else 1.0 for gender in df['gender']]
    
    return df

                        
dfm=data_encode(dfm,normalize=False)

                    
print(dfm.head())
print(dfm.info())


                                                name  gender
0  [1.0, 1.0, 2.0, 1.0, 14.0, 0.0, 0.0, 0.0, 0.0,...     1.0
1  [1.0, 1.0, 2.0, 8.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...     0.0
2  [1.0, 1.0, 2.0, 9.0, 4.0, 0.0, 0.0, 0.0, 0.0, ...     1.0
3  [1.0, 1.0, 2.0, 18.0, 9.0, 5.0, 12.0, 12.0, 1....     0.0
4  [1.0, 1.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95025 entries, 0 to 95024
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    95025 non-null  object 
 1   gender  95025 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.5+ MB
None


In [7]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
    # embedding function for LSTM
    model = Sequential([
        Embedding(num_alphabets, embedding_dim, input_length=name_length),
        Bidirectional(LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)),
        Dense(1, activation="sigmoid")
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])

    return model

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Instantiate the model
model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)

# Step 2: Split Training and Test Data
X = np.asarray(dfm['name'].values.tolist())
y = np.asarray(dfm['gender'].values.tolist())

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y                                                
                                                   )

# Step 3: Train the model
callbacks = [
    EarlyStopping(monitor='val_accuracy',
                  min_delta=1e-3,
                  patience=5,
                  mode='max',
                  restore_best_weights=True,
                  verbose=1),
]

history = model.fit(x=X_train,
                    y=y_train,
                    batch_size=640,
                    epochs=30,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks)

# Step 4: Save the model
model.save('gender.h5')

# Step 5: Plot accuracies
plt.figure(figsize=(12,8))
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.xlabel('Epochs',fontsize=12)
plt.ylabel('Accuracy',fontsize=12)
plt.legend()
plt.title('LSTM Model',fontsize=12)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30

In [6]:
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np

pred_model = load_model('gender.h5')

# Input names
names = ['Joe', 'Biden', 'Kamala', 'Harris']

# Convert to dataframe
pred_df = pd.DataFrame({'name': names})

# Preprocess
pred_df = data_encode(pred_df,training=False)

# Predictions
result = pred_model.predict(np.asarray(
    pred_df['name'].values.tolist())).squeeze(axis=1)

pred_df['F or M?'] = [
    'M' if logit > 0.5 else 'F' for logit in result
]

pred_df['Probability'] = [
    logit if logit > 0.5 else 1.0 - logit for logit in result
]

# Format the output
pred_df['name'] = names
pred_df.rename(columns={'name': 'Name'}, inplace=True)
pred_df['Probability'] = pred_df['Probability'].round(2)
pred_df.drop_duplicates(inplace=True)

pred_df.head()



Unnamed: 0,Name,F or M?,Probability
0,Joe,F,0.51
1,Biden,F,0.51
2,Kamala,F,0.51
3,Harris,F,0.51


In [4]:
#ENSEMble model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


dfm=df.copy()

dfm['name']=dfm.name.str.strip()
dfm['gender']=dfm.gender.str.strip()
dfm['name']=dfm.name.str.replace('[^a-zA-Z]', '') #only keep the string


print(dfm.isnull().sum()) #check null

print(dfm['name'].value_counts())#check duplication
print(dfm['gender'].value_counts()) # check balance between two group

name      0
gender    0
dtype: int64
Aaban      1
Mural      1
Murdock    1
Murdoch    1
Murdoc     1
          ..
Giya       1
Givonni    1
Givonna    1
Givon      1
Zzyzx      1
Name: name, Length: 95025, dtype: int64
F    60304
M    34721
Name: gender, dtype: int64


  dfm['name']=dfm.name.str.replace('[^a-zA-Z]', '') #only keep the string


In [5]:
def data_encode(df,training=True,normalize=True):
    '''
    This function takes in dataframe and returns an datafraem with encoding name and gender
    @param df: dataframe(n,2)
    @param training: bool,default=True
    @param normalize: bool, default=True(sklearn),False(LSTM)
    
    @return: dataframe(m+n, 2)
    '''
    
    df['name'] = df['name'].str.lower()
    df['name'] = [list(name) for name in df['name']]

    name_length = 50
    df['name'] = [
        (name + [' ']*name_length)[:name_length] 
        for name in df['name']
    ]

    if normalize:
        df['name'] = [
            [
                max(0,(ord(char)-ord('a'))/(ord('z')-ord('a'))) 
                for char in name
            ]
            for name in df['name']
        ]
    else:
        df['name'] = [
            [
                max(0.0, ord(char)-96.0)  
                for char in name
            ]
            for name in df['name']
        ]

    if training:
        df['gender'] = [0.0 if gender=='F' else 1.0 for gender in df['gender']]
    
    return df

                        
dfm=data_encode(dfm)

                    
print(dfm.head())
print(dfm.info())

                                                name  gender
0  [0, 0, 0.04, 0, 0.52, 0, 0, 0, 0, 0, 0, 0, 0, ...     1.0
1  [0, 0, 0.04, 0.28, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     0.0
2  [0, 0, 0.04, 0.32, 0.12, 0, 0, 0, 0, 0, 0, 0, ...     1.0
3  [0, 0, 0.04, 0.68, 0.32, 0.16, 0.44, 0.44, 0, ...     0.0
4  [0, 0, 0.12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95025 entries, 0 to 95024
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    95025 non-null  object 
 1   gender  95025 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.5+ MB
None


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold
import numpy as np

X = np.asarray(dfm['name'].values.tolist())
y = np.asarray(dfm['gender'].values.tolist())
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y                                                
                                                )

seed = 1073
results = []
names = []
scoring = 'accuracy'

models = [
        ('ET', ExtraTreesClassifier()),
        ('RF', RandomForestClassifier()),
        ]

for name, model in models:
    kfold = KFold(n_splits=10, random_state=seed,shuffle=True)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



ET: 0.778353 (0.004888)
RF: 0.784046 (0.004836)


In [7]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
import joblib

# Random forest hyperparameter tuning
# n_jobs=-1 to allow run it on all cores
params = {
    'n_estimators': [100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

gs1 = GridSearchCV(RandomForestClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=3), scoring='roc_auc')
gs1.fit(X_train, y_train)

#ExtraTree hyperparameter tuning
params = {
    'n_estimators': [100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

gs3 = GridSearchCV(ExtraTreesClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=3), scoring='roc_auc')
gs3.fit(X_train, y_train)

#ensemble
votes = [
    ('rf', gs1.best_estimator_),
    ('xt', gs3.best_estimator_)
]

# soft voting based on weights
votesClass = VotingClassifier(estimators=votes, voting='soft', n_jobs=-1)
votesClass.fit(X_train, y_train)

model = votesClass
y_test_hat = model.predict(X_test)
print(classification_report(y_test, y_test_hat))

joblib.dump(model, 'gender.pkl') 

360 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 442, in fit
    trees = Parallel(
  File "/usr/local/lib/python3.9/site-packages/joblib/parallel.py", line 1054, in __call__
    self.retrieve()
  File "/usr/local/lib/python3.9/site-packages/joblib/parallel.py", line 933, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/usr/local/Cellar/python@3.9/3.9.10/

              precision    recall  f1-score   support

         0.0       0.80      0.89      0.84     12061
         1.0       0.76      0.62      0.69      6944

    accuracy                           0.79     19005
   macro avg       0.78      0.76      0.76     19005
weighted avg       0.79      0.79      0.79     19005



OSError: [Errno 28] No space left on device