In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the national baby-names table. The path is relative, so it depends on the
# kernel's working directory.
data = pd.read_csv("../datasets/NationalNames.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [4]:
# Replace the Gender strings with integer codes so they can be averaged per name.
# NOTE(review): the per-name table shown further down suggests F -> 0 and M -> 1 — confirm.
data["Gender"] = LabelEncoder().fit_transform(data["Gender"])

In [5]:
# Collapse the table to one row per name holding the mean of the encoded Gender
# column. Gender is selected *before* aggregating so the mean is not also computed
# for Id, Year and Count only to be discarded afterwards.
# NOTE(review): this is a plain mean over the rows of each name and is not weighted
# by Count — confirm that this is the intended target.
df = data.groupby("Name")["Gender"].mean().reset_index()

In [6]:
# Display the per-name table (pandas elides the middle rows of a long frame).
df

Unnamed: 0,Name,Gender
0,Aaban,1.0
1,Aabha,0.0
2,Aabid,1.0
3,Aabriella,0.0
4,Aadam,1.0
...,...,...
93884,Zytavious,1.0
93885,Zyvion,1.0
93886,Zyyanna,0.0
93887,Zyyon,1.0


In [7]:
import string

In [8]:
# The lowercase ASCII letters, in alphabetical order, as a list.
letters = [*string.ascii_lowercase]

In [9]:
# Letter -> integer id, numbered from 1 ('a' -> 1, ..., 'z' -> 26); no letter maps to 0.
vocab = {letter: idx for idx, letter in enumerate(letters, start=1)}

In [10]:
# Integer id -> letter: the inverse lookup, with ids numbered from 1.
rev_vocab = dict(enumerate(letters, start=1))

In [11]:
# Sample name, lowercased because the vocabulary only has lowercase letters as keys.
name = "Elizabeth".lower()

In [12]:
# Encode the sample name as a list of integer ids, one per character.
seq = [vocab[ch] for ch in name]

In [13]:
# Round-trip check: decoding the ids should spell the sample name letter by letter.
[rev_vocab[item] for item in seq]

['e', 'l', 'i', 'z', 'a', 'b', 'e', 't', 'h']

In [14]:
# Encode every name in df as a list of letter ids (lowercased first, because vocab
# only has lowercase keys). A comprehension is used instead of a `for name in ...`
# loop so the notebook-level `name` variable is not overwritten as a side effect;
# the loop used to leave `name` set to the last, mixed-case dataset entry, which
# broke re-running the earlier cell that encodes `name` directly.
X = [[vocab[ch] for ch in person.lower()] for person in df.Name]

In [15]:
# Target vector: the per-name mean of the encoded Gender column.
# NOTE(review): because this is a mean, names recorded under both genders may get a
# fractional value rather than exactly 0 or 1 — confirm the loss/metric expect that.
y = df.Gender

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Turn the variable-length id lists into one fixed-width array with 10 columns.
# NOTE(review): this relies on the pad_sequences defaults for the pad value and for
# which side is padded/truncated — presumably zeros are added on the left and names
# longer than 10 letters lose their leading letters; confirm this is intended.
X = pad_sequences(X, maxlen=10)

In [18]:
# Sanity check: one row per name, 10 encoded positions per row.
X.shape

(93889, 10)

In [19]:
# Sanity check: one target per name; the length should match the first dimension of X.
y.shape

(93889,)

In [20]:
from tensorflow.keras.layers import Input, Dense, SimpleRNN, Embedding
from tensorflow.keras.models import Model

In [21]:
# Network: letter ids -> 5-dimensional embeddings -> SimpleRNN(30) -> sigmoid score.
in_layer = Input(shape=(10,))
# input_dim is the number of letters plus one, because id 0 is not a letter: it is
# the value used to pad short names. mask_zero=True marks those padded positions as
# masked so the recurrent layer skips them instead of treating padding as a letter.
embedding = Embedding(input_dim=len(vocab) + 1, output_dim=5, mask_zero=True)(in_layer)
rnn = SimpleRNN(units=30)(embedding)
# Single sigmoid unit: one score in (0, 1) per name.
out_layer = Dense(units=1, activation="sigmoid")(rnn)

In [22]:
# Assemble the layer graph into a Keras model.
model = Model(inputs=in_layer, outputs=out_layer)

In [23]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 10, 5)             135       
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 30)                1080      
_________________________________________________________________
dense (Dense)                (None, 1)                 31        
Total params: 1,246
Trainable params: 1,246
Non-trainable params: 0
_________________________________________________________________


In [24]:
# The output layer is a single sigmoid unit predicting a value in (0, 1), so the
# model is trained with binary cross-entropy, the loss that matches a sigmoid
# output, instead of mean squared error.
# NOTE(review): "accuracy" compares thresholded predictions with the targets as
# given; if the targets contain fractional values those rows presumably can never
# count as correct — confirm, or binarize the targets before training.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [25]:
# Train on all encoded names: 5 passes over the data, 1000 names per batch.
model.fit(x=X, y=y.values, epochs=5, batch_size=1000)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8e68e0b0d0>

In [26]:
# Score a single name using the same steps as the training data:
# lowercase -> letter ids -> fixed width of 10.
name = "Shivani"
seq = [vocab[letter] for letter in name.lower()]
x_test = pad_sequences(sequences=[seq], maxlen=10)

# The cell displays the model's sigmoid output for this one name.
model.predict(x_test)

array([[0.14325114]], dtype=float32)