In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
## Load the raw name/gender dataset from a CSV file (path is relative to the
## working directory). The frame is bound to `df` and read by later cells.
df = pd.read_csv("name_gender_dataset.csv")

In [3]:
## Preview the first 10 rows of the raw dataset
df.head(10)
# Disabled check -- `df_names` is only created in a later cell, so this cannot run here:
#print(df_names[df_names['Count'] == 10])

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1,John,M,5260831,0.014398
2,Robert,M,4970386,0.013603
3,Michael,M,4579950,0.012534
4,William,M,4226608,0.011567
5,Mary,F,4169663,0.011411
6,David,M,3787547,0.010366
7,Joseph,M,2695970,0.007378
8,Richard,M,2638187,0.00722
9,Charles,M,2433540,0.00666


In [4]:
# For names that appear under both genders, drop the row with the smaller count

# Step 1: Group by Name and then keep the row with the highest Count within each group
def keep_max_count(group):
    """Return the row of ``group`` whose 'Count' value is the largest."""
    top_label = group['Count'].idxmax()
    return group.loc[top_label]

# Keep, for every Name, only the row with the largest Count.
# idxmax() on the grouped 'Count' column yields one row label per name, and a
# single .loc lookup pulls those rows out. This selects the same rows as
# applying a Python function to every group, but it is vectorised and does not
# depend on groupby.apply handing the grouping column to that function.
df_names = df.loc[df.groupby('Name')['Count'].idxmax()].reset_index(drop=True)

df_names.head(10)

Unnamed: 0,Name,Gender,Count,Probability
0,A,F,2,5.47348e-09
1,A'Aff,F,1,2.73674e-09
2,A'Aron,M,1,2.73674e-09
3,A'Dele,F,1,2.73674e-09
4,A'Isha,F,1,2.73674e-09
5,A'Ishah,F,1,2.73674e-09
6,A'Jana,F,1,2.73674e-09
7,A'Janae,F,1,2.73674e-09
8,A'Lmos,M,1,2.73674e-09
9,A'Nette,F,1,2.73674e-09


In [5]:
# Rows for 'James' in the de-duplicated frame
n_rows = df_names.loc[df_names['Name'].eq('James')]
n_rows

Unnamed: 0,Name,Gender,Count,Probability
52308,James,M,5304407,0.014517


In [6]:
# Rows for 'James' in the raw frame `df`
name_row = df.loc[df['Name'].eq('James')]
name_row

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1615,James,F,23963,6.6e-05


In [7]:
## Check the dimensions of the dataframe: prints the (rows, columns) tuple of df_names
print(df_names.shape)

(133910, 4)


In [8]:
# Store each name's character count in a 'length' column, then order shortest-first
name_lengths = df_names['Name'].str.len()
df_names = df_names.assign(length=name_lengths).sort_values(by='length')
df_names.head(100)

Unnamed: 0,Name,Gender,Count,Probability,length
0,A,F,2,5.473480e-09,1
94371,P,M,6,1.642040e-08,1
13782,B,M,3,8.210220e-09,1
97891,R,F,1,2.736740e-09,1
47969,I,M,3,8.210220e-09,1
...,...,...,...,...,...
125392,Va,M,69,1.888350e-07,2
57393,Jl,M,37,1.012590e-07,2
32930,Dj,M,764,2.090870e-06,2
57375,Jj,M,544,1.488790e-06,2


In [9]:
# Re-sort so that the longest names come first, then display the frame
df_names = df_names.sort_values(by='length', ascending=False)
df_names


Unnamed: 0,Name,Gender,Count,Probability,length
20117,Carmelo-Antonio-Francesco,M,1,2.736740e-09,25
105736,Sandrine-Marie-Madeleine,F,1,2.736740e-09,24
89351,Nathan-Conway-Barrington,M,1,2.736740e-09,24
80486,Maria-Mandy-Antoninetta,F,1,2.736740e-09,23
84178,Merlin-Siegfried-Daniel,M,1,2.736740e-09,23
...,...,...,...,...,...
18874,C,M,1,2.736740e-09,1
41501,G,M,2,5.473480e-09,1
92784,O,F,1,2.736740e-09,1
104207,S,M,2,5.473480e-09,1


In [10]:
## We found the Probability column is not useful here, so we drop it.
## errors='ignore' keeps this cell re-runnable: once the column is gone, a
## second execution is a no-op instead of raising KeyError.
df_names = df_names.drop(columns='Probability', errors='ignore')
df_names

Unnamed: 0,Name,Gender,Count,length
20117,Carmelo-Antonio-Francesco,M,1,25
105736,Sandrine-Marie-Madeleine,F,1,24
89351,Nathan-Conway-Barrington,M,1,24
80486,Maria-Mandy-Antoninetta,F,1,23
84178,Merlin-Siegfried-Daniel,M,1,23
...,...,...,...,...
18874,C,M,1,1
41501,G,M,2,1
92784,O,F,1,1
104207,S,M,2,1


In [11]:
## Turn the gender labels into numeric classes: 'M' -> 1, 'F' -> 0
gender_codes = {'M': 1, 'F': 0}
df_names['Gender'] = df_names['Gender'].replace(gender_codes)
df_names.head(100)

Unnamed: 0,Name,Gender,Count,length
20117,Carmelo-Antonio-Francesco,1,1,25
105736,Sandrine-Marie-Madeleine,0,1,24
89351,Nathan-Conway-Barrington,1,1,24
80486,Maria-Mandy-Antoninetta,0,1,23
84178,Merlin-Siegfried-Daniel,1,1,23
...,...,...,...,...
79365,Malgosia-Dominique,0,1,18
118240,Tarlitah-Katreece,0,1,17
25195,Constantina-Maria,0,1,17
9086,Antonia-Elizabeth,0,1,17


In [12]:
## Remove single-letter names and names longer than 15 characters
too_long = df_names['length'] > 15
single_letter = df_names['length'] == 1
df_names.drop(df_names[too_long | single_letter].index, inplace=True)
df_names.head(10)

Unnamed: 0,Name,Gender,Count,length
106986,Seanchristopher,1,10,15
58514,Jordanchristoph,1,5,15
41559,Gabrielalexande,1,6,15
105285,Samantha-Evonne,0,1,15
95163,Patrick-Rajakna,1,1,15
95149,Patricia-Marion,0,1,15
28092,Darcy-Alexander,1,1,15
114552,Stella-Adelaide,0,1,15
114555,Stellafortunata,0,1,15
59579,Julie-Elizabeth,0,1,15


In [13]:
# Discard names whose Count is 5 or lower, then preview the shortest remaining names
is_rare = df_names['Count'] <= 5
df_names.drop(index=df_names[is_rare].index, inplace=True)
df_names.sort_values(by="length").head(10)


Unnamed: 0,Name,Gender,Count,length
2528,Ah,1,75,2
93096,Ok,0,35,2
113312,So,0,56,2
29241,De,1,705,2
77079,Ly,0,304,2
37632,En,1,17,2
34190,Du,1,24,2
112989,Sj,1,11,2
47973,Ia,0,164,2
133060,Zi,0,98,2


In [14]:
# Drop names containing a hyphen or an '@' sign.
# Both characters are matched in a single pass with a regex character class,
# and the filtered result is copied so that later column assignments on
# df_names act on an independent frame instead of a slice of the previous one
# (assigning into such a slice triggers pandas' SettingWithCopyWarning).
# NOTE(review): other non-letter characters (e.g. apostrophes) are not removed
# here -- confirm whether that is intended.
has_symbol = df_names['Name'].str.contains(r'[-@]', regex=True)
df_names = df_names[~has_symbol].copy()
df_names

Unnamed: 0,Name,Gender,Count,length
106986,Seanchristopher,1,10,15
41559,Gabrielalexande,1,6,15
104002,Ryanchristopher,1,45,15
23690,Christiananthon,1,12,15
82341,Matthewalexande,1,11,15
...,...,...,...,...
97894,Ra,1,113,2
123855,Tu,1,357,2
133678,Zy,1,48,2
44862,Ha,0,468,2


In [15]:
## Null check: count the missing values in each column of df_names
nan_check = df_names.isna().sum()
nan_check

Name      0
Gender    0
Count     0
length    0
dtype: int64

In [16]:
# Stand-alone illustration of taking the last three characters of each string.
# The toy frame has its own name so that it does not overwrite `df`, the raw
# dataset that earlier cells read from; pandas is already imported at the top
# of the notebook, so it is not imported again here.
data = {'name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily']}
sample_df = pd.DataFrame(data)

# New column holding the last three letters (strings of three characters or
# fewer are kept whole)
sample_df['last_three_letters'] = sample_df['name'].str[-3:]

# Print the DataFrame with the new column
print(sample_df)


      name last_three_letters
0    Alice                ice
1      Bob                Bob
2  Charlie                lie
3    David                vid
4    Emily                ily


In [17]:
## Upper-case every name so that letter case cannot cause mismatches later on
## (non-string values are passed through unchanged)
df_names['Name'] = [
    name.upper() if isinstance(name, str) else name
    for name in df_names['Name']
]
df_names.head(15)

Unnamed: 0,Name,Gender,Count,length
106986,SEANCHRISTOPHER,1,10,15
41559,GABRIELALEXANDE,1,6,15
104002,RYANCHRISTOPHER,1,45,15
23690,CHRISTIANANTHON,1,12,15
82341,MATTHEWALEXANDE,1,11,15
57955,JOHNCHRISTOPHER,1,132,15
23699,CHRISTIANMICHAE,1,11,15
23697,CHRISTIANJOSEPH,1,22,15
11422,ASHLEYELIZABETH,0,8,15
23796,CHRISTOPHERANTH,1,6,15


In [18]:
# Disabled experiment: suffix columns holding the last 3 / 2 / 1 characters of each name
# df_names['l3_letters'] = df_names['Name'].str[-3:]
# df_names['l2_letters'] = df_names['Name'].str[-2:]
# df_names['l1_letters'] = df_names['Name'].str[-1:]
# Show the 20 longest names
df_names.sort_values(by="length", ascending=False).head(20)

## print(df_names.shape)

Unnamed: 0,Name,Gender,Count,length
106986,SEANCHRISTOPHER,1,10,15
23800,CHRISTOPHERJOHN,1,118,15
41559,GABRIELALEXANDE,1,6,15
80521,MARIADELROSARIO,0,22,15
80518,MARIADELOSANGEL,0,36,15
81014,MARKCHRISTOPHER,1,6,15
41120,FRANCISCOJAVIER,1,130,15
23691,CHRISTIANDANIEL,1,7,15
23806,CHRISTOPHERRYAN,1,11,15
23804,CHRISTOPHERMICH,1,59,15


In [19]:
# Convert a name to per-character integer codes and right-pad the result with zeros
def string_to_ascii_padded(s, width=15):
    """Encode ``s`` as a list of integer codes, zero-padded to ``width`` entries.

    Each character ``c`` is mapped to ``ord(c) - 64``, so the upper-case
    letters 'A'..'Z' become 1..26. Any other character is encoded with the
    same formula and therefore lands outside 1..26.

    Parameters
    ----------
    s : str
        Text to encode.
    width : int, default 15
        Minimum length of the returned list. Shorter encodings are padded on
        the right with 0; longer ones are returned as-is (no truncation).

    Returns
    -------
    list of int
        The character codes followed by any zero padding.
    """
    ascii_values = [ord(c) - 64 for c in s]
    # A non-positive repeat count yields an empty list, so long inputs are left alone.
    ascii_values.extend([0] * (width - len(ascii_values)))
    return ascii_values

# Encode every name, then store the codes as one comma-separated string per row
code_lists = df_names['Name'].apply(string_to_ascii_padded)
ascii_text = [','.join(str(code) for code in codes) for codes in code_lists]

# assign() returns a new frame with 'name_ascii' as the last column
df_names = df_names.assign(name_ascii=ascii_text)

df_names.sort_index().head(20)

Unnamed: 0,Name,Gender,Count,length,name_ascii
13,AABAN,1,115,5,1121140000000000
14,AABHA,0,35,5,112810000000000
15,AABID,1,20,5,112940000000000
18,AABIR,1,10,5,1129180000000000
21,AABRIELLA,0,38,9,112189512121000000
24,AADA,0,13,4,114100000000000
25,AADAM,1,275,5,1141130000000000
26,AADAN,1,130,5,1141140000000000
27,AADARSH,1,210,7,11411819800000000
30,AADAYA,0,8,6,1141251000000000


In [20]:
# Drop the Name, Count and length columns; the remaining columns form df_pre
df_pre = df_names.drop(columns=["Name", "Count", "length"])
df_pre


Unnamed: 0,Gender,name_ascii
106986,1,19511438189192015168518
41559,1,71218951211252411445
104002,1,182511438189192015168518
23690,1,38189192091141142081514
82341,1,1312020852311252411445
...,...,...
97894,1,1810000000000000
123855,1,20210000000000000
133678,1,26250000000000000
44862,0,810000000000000


In [21]:
# Expand the comma-separated codes into one column per position
name_ascii_split = df_names['name_ascii'].str.split(',', expand=True)

# Put 'Gender' next to the code columns and cast everything to integers
result_df = pd.concat([df_names['Gender'], name_ascii_split], axis=1).astype(int)

# Reorder so that 'Gender' becomes the last column
cols = [col for col in result_df.columns if col != 'Gender'] + ['Gender']
df_pro = result_df[cols]

df_pro

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Gender
106986,19,5,1,14,3,8,18,9,19,20,15,16,8,5,18,1
41559,7,1,2,18,9,5,12,1,12,5,24,1,14,4,5,1
104002,18,25,1,14,3,8,18,9,19,20,15,16,8,5,18,1
23690,3,8,18,9,19,20,9,1,14,1,14,20,8,15,14,1
82341,13,1,20,20,8,5,23,1,12,5,24,1,14,4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97894,18,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
123855,20,21,0,0,0,0,0,0,0,0,0,0,0,0,0,1
133678,26,25,0,0,0,0,0,0,0,0,0,0,0,0,0,1
44862,8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
## Features X are all columns except the last one; target y is the last column
X, y = df_pro.iloc[:, :-1], df_pro.iloc[:, -1]


# Neural Network
Abstract Base layer

In [23]:
# Base class
class Layer:
    """Interface shared by all layers.

    The base class only stores placeholders for the last input and output;
    both propagation methods raise ``NotImplementedError`` and are meant to be
    overridden by subclasses.
    """

    def __init__(self):
        # Nothing has been propagated yet.
        self.input = self.output = None

    def forward_propagation(self, input):
        """Compute the output Y of the layer for a given input X."""
        raise NotImplementedError

    def backward_propagation(self, output_error, learning_rate):
        """Compute dE/dX for a given dE/dY (and update parameters if any)."""
        raise NotImplementedError

The Fully Connected Layer

In [24]:
# inherit from base class Layer
class FCLayer(Layer):
    """Fully connected layer computing ``Y = X . W + B``.

    Parameters
    ----------
    input_size : int
        Number of input nodes.
    output_size : int
        Number of output nodes.
    """

    def __init__(self, input_size, output_size):
        # Run the base initialiser so `input` and `output` exist before the
        # first forward pass.
        super().__init__()
        # np.random.rand draws from [0, 1); shifting by 0.5 centres the values on 0.
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    def forward_propagation(self, input_data):
        """Return the layer output for ``input_data``.

        A 1-D sample is treated as a single row, so the stored input is always
        at least 2-D and can be transposed in the backward pass.
        """
        self.input = np.atleast_2d(input_data)
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    def backward_propagation(self, output_error, learning_rate):
        """Update the weights and bias from ``output_error`` (dE/dY).

        Returns
        -------
        numpy.ndarray
            ``input_error`` = dE/dX, to be passed to the previous layer.
        """
        output_error = np.atleast_2d(output_error)
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)
        # The bias is shared by every row, so its gradient is the row-wise sum
        # of dE/dY; for a single row this equals output_error itself.
        bias_error = np.sum(output_error, axis=0, keepdims=True)

        # update parameters
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * bias_error
        return input_error

Activation Layer

In [25]:
# again inherit from base class Layer
class ActivationLayer(Layer):
    """Layer that applies an element-wise activation function.

    Parameters
    ----------
    activation : callable
        Function applied to the input in the forward pass.
    activation_prime : callable
        Derivative of ``activation``, evaluated on the stored input in the
        backward pass.
    """

    def __init__(self, activation, activation_prime):
        # Run the base initialiser so `input` and `output` exist before the
        # first forward pass.
        super().__init__()
        self.activation = activation
        self.activation_prime = activation_prime

    def forward_propagation(self, input_data):
        """Store ``input_data`` and return the activated input."""
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    def backward_propagation(self, output_error, learning_rate):
        """Return input_error = dE/dX for a given output_error = dE/dY.

        ``learning_rate`` is accepted for interface compatibility but unused,
        because this layer has no learnable parameters.
        """
        return self.activation_prime(self.input) * output_error

In [26]:
# Activation Function and its derivative
def tanh(x):
    """Hyperbolic tangent activation, computed with ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_prime(x):
    """Derivative of tanh: ``1 - tanh(x)**2``."""
    t = np.tanh(x)
    return 1 - t ** 2

In [27]:
# Loss function and its derivative
def mse(y_true, y_pred):
    """Mean squared error: the mean of the squared differences ``y_true - y_pred``."""
    residual = y_true - y_pred
    squared = np.power(residual, 2)
    return np.mean(squared)

def mse_prime(y_true, y_pred):
    """Gradient of the mean squared error with respect to ``y_pred``.

    The difference is divided by the number of elements in the broadcast
    difference ``y_pred - y_true``, i.e. the same element count that the mean
    in ``mse`` averages over. Using ``np.size`` on the difference (rather than
    ``y_true.size``) also lets plain Python numbers be passed as ``y_true``.

    Returns
    -------
    Same kind of object as ``y_pred - y_true``, holding ``2 * (y_pred - y_true) / n``.
    """
    diff = y_pred - y_true
    return 2 * diff / np.size(diff)

The Network Class

In [28]:
class Network:
    """Sequential stack of layers trained one sample at a time.

    Layers are objects exposing ``forward_propagation(input)`` and
    ``backward_propagation(output_error, learning_rate)``. A loss function and
    its derivative must be registered with ``use`` before calling ``fit``.
    """

    def __init__(self):
        self.layers = []
        self.loss = None
        self.loss_prime = None

    def add(self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def use(self, loss, loss_prime):
        """Set the loss function and its derivative used by ``fit``."""
        self.loss = loss
        self.loss_prime = loss_prime

    @staticmethod
    def _as_rows(data):
        """Return ``data`` in a form where ``data[i]`` is the i-th sample.

        pandas objects index by label (``frame[0]`` is a column, ``series[0]``
        a label lookup), so anything exposing ``to_numpy`` is converted to a
        NumPy array first. Lists and arrays are returned unchanged.
        """
        to_numpy = getattr(data, "to_numpy", None)
        return to_numpy() if callable(to_numpy) else data

    def _forward(self, sample):
        """Run ``sample`` through every layer in order and return the final output."""
        output = sample
        for layer in self.layers:
            output = layer.forward_propagation(output)
        return output

    def predict(self, input_data):
        """Return a list with the network output for each sample of ``input_data``.

        The sample dimension comes first: ``input_data[i]`` is the i-th sample.
        """
        rows = self._as_rows(input_data)
        return [self._forward(rows[i]) for i in range(len(rows))]

    def fit(self, x_train, y_train, epochs, learning_rate):
        """Train the network with per-sample gradient updates.

        For each epoch every sample is propagated forward, the loss is
        accumulated for display, and the loss derivative is propagated
        backwards through the layers in reverse order. The average loss over
        all samples is printed once per epoch.

        Parameters
        ----------
        x_train, y_train :
            Inputs and targets, sample dimension first.
        epochs : int
            Number of passes over the training data.
        learning_rate : float
            Step size handed to every layer's ``backward_propagation``.
        """
        x_rows = self._as_rows(x_train)
        y_rows = self._as_rows(y_train)
        samples = len(x_rows)

        # training loop
        for i in range(epochs):
            err = 0
            for j in range(samples):
                # forward propagation
                output = self._forward(x_rows[j])

                # compute loss (for display purpose only)
                err += self.loss(y_rows[j], output)

                # backward propagation
                error = self.loss_prime(y_rows[j], output)
                for layer in reversed(self.layers):
                    error = layer.backward_propagation(error, learning_rate)

            # calculate average error on all samples
            err /= samples
            print('epoch %d/%d   error=%f' % (i+1, epochs, err))