In [3]:
import nltk, random


nltk.download("popular")

from nltk.corpus import names
from _collections import defaultdict
from nltk.probability import FreqDist

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [None]:
import torch
import warnings
#import seaborn as sns
#import matplotlib.pyplot as plt

from torch import nn, optim, cuda
from torch.nn import functional as F
from torch.utils import data as torch_data
from sklearn import metrics
from sklearn.model_selection import train_test_split
from keras.layers import LSTM,Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Softmax
from tqdm import tqdm
from sklearn.metrics import classification_report
from keras.preprocessing import sequence
from sklearn.preprocessing import OneHotEncoder
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Embedding

import pandas as pd
import numpy as np
import os

#fig_size = plt.rcParams['figure.figsize']
device = 'cuda' if cuda.is_available() else 'cpu'
if device == 'cuda':
    print(f"Torch using {cuda.get_device_name()}")
else:
    print(f"Torch using {device}")

warnings.filterwarnings('ignore')
torch.manual_seed(0)
np.random.seed(0)

#%matplotlib inline

In [None]:
MALE_NAME = names.words('male.txt')
FEMALE_NAME = names.words('female.txt')

## Creating a dataframe from the text files

In [None]:
names_df = pd.DataFrame(columns=["name", "gender"])
for line in MALE_NAME:
    name = line[:-1] if line.endswith('\n') else line        
    names_df.loc[names_df.shape[0], :] = (name, "male")


for line in FEMALE_NAME:
    name = line[:-1] if line.endswith('\n') else line        
    names_df.loc[names_df.shape[0], :] = (name, "female")

names_df.name = names_df.name.str.lower()
puncts = "' -"
table = str.maketrans(dict.fromkeys(puncts))
names_df.name = names_df.name.str.translate(table)

In [None]:
female_count = names_df.gender.value_counts().female
male_count = names_df.gender.value_counts().male

print(names_df.gender.value_counts())
#_ = sns.countplot(x='gender', data=names_df)

female    5001
male      2943
Name: gender, dtype: int64


We see that the number of male names and female names are imbalanced. To keep the classes balanced, I will randomly select names from each class to balance the class distribution.

In [None]:
min_count = min(female_count, male_count)
male_df = names_df.loc[names_df.gender == 'male'].sample(n=min_count)
female_df = names_df.loc[names_df.gender == 'female'].sample(n=min_count)

# names_balanced = pd.concat([male_df, female_df]).sample(frac=1).reset_index().drop(["index"], axis=1)
names_balanced = names_df.copy()
print(names_balanced.gender.value_counts(normalize=True))
#_ = sns.countplot(x='gender', data=names_balanced)

female    0.629532
male      0.370468
Name: gender, dtype: float64


## Utilities

In [None]:
class_dict = {
    'female': 0,
    'male': 1
}

rev_class_dict = {val: key for key, val in class_dict.items()}


class History:
    def __init__(self, losses, accs, val_losses=None, val_accs=None):
        self.loss = losses
        self.accuracy = accs
        self.val_loss = val_losses
        self.val_accuracy = val_accs
    
    def __getitem__(self, val):
        return getattr(self, val)


def get_vocab(names):
    vocab = set()
    maxlen = 0
    for name in names:
        vocab.update(name)
        maxlen = max(maxlen, len(name))
    
    vocab.add('<PAD>')
    return sorted(vocab), maxlen


def pad_and_index(mapping, max_len):
    def inner(x):
        pad_length = max_len - len(x)
        new_name = [mapping['<PAD>']] * pad_length
#         new_name = []
        
        for char in x:
            new_name.append(mapping[char])
        
        return new_name
    return inner


def preprocess(df, mapping, max_len, test_size=0.2):
    df = df.copy().sample(frac=1)
    df["name_indexed"] = df.name.apply(pad_and_index(mapping, max_len))
    df["gender_labelled"] = df.gender.apply(lambda x: 0 if x == 'female' else 1)
    
    if not (0 <= test_size <= 1):
        raise ValueError(f"test_size must be float between 0 and 1. Got {test_size}")
    
    # split into train and test
    X = df.name_indexed.values
    y = df.gender_labelled.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size)
    
    # calculate class weights
    class_weights = [(y_train == 0).sum(), (y_train == 1).sum()]
    class_weights = [sum(class_weights) / weight for weight in class_weights]
    
    train_df = pd.DataFrame({
        'name_indexed': X_train,
        'gender_labelled': y_train
    })
    
    test_df = pd.DataFrame({
        'name_indexed': X_test,
        'gender_labelled': y_test
    })
    
    return train_df, test_df, class_weights


def fix_array(arr):
    x_ = []
    for row in arr:
        x_.append(row)

    return np.array(x_)

## Build vocabulary and char-int mappings

In [None]:
vocab, maxlen = get_vocab(names_balanced.name)

# integer to string
char_itos = dict(enumerate(vocab))

# string to integer
char_stoi = {val: key for key, val in char_itos.items()}

hidden_nodes = int(2 / 3 * (maxlen * len(char_itos.keys())))
print(hidden_nodes)

252


In [None]:
char_stoi

{'<PAD>': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

## Preprocess the dataframe

In [None]:
train_df, test_df, class_weights = preprocess(names_df, char_stoi, maxlen, test_size=0.25)
print(train_df.gender_labelled.value_counts())
print(test_df.gender_labelled.value_counts())

0    3751
1    2207
Name: gender_labelled, dtype: int64
0    1250
1     736
Name: gender_labelled, dtype: int64


In [None]:
train_df

Unnamed: 0,name_indexed,gender_labelled
0,"[0, 0, 0, 0, 0, 0, 0, 0, 11, 9, 18, 2, 5, 5]",0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 1, 25, 12, 5]",0
2,"[0, 0, 0, 0, 0, 0, 13, 1, 18, 18, 9, 12, 5, 5]",0
3,"[0, 0, 0, 0, 0, 0, 0, 0, 2, 18, 9, 20, 14, 9]",0
4,"[0, 0, 0, 0, 0, 0, 0, 0, 7, 21, 19, 20, 9, 5]",0
...,...,...
5953,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 12, 9, 8, 21]",1
5954,"[0, 0, 0, 0, 0, 0, 0, 1, 5, 18, 9, 5, 12, 1]",0
5955,"[0, 0, 0, 0, 0, 0, 0, 16, 1, 20, 18, 9, 3, 11]",1
5956,"[0, 0, 0, 0, 0, 0, 0, 0, 16, 9, 5, 18, 3, 5]",1


In [None]:
test_df

Unnamed: 0,name_indexed,gender_labelled
0,"[0, 0, 0, 0, 0, 0, 0, 11, 18, 9, 19, 20, 1, 12]",0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 12, 12, 5, 12]",1
2,"[0, 0, 0, 0, 0, 0, 0, 0, 11, 1, 12, 22, 9, 14]",1
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 21, 19, 25]",0
4,"[0, 0, 0, 0, 0, 0, 13, 1, 18, 19, 8, 1, 12, 12]",1
...,...,...
1981,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 5, 12, 1, 8]",0
1982,"[0, 0, 0, 0, 0, 0, 0, 0, 11, 15, 18, 14, 5, 25]",0
1983,"[0, 0, 0, 0, 0, 0, 0, 1, 2, 5, 12, 1, 18, 4]",1
1984,"[0, 0, 0, 0, 0, 11, 1, 19, 19, 1, 14, 4, 18, 1]",0


In [None]:
(X_train, y_train) = train_df.name_indexed,train_df.gender_labelled
(X_test, y_test) = test_df.name_indexed,test_df.gender_labelled

In [None]:
print(X_train[0])
print(y_train[0])
print('Shape of training data: ')
print(X_train[0].shape)
print(y_train[0].shape)
print('Shape of test data: ')
print(X_test[0].shape)
print(y_test[0].shape)

[0, 0, 0, 0, 0, 0, 0, 0, 11, 9, 18, 2, 5, 5]
0
Shape of training data: 
(5958,)
(5958,)
Shape of test data: 
(1986,)
(1986,)


Task 1 : CNN

In [None]:
# Padding the data samples to a maximum review length in words
max_words = 450
import tensorflow as tf
X_train = tf.keras.utils.pad_sequences(X_train, maxlen=max_words)
X_test = tf.keras.utils.pad_sequences(X_test, maxlen=max_words)
# Building the CNN Model
model = Sequential()
model.add(Embedding(len(vocab), 128, input_length=max_words))
model.add(Conv1D(128, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Dropout(0.3))
model.add(Conv1D(128, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Dropout(0.3))
model.add(Flatten())
#model.add(Dense(250, activation='relu'))
model.add(Dense(1))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128, verbose=2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 450, 128)          3456      
                                                                 
 conv1d_4 (Conv1D)           (None, 450, 128)          49280     
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 225, 128)         0         
 1D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 225, 128)          0         
                                                                 
 conv1d_5 (Conv1D)           (None, 225, 128)          49280     
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 112, 128)         0         
 1D)                                                  

<keras.callbacks.History at 0x7f5e1a12b2b0>

In [None]:
Y_test = model.predict(X_test)
target_names = ['Male', 'Female']
print(classification_report(y_test, Y_test.round(), target_names=target_names))

              precision    recall  f1-score   support

        Male       0.00      0.00      0.00      1250
      Female       0.37      1.00      0.54       736

    accuracy                           0.37      1986
   macro avg       0.19      0.50      0.27      1986
weighted avg       0.14      0.37      0.20      1986



Task 2: LSTM

In [None]:
model = Sequential()
model.add(Embedding(len(vocab), 32, input_length=max_words))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=50, batch_size=256,verbose = 1,validation_data=(X_test, y_test))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 450, 32)           864       
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_6 (Dense)             (None, 1)                 33        
                                                                 
Total params: 9,217
Trainable params: 9,217
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27

In [None]:
Y_test = model.predict(X_test)

for i in range(len(Y_test)):
    if Y_test[i][0] > 0.5:
        Y_test[i][0] = 1
    else :
        Y_test[i][0] = 0

target_names = ['Male', 'Female']
print(classification_report(y_test, Y_test, target_names=target_names))

              precision    recall  f1-score   support

        Male       0.72      0.52      0.60       736
      Female       0.76      0.88      0.81      1250

    accuracy                           0.75      1986
   macro avg       0.74      0.70      0.71      1986
weighted avg       0.74      0.75      0.73      1986



Task3 : Part1 - CNN + LSTM  (ReLU Activation)

In [None]:
model = Sequential()
model.add(Embedding(len(vocab), 32, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(LSTM(100, activation='relu'))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=50, batch_size=256,verbose = 1,validation_data=(X_test, y_test))

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 450, 32)           864       
                                                                 
 conv1d_10 (Conv1D)          (None, 450, 32)           3104      
                                                                 
 max_pooling1d_10 (MaxPoolin  (None, 225, 32)          0         
 g1D)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 100)               53200     
                                                                 
 dense_11 (Dense)            (None, 1)                 101       
                                                                 
Total params: 57,269
Trainable params: 57,269
Non-trainable params: 0
_________________________________________________

<keras.callbacks.History at 0x22b868641c0>

In [None]:
Y_test = model.predict(X_test)
target_names = ['Male', 'Female']
print(classification_report(y_test, Y_test, target_names=target_names))

              precision    recall  f1-score   support

        Male       0.00      0.00      0.00       736
      Female       0.63      1.00      0.77      1250

    accuracy                           0.63      1986
   macro avg       0.31      0.50      0.39      1986
weighted avg       0.40      0.63      0.49      1986



Task 3 : Part 3 -CNN + LSTM  (TanH Activation)

In [None]:
model = Sequential()
model.add(Embedding(len(vocab), 32, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(LSTM(100, activation='tanh'))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=50, batch_size=256,verbose = 1,validation_data=(X_test, y_test))

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 450, 32)           864       
                                                                 
 conv1d_11 (Conv1D)          (None, 450, 32)           3104      
                                                                 
 max_pooling1d_11 (MaxPoolin  (None, 225, 32)          0         
 g1D)                                                            
                                                                 
 lstm_6 (LSTM)               (None, 100)               53200     
                                                                 
 dense_12 (Dense)            (None, 1)                 101       
                                                                 
Total params: 57,269
Trainable params: 57,269
Non-trainable params: 0
_________________________________________________

<keras.callbacks.History at 0x22b8daa5640>

In [None]:
Y_test = model.predict(X_test)
target_names = ['Male', 'Female']
print(classification_report(y_test, Y_test, target_names=target_names))

              precision    recall  f1-score   support

        Male       0.00      0.00      0.00       736
      Female       0.63      1.00      0.77      1250

    accuracy                           0.63      1986
   macro avg       0.31      0.50      0.39      1986
weighted avg       0.40      0.63      0.49      1986

