In [None]:
#get and tokenize names
import os
import pandas as pd # Import pandas as it's used later
import numpy as np

def tokenize_name(name, char_to_int, max_len=14):
    """Convert a name into a fixed-length list of integer character ids.

    Args:
        name: The name to encode. Its length is checked before lower-casing.
        char_to_int: Mapping from lower-case character to integer id.
        max_len: Length of the returned list.

    Returns:
        A list of exactly ``max_len`` ints. Characters missing from
        ``char_to_int`` become 0, the same id that is used for padding.

    Raises:
        ValueError: If ``name`` has more than ``max_len`` characters.
    """
    if len(name) > max_len:
        raise ValueError(f"Name '{name}' is too long. Maximum length is {max_len}.")

    ids = [char_to_int.get(ch, 0) for ch in name.lower()]
    # Right-pad with zeros; a non-positive multiplier yields an empty list,
    # so sequences that are already long enough are left untouched here.
    ids.extend([0] * (max_len - len(ids)))
    # The slice is a no-op for padded sequences and trims any that ended up
    # longer than max_len after lower-casing.
    return ids[:max_len]

def one_hot_encode(tokenized_name, vocab_size):
    """One-hot encode a sequence of integer tokens.

    Args:
        tokenized_name: Sequence of integer token ids.
        vocab_size: Width of each one-hot row.

    Returns:
        A nested list with one row per token and ``vocab_size`` float entries
        per row. Rows for token 0 (padding) stay all zeros; every other row
        has a single 1.0 at the token's index.
    """
    matrix = np.zeros((len(tokenized_name), vocab_size))
    # Iterating a 2-D array yields row views, so writes land in `matrix`.
    for row, token in zip(matrix, tokenized_name):
        if token == 0:
            continue  # padding is represented by an all-zero row
        row[token] = 1
    return matrix.tolist()


# --- Load the baby-name CSV and build the one-hot encoded data frame ---
# pd.read_csv opens and closes the file itself, so no handle is kept open
# while the rows are being encoded.
# The name column is assumed to be "Child's First Name" and the gender column
# "Gender"; adjust the column names if your CSV differs.
reader = pd.read_csv("/content/Popular_Baby_Names.csv")

# A missing name cannot be tokenized (len() of NaN raises TypeError), so rows
# without a name are dropped before anything else is derived from them.
name_rows = reader.dropna(subset=["Child's First Name"])

# Vocabulary: every distinct lower-cased character that occurs in any name.
# Id 0 is reserved for padding/unknown characters, so real ids start at 1.
all_names = name_rows["Child's First Name"].str.lower().str.cat()
chars = sorted(set(all_names))
char_to_int = {char: i + 1 for i, char in enumerate(chars)}
vocab_size = len(chars) + 1  # +1 for the padding id

# Encode each row as [is_boy, one-hot matrix]. Zipping the two columns avoids
# the per-row Series that iterrows() constructs.
# Note: tokenize_name raises ValueError for a name longer than its max_len,
# which aborts the whole cell.
names_list = []
for name, gender in zip(name_rows["Child's First Name"], name_rows["Gender"]):
    is_boy = 1 if gender == 'MALE' else 0
    tokenized = tokenize_name(name, char_to_int)
    one_hot_encoded = one_hot_encode(tokenized, vocab_size)
    names_list.append([is_boy, one_hot_encoded])


df = pd.DataFrame(names_list, columns=["is_boy","tokenized_name"])

print(f"Vocabulary size: {vocab_size}")
print(df.shape)
display(df.head())

# Hold out 20% of the rows as a test set (fixed seed for reproducibility);
# from here on `df` holds only the remaining 80% used for training.
# NOTE(review): if the CSV lists the same name on several rows, copies of it
# can land in both splits — confirm whether de-duplication is wanted.
test_df=df.sample(frac=0.2, random_state=42)
df=df.drop(test_df.index)

In [None]:
import torch
from torch import nn

# Seed torch so weight initialisation (and therefore the loss curve) is
# reproducible across Restart & Run All.
torch.manual_seed(42)

# Build the training tensors once. The data does not change between epochs,
# so converting the DataFrame inside the loop only repeated identical work.
# inputs:  (num_names, max_len, vocab_size) one-hot matrices
# targets: (num_names, 1) with 1.0 for boys and 0.0 otherwise
inputs = torch.tensor(df['tokenized_name'].tolist(), dtype=torch.float32)
targets = torch.tensor(df['is_boy'].values, dtype=torch.float32).unsqueeze(1)

# Width of one flattened example (max_len * vocab_size), taken from the data
# itself so it cannot drift from the encoding used to build `df`.
n_features = inputs.shape[1] * inputs.shape[2]

model=nn.Sequential(
    nn.Flatten(),                 # (N, max_len, vocab_size) -> (N, n_features)
    nn.Linear(n_features, 512),   # hidden layer (was originally 256 units)
    nn.ReLU(),
    nn.Linear(512,1),
    nn.Sigmoid()                  # BCELoss expects probabilities in [0, 1]
)
loss_fn=nn.BCELoss()
optimizer=torch.optim.Adam(model.parameters(), lr=0.001)
epochs=250

model.train()  # set training mode once; nothing switches it back inside the loop
for i in range(epochs):
    # Full-batch gradient step: every epoch sees the whole training set.
    y_pred=model(inputs)
    loss=loss_fn(y_pred,targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()  # clear gradients so the next backward() starts fresh

    if (i+1) % 10 == 0:  # report the loss every 10th epoch
        print(f"Epoch [{i+1}/{epochs}], Loss: {loss.item():.4f}")

print("Training finished.")

In [None]:
import torch

def predict_gender(name, model, char_to_int, vocab_size, max_len=14):
    """Run the model on a single name and return its predicted label.

    Args:
        name: Name to classify.
        model: Callable torch module that maps a (1, max_len, vocab_size)
            float tensor to a single scalar.
        char_to_int: Character-to-id mapping passed on to ``tokenize_name``.
        vocab_size: One-hot width passed on to ``one_hot_encode``.
        max_len: Token sequence length passed on to ``tokenize_name``.

    Returns:
        A ``(label, probability)`` tuple. ``probability`` is the model's
        scalar output; ``label`` is "Boy" when it is above 0.5, else "Girl".

    Raises:
        ValueError: Propagated from ``tokenize_name`` when the name is too long.

    Side effects:
        Switches ``model`` to evaluation mode.
    """
    model.eval()
    with torch.no_grad():  # inference only, no gradients needed
        token_ids = tokenize_name(name, char_to_int, max_len)
        one_hot_rows = one_hot_encode(token_ids, vocab_size)
        # Wrapping in a list adds the batch dimension: (1, max_len, vocab_size).
        batch = torch.tensor([one_hot_rows], dtype=torch.float32)
        probability = model(batch).item()

        if probability > 0.5:
            predicted_class = "Boy"
        else:
            predicted_class = "Girl"

        return predicted_class, probability

# Interactive test
# Interactive test: keep prompting for names until the user types 'quit'.
while True:
    try:
        name_input = input("Enter a name to predict gender (or 'quit' to exit): ")
        if name_input.lower() == 'quit':
            break
        if not name_input:
            # Empty line: ask again instead of running the model.
            print("Please enter a name.")
            continue
        predicted_gender, probability = predict_gender(name_input, model, char_to_int, vocab_size)
        print(f"Name: {name_input}, Predicted Gender: {predicted_gender}, Probability (Boy): {probability:.4f}")
    except ValueError as err:
        # Report the ValueError (e.g. a name that is too long) and keep prompting.
        print(f"Error: {err}")