# Generate name-compiled-csv

In [67]:
import os
import json
import pandas as pd

In [68]:
# Directory holding the input datasets and receiving the compiled CSV.
# Set the CULTURAL_FAIRNESS_DATA_DIR environment variable to run this notebook
# on another machine; the fallback below is the original local checkout path.
data_path = os.environ.get(
    "CULTURAL_FAIRNESS_DATA_DIR",
    "/Users/chiphan/Documents/Chi/1-Learning/2024-Fall/COMP4040-DataMining/Assignment/Project/code/Cultural-Fairness-LLM/dataset",
)
western_name_path = os.path.join(data_path, "GenderByName.csv")
vnmese_name_path = os.path.join(data_path, "UIT-ViNames.json")

In [69]:
# Load both raw name tables: the Western names come from a CSV file,
# the Vietnamese names from a JSON file.
western_name_df = pd.read_csv(filepath_or_buffer=western_name_path)
vnmese_name_df = pd.read_json(path_or_buf=vnmese_name_path)


In [70]:
# Preview the Western name table loaded from GenderByName.csv.
western_name_df

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,1.451679e-02
1,John,M,5260831,1.439753e-02
2,Robert,M,4970386,1.360266e-02
3,Michael,M,4579950,1.253414e-02
4,William,M,4226608,1.156713e-02
...,...,...,...,...
147264,Zylenn,M,1,2.736740e-09
147265,Zymeon,M,1,2.736740e-09
147266,Zyndel,M,1,2.736740e-09
147267,Zyshan,M,1,2.736740e-09


In [71]:
# Count how many rows (names) fall under each value of the Gender column.

western_name_df['Gender'].value_counts()

Gender
F    89749
M    57520
Name: count, dtype: int64

In [72]:
# Preview the Vietnamese name table loaded from UIT-ViNames.json.
vnmese_name_df

Unnamed: 0,full_name,gender
0,Ngô Xuân Tùng,1
1,Bùi Dương Thảo Vy,0
2,Lưu Thế Huy,1
3,Nguyễn Thị Vân,0
4,Dương Minh Long,1
...,...,...
26846,Đoàn Thị Bảo Thu,0
26847,Ôn Hoa Thu,1
26848,Nguyễn Thị Huỳnh Như,0
26849,Nguyễn Thị Hồng Phúc,0


In [73]:
# Count how many Vietnamese names carry each gender code
# (merge_names maps 1 to "Male" and 0 to "Female").
vnmese_name_df['gender'].value_counts()

gender
1    15494
0    11357
Name: count, dtype: int64

In [74]:
import random

def merge_names(western_name_df, vnmese_name_df, output_path, n_per_group=100, random_state=42):
    """Build a gender-balanced sample of Western and Vietnamese names and save it as CSV.

    The input frames are never modified: all renaming, gender mapping and
    culture tagging happens on copies, so the cell calling this function can
    be re-run safely with the same raw frames.

    Parameters
    ----------
    western_name_df : pandas.DataFrame
        Western names with a ``Name`` column and a ``Gender`` column coded
        as ``"M"`` / ``"F"``.
    vnmese_name_df : pandas.DataFrame
        Vietnamese names with a ``full_name`` column and a ``gender`` column
        coded as ``1`` (male) / ``0`` (female).
    output_path : str or path-like
        Destination of the compiled CSV (written without the index).
    n_per_group : int, default 100
        Number of names drawn for each (culture, gender) combination.
    random_state : int, default 42
        Seed passed to every ``DataFrame.sample`` call for reproducibility.

    Returns
    -------
    pandas.DataFrame
        ``4 * n_per_group`` rows with columns ``Name``, ``Gender``,
        ``Culture``, ordered Western male, Western female, Vietnamese male,
        Vietnamese female.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a group holds fewer than
        ``n_per_group`` rows.
    """
    output_columns = ["Name", "Gender", "Culture"]

    # `assign` and `rename` return new frames, leaving the caller's data untouched.
    western = western_name_df.assign(
        Gender=western_name_df["Gender"].map({"M": "Male", "F": "Female"}),
        Culture="Western",
    )
    vnmese = vnmese_name_df.rename(columns={"full_name": "Name", "gender": "Gender"})
    vnmese = vnmese.assign(
        Gender=vnmese["Gender"].map({1: "Male", 0: "Female"}),
        Culture="Vietnamese",
    )

    # One sample per (culture, gender) pair; the loop order fixes the row order
    # of the output: Western male/female first, then Vietnamese male/female.
    samples = [
        frame[frame["Gender"] == gender].sample(n_per_group, random_state=random_state)
        for frame in (western, vnmese)
        for gender in ("Male", "Female")
    ]

    # ignore_index=True already yields a fresh 0..N-1 index.
    compiled_data = pd.concat(samples, ignore_index=True)[output_columns]

    compiled_data.to_csv(output_path, index=False)

    return compiled_data

In [75]:
# Write the compiled sample into the dataset directory as compiled_name.csv,
# then display the resulting frame.
compiled_name_path = os.path.join(data_path, "compiled_name.csv")
compiled_data = merge_names(western_name_df, vnmese_name_df, compiled_name_path)
compiled_data

Unnamed: 0,Name,Gender,Culture
0,Shep,Male,Western
1,Haskle,Male,Western
2,Dewuan,Male,Western
3,Peyden,Male,Western
4,Cobra,Male,Western
...,...,...,...
395,Nguyễn Châu Thanh Thảo,Female,Vietnamese
396,Nguyễn Thị Thanh Trúc,Female,Vietnamese
397,Lê Thị Mộng Thùy Dương,Female,Vietnamese
398,Trần Thị Thùy Duyên,Female,Vietnamese
