In [None]:
import pandas as pd
import gender_guesser.detector as gd
from gender_detector.gender_detector import GenderDetector

In [None]:
# Load the dataset and upper-case every column label so that later cells can
# refer to columns by one canonical spelling.
df = pd.read_csv("data/ijhs_202324.csv")
df.columns = df.columns.str.upper()

In [None]:
# Give the student-name column the shorter label NAME used by later cells.
df = df.rename(columns={"STUDENT NAME": "NAME"})

In [None]:
# Cardinality summary of the COUNTRY and SCHOOL columns.
country_col = df['COUNTRY']

unique_countries = country_col.nunique()
print(f"Unique countries: {unique_countries}")

countries_list = country_col.unique()
print("Countries list:", countries_list)

unique_schools = df['SCHOOL'].nunique()
print(f"Unique schools: {unique_schools}")

In [None]:
# Countries whose names are treated as written family-name-first, so the given
# name is taken from the LAST token. Entries are upper-case with single spaces;
# get_given_name normalises the country value to that form before the lookup.
# NOTE(review): Thai names are commonly written given-name-first and Burmese
# names usually have no family name -- confirm THAILAND / MYANMAR against how
# this dataset actually formats names.
ASIA_FAMILY_FIRST = {
    "CHINA",
    "CHINA HONG KONG",
    "CHINA MACAU",
    "CAMBODIA",
    "JAPAN",
    "SOUTH KOREA",
    "VIETNAM",
    "TAIWAN",
    "HONG KONG",
    "MYANMAR",
    "THAILAND",
}


def get_given_name(full_name, country):
    """Return the whitespace-separated token of ``full_name`` used as the given name.

    Parameters
    ----------
    full_name : object
        The full name; coerced with ``str()``. Missing values (``None``/NaN)
        yield ``None``.
    country : object
        The country label; coerced with ``str()``, upper-cased and
        whitespace-normalised before being looked up in ``ASIA_FAMILY_FIRST``.
        Missing values (``None``/NaN) yield ``None``.

    Returns
    -------
    str or None
        The last token when the country is in ``ASIA_FAMILY_FIRST`` and the
        name has more than one token; otherwise the first token. ``None`` when
        either input is missing or the name contains no tokens.
    """
    if pd.isna(full_name) or pd.isna(country):
        return None

    # str.split() with no arguments also discards leading/trailing whitespace,
    # so an empty or blank name produces an empty token list.
    tokens = str(full_name).split()
    if not tokens:
        return None

    # Coerce to str so a non-string country (e.g. a numeric code) cannot raise
    # AttributeError, and collapse whitespace so values such as " China " or
    # "China  Hong Kong" still match the set entries.
    country_key = " ".join(str(country).upper().split())

    if country_key in ASIA_FAMILY_FIRST and len(tokens) > 1:
        return tokens[-1]
    return tokens[0]

In [None]:
# Shared gender_guesser detector, constructed with case_sensitive=False.
det = gd.Detector(case_sensitive=False)


def round_1(name):
    """First pass: label ``name`` using the gender_guesser detector.

    Returns "female" when the detector answers "female" or "mostly_female",
    "male" when it answers "male" or "mostly_male", and "unknown" for any
    other answer or when ``name`` is missing (``None``/NaN).
    """
    if name is None or pd.isna(name):
        return "unknown"

    # Collapse the detector's confident and "mostly" answers onto two labels.
    collapsed = {
        "female": "female",
        "mostly_female": "female",
        "male": "male",
        "mostly_male": "male",
    }
    return collapsed.get(det.get_gender(name), "unknown")

In [None]:
# gender_detector instance built with the 'us' argument
# (presumably selects the US name data -- confirm against the package docs).
detector = GenderDetector('us')

def round_2(name: str) -> str:
    """Second pass: label ``name`` using the gender_detector instance.

    The name is coerced to ``str``, stripped of every character that is
    neither a letter nor whitespace, and only its first word is passed to
    ``detector.guess``. Returns 'male', 'female' or 'unknown'; missing names,
    names with no letters, and any exception raised during the lookup all
    yield 'unknown'.
    """
    if name is None or pd.isna(name):
        return 'unknown'

    # Keep letters and whitespace only, then trim the ends.
    kept_chars = [ch for ch in str(name) if ch.isalpha() or ch.isspace()]
    cleaned_name = ''.join(kept_chars).strip()
    if not cleaned_name:
        return 'unknown'

    first_word = cleaned_name.split()[0]

    try:
        verdict = detector.guess(first_word)
        if verdict in ('male', 'mostly_male'):
            label = 'male'
        elif verdict in ('female', 'mostly_female'):
            label = 'female'
        else:
            label = 'unknown'
    except Exception:
        # Deliberate best-effort: any lookup failure is reported as 'unknown'.
        label = 'unknown'
    return label

In [None]:
# Extract the given-name token row by row, then run both detectors on it.
df["given"] = df.apply(
    lambda row: get_given_name(row["NAME"], row["COUNTRY"]),
    axis=1,
)
df["g1"] = df["given"].map(round_1)
df["g2"] = df["given"].map(round_2)

In [None]:
def ensemble(g1, g2):
    """Combine two gender labels, preferring ``g1`` over ``g2``.

    Returns the first of the two arguments that is exactly "male" or
    "female"; returns "unknown" when neither is.
    """
    decided = (label for label in (g1, g2) if label in ("male", "female"))
    return next(decided, "unknown")


# Final label per row: first decisive answer among g1, then g2.
df["gender"] = df.apply(
    lambda row: ensemble(row["g1"], row["g2"]),
    axis=1,
)

In [None]:
# Distribution of the final labels (dropna=False keeps any missing values).
gender_counts = df["gender"].value_counts(dropna=False)
print(gender_counts)