# Gender polarity analysis of a dataset

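The method is based on the gender polarity analysis of Dhamala et al. (2021): occurrences of male and female gendered tokens are counted across the dataset and compared as a ratio.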

In [None]:
import os
import re
from tqdm.notebook import tqdm_notebook

# Path of the dataset to analyse
dataset_pathname = "../../data/corpora-en-es/Europarl-v7.en-es.sample.01.en.txt"
# Results file: "_results" is inserted just before the final extension.
# splitext only touches the last extension, so a ".txt" occurring elsewhere
# in the path is left alone, and a dataset with a different (or no)
# extension still gets a results path distinct from the dataset itself.
results_pathname = "{}_results{}".format(*os.path.splitext(dataset_pathname))

In [None]:
# Gendered unigrams used for matching (token lists from Dhamala et al., 2021)
male_tokens = {
    # pronouns
    "he", "him", "his", "himself", "he's",
    # nouns
    "man", "men", "boy", "boys",
}
female_tokens = {
    # pronouns
    "she", "her", "hers", "herself", "she's",
    # nouns
    "woman", "women", "girl", "girls",
}

In [None]:
def count_gender_tokens(file_path, male_tok, female_tok):
    """Count male and female tokens in a UTF-8 text file.

    Every line is lower-cased, typographic apostrophe variants are replaced
    by a plain "'", and the line is split on whitespace. Punctuation attached
    to either end of a word is stripped before matching, so "he," or "(she)"
    count as "he" and "she"; apostrophes inside a word ("he's") are kept.

    Args:
        file_path: Path of the UTF-8 encoded text file to scan.
        male_tok: Container of lower-case male tokens (membership-tested).
        female_tok: Container of lower-case female tokens (membership-tested).

    Returns:
        A ``(male_count, female_count)`` tuple of floats. Both counters start
        at 1e-100 instead of 0 so that callers can divide by them; a category
        without any match is therefore returned as 1e-100.
    """
    # Start marginally above zero so callers can divide by either count
    male_count, female_count = 1e-100, 1e-100

    # Apostrophe variants that should match the plain "'" used in the tokens
    apostrophes = re.compile(r"[‘’´`]")
    # Non-word characters glued to the start or end of a word
    edge_punctuation = re.compile(r"^\W+|\W+$")

    # Count the lines first so the progress bar can show a total; the
    # context manager makes sure this extra handle is closed again
    with open(file_path, 'r', encoding='utf-8') as file:
        num_lines = sum(1 for _ in file)

    # Open the file and process line by line
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm_notebook(file, total=num_lines, desc="Processing"):
            # Standardize apostrophes, split into words, trim punctuation
            normalized = apostrophes.sub("'", line.lower())
            words = [edge_punctuation.sub("", word) for word in normalized.split()]

            # Count the words (the two categories are tallied independently)
            male_count += sum(word in male_tok for word in words)
            female_count += sum(word in female_tok for word in words)

    return male_count, female_count

In [None]:
def print_results(male_count, female_count, pathname=None):
    """Print the gender-token statistics and optionally save them to a file.

    Args:
        male_count: Number of male tokens (int or float).
        female_count: Number of female tokens (int or float).
        pathname: If not None, the summary is also written to this path as
            UTF-8 text, replacing any existing file.

    The individual counts and their total are rounded for display; the ratio
    is computed from the unrounded values. An exact zero ``female_count`` is
    reported as a ratio of ``inf`` (or ``nan`` when ``male_count`` is zero as
    well) instead of raising ZeroDivisionError.
    """
    # Guard the division so a true zero does not abort the report
    if female_count:
        ratio = male_count / female_count
    elif male_count:
        ratio = float("inf")
    else:
        ratio = float("nan")

    # Prepare statistics output
    summary = (f"Total number of identified words: {round(male_count + female_count)}\n"
               f"Number of words that are M: {round(male_count)}\n"
               f"Number of words that are F: {round(female_count)}\n"
               f"Ratio M : F: {ratio:.2f} : 1")

    print(summary)  # Print to terminal
    if pathname is not None:
        # Explicit encoding keeps the output independent of the platform default
        with open(pathname, 'w', encoding='utf-8') as f:
            print(summary, file=f)  # Print to file

In [None]:
# Read the whole dataset into memory, one list entry per line
# (each entry keeps its trailing newline)
with open(dataset_pathname, mode="r", encoding="utf-8") as dataset_file:
    sentences = list(dataset_file)


In [None]:
# Run the token count over the dataset, then report the statistics both on
# screen and in the results file
male_count, female_count = count_gender_tokens(
    file_path=dataset_pathname,
    male_tok=male_tokens,
    female_tok=female_tokens,
)
print_results(male_count, female_count, pathname=results_pathname)
# Last expression of the cell: shows the name of the results file
os.path.basename(results_pathname)