In [1]:
import csv
import datasets
import pandas as pd
from typing import Iterable
from gendermap import GenderMap

In [2]:
# Load every split of the WikiText-103 (v1) configuration; the result is indexed by split name below.
dataset = datasets.load_dataset(path="wikitext", name="wikitext-103-v1")

In [3]:
# Pull the three splits out of the loaded dataset by name.
train, validation, test = (dataset[name] for name in ("train", "validation", "test"))

# Total number of rows across all splits (shown as the cell output).
sum(len(part) for part in (train, validation, test))

1809468

In [4]:
def process_split(split: datasets.arrow_dataset.Dataset, gender_map: GenderMap) -> tuple[Iterable, Iterable]:
    """Pair each gendered line of a split with its gender-flipped counterpart.

    Every entry of ``split["text"]`` is normalized (newlines replaced by
    spaces, lower-cased, surrounding whitespace stripped), kept only if
    ``gender_map.is_gendered`` accepts it, and then passed to
    ``gender_map.flip`` to produce the flipped version.

    Args:
        split: Dataset split exposing a ``"text"`` column.
        gender_map: Provides ``is_gendered`` (used as the filter predicate)
            and ``flip`` (applied to each kept line).

    Returns:
        A ``(filtered, flipped)`` pair of lazy iterators that are aligned
        item by item. Both are single-use: once consumed they are empty, so
        call this function again if the data is needed a second time.
    """
    from itertools import tee

    def normalize(string):
        return string.replace("\n", " ").lower().strip()

    # Read the column, normalize and filter exactly once; the previous version
    # built this pipeline twice and therefore did all the work twice.
    gendered = filter(gender_map.is_gendered, map(normalize, split["text"]))

    # tee() fans the single pipeline out into two independent iterators so the
    # originals and the input to flip() come from the same pass.
    filtered, to_flip = tee(gendered)
    flipped = map(gender_map.flip, to_flip)
    return filtered, flipped

In [5]:
# Build the gender map from the word-list JSON.
gender_map = GenderMap(r"gendered_words/gendered_words.json")

# Run every split through process_split; each call yields an
# (original, flipped) pair, unpacked here into per-split names.
(
    (train_filtered, train_flipped),
    (val_filtered, val_flipped),
    (test_filtered, test_flipped),
) = (process_split(part, gender_map) for part in (train, validation, test))

In [6]:
# Materialize each split's (original, flipped) pair into a two-column frame.
# The inputs come from process_split as single-use iterators, so they are
# consumed here; re-run the previous cell before re-running this one.
df_train, df_val, df_test = (
    pd.DataFrame({"original": original, "flipped": flipped})
    for original, flipped in (
        (train_filtered, train_flipped),
        (val_filtered, val_flipped),
        (test_filtered, test_flipped),
    )
)

# Stack the three splits into one frame with a fresh 0..n-1 index.
dataframe = pd.concat([df_train, df_val, df_test], ignore_index=True)
dataframe.tail()

Unnamed: 0,original,flipped
434354,newman was nominated for a golden globe award ...,newman was nominated for a golden globe award ...
434355,bert gordon - nominated villain,bert gordon - nominated villainess
434356,"in the decades since its release , the hustler...","in the decades since its release , the hustler..."
434357,paul newman reprised his role as fast eddie fe...,paul newman reprised her role as fast eddie fe...
434358,the hustler is credited with sparking a resurg...,the hustler is credited with sparking a resurg...


In [7]:
# One-time setup: uncomment and run this cell once to unpack the BUG dataset into ./data.
# import tarfile
# # download data.tar.gz from https://github.com/SLAB-NLP/BUG/blob/main/data.tar.gz
# with tarfile.open("data.tar.gz", 'r:gz') as tar:
#     tar.extractall(path="data")

In [8]:
# Only the sentence text column of the BUG corpus is used.
full_BUG = pd.read_csv("data/data/full_BUG.csv")["sentence_text"]

# Lower-case and strip the sentences, then add the flipped column by applying
# gender_map.flip_series to each row (axis=1) of the single-column frame.
new_df = pd.DataFrame({"original": full_BUG.str.lower().str.strip()})
new_df = new_df.assign(flipped=new_df.apply(gender_map.flip_series, axis=1))
new_df.tail()

Unnamed: 0,original,flipped
105682,but because no one had distinctly specified ex...,but because no one had distinctly specified ex...
105683,they point out that analysing yourself does n'...,they point out that analysing yourself does n'...
105684,give it to any person with - out calling it by...,give it to any person with - out calling it by...
105685,"from this verse we can understand that , first...","from this verse we can understand that , first..."
105686,""" artist sets fire to himself",""" artist sets fire to herself"


In [9]:
# Combine the WikiText splits with the BUG sentences into the final corpus.
# The frame is rebuilt from the per-source frames rather than appended to
# `dataframe` itself, so re-running this cell does not add the BUG rows twice.
dataframe = pd.concat([df_train, df_val, df_test, new_df], ignore_index=True)
dataframe.tail()

Unnamed: 0,original,flipped
540041,but because no one had distinctly specified ex...,but because no one had distinctly specified ex...
540042,they point out that analysing yourself does n'...,they point out that analysing yourself does n'...
540043,give it to any person with - out calling it by...,give it to any person with - out calling it by...
540044,"from this verse we can understand that , first...","from this verse we can understand that , first..."
540045,""" artist sets fire to himself",""" artist sets fire to herself"


In [10]:
# Write the combined corpus to disk; the row index is kept as the first CSV column.
dataframe.to_csv(path_or_buf="corpus.csv", index=True)