In [1]:
import pandas as pd
from pathlib import Path

## Different subsets - minimal size

In [2]:
# Per-language files are read from `base_dir`; the files without a language
# prefix are read from its parent folder.
base_dir = Path("../data/jigsaw/multilingual/")
files = list(base_dir.glob("*.json"))

# File-name templates: `{lang}` is filled with one of the codes in `langs`.
toxic_pattern = "{lang}_toxicity_gte0.5_clean.json"
nontoxic_pattern = "{lang}_toxicity_eq0_half_clean.json"
langs = ["en", "arb_Arab", "hin_Deva", "kor_Hang", "por_Latn", "rus_Cyrl"]

# Number of comments to draw for each language.
num_toxic = 3000
num_nontoxic = 10000

# Formatting a template with an empty language leaves a leading "_"; stripping
# it yields the un-prefixed file name that sits next to `base_dir`.
base_toxic = pd.read_json(
    base_dir.parent / toxic_pattern.format(lang="").strip("_")
)
base_nontoxic = pd.read_json(
    base_dir.parent / nontoxic_pattern.format(lang="").strip("_")
)


def get_sampled_indices(base_df, num_samples, num_sets, random_state):
    """Draw index labels for several non-overlapping samples of `base_df`.

    A single sample of ``num_samples * num_sets`` rows is drawn without
    replacement and its index labels are split into ``num_sets`` rows.

    Args:
        base_df: DataFrame whose index labels are sampled.
        num_samples: Number of labels in each set.
        num_sets: Number of sets to produce.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        Array of shape ``(num_sets, num_samples)`` holding index labels.
    """
    total_rows = num_samples * num_sets
    drawn = base_df.sample(n=total_rows, random_state=random_state)
    labels = drawn.index.values
    return labels.reshape((num_sets, num_samples))


# One row of index labels per entry in `langs`; rows do not share labels
# because they are cut from a single draw without replacement.
toxic_indices = get_sampled_indices(
    base_toxic, num_samples=num_toxic, num_sets=len(langs), random_state=42
)
nontoxic_indices = get_sampled_indices(
    base_nontoxic, num_samples=num_nontoxic, num_sets=len(langs), random_state=42
)

In [3]:
# Sanity check: expected shape is (len(langs), num_toxic), one row per language.
toxic_indices.shape

(6, 3000)

In [5]:
out_folder = base_dir / "different_subsets"
out_folder.mkdir(exist_ok=True, parents=True)

# Every language gets its own row of sampled index labels (row order follows
# `langs`). English rows are taken from the base frames that are already in
# memory; every other language is read from its own file in `base_dir`.
for lang, toxic_rows, nontoxic_rows in zip(langs, toxic_indices, nontoxic_indices):
    if lang == "en":
        toxic_source, nontoxic_source = base_toxic, base_nontoxic
    else:
        toxic_source = pd.read_json(base_dir / toxic_pattern.format(lang=lang))
        nontoxic_source = pd.read_json(base_dir / nontoxic_pattern.format(lang=lang))

    # Selection is by index label.
    # NOTE(review): assumes each language file has the same index as the base
    # frames the labels were sampled from -- confirm.
    toxic_df = toxic_source.loc[toxic_rows]
    nontoxic_df = nontoxic_source.loc[nontoxic_rows]

    # The count in the file name comes from the rows actually selected rather
    # than from the `num_toxic` / `num_nontoxic` globals, which a later section
    # of this notebook rebinds to lists.
    toxic_out_file = (
        out_folder / f"{lang}_toxicity_gte0.5_clean_{len(toxic_rows)}_sampled.json"
    )
    nontoxic_out_file = (
        out_folder
        / f"{lang}_toxicity_eq0_half_clean_{len(nontoxic_rows)}_sampled.json"
    )

    toxic_df.to_json(toxic_out_file, orient="records")
    nontoxic_df.to_json(nontoxic_out_file, orient="records")

## Multilingual datastore size - parallel data across languages

In [4]:
# Same paths, language codes and file-name templates as in the first section,
# set again here before the base frames are reloaded.
base_dir = Path("../data/jigsaw/multilingual/")
files = list(base_dir.glob("*.json"))

toxic_pattern = "{lang}_toxicity_gte0.5_clean.json"
nontoxic_pattern = "{lang}_toxicity_eq0_half_clean.json"
langs = ["en", "arb_Arab", "hin_Deva", "kor_Hang", "por_Latn", "rus_Cyrl"]

# Un-prefixed base files live in the parent folder: formatting with an empty
# language leaves a leading "_", which is stripped off.
# Toxic base set: 264K comments.
base_toxic = pd.read_json(
    base_dir.parent / toxic_pattern.format(lang="").strip("_")
)
# Non-toxic base set: 1.1M comments.
base_nontoxic = pd.read_json(
    base_dir.parent / nontoxic_pattern.format(lang="").strip("_")
)


# NOTE: this repeats (and shadows) the definition from the first section.
def get_sampled_indices(base_df, num_samples, num_sets, random_state):
    """Return index labels of `base_df` split into `num_sets` disjoint samples.

    One sample of ``num_samples * num_sets`` rows is drawn without replacement
    (seeded by ``random_state``) and its index labels are reshaped to
    ``(num_sets, num_samples)``.
    """
    sample_size = num_samples * num_sets
    sampled_frame = base_df.sample(n=sample_size, random_state=random_state)
    index_labels = sampled_frame.index.values
    return index_labels.reshape((num_sets, num_samples))


# Sample sizes per language. Entries are paired by position: the first toxic
# size goes with the first non-toxic size, and so on.
num_toxic = [1000, 3000]  # alternative: [5000, 10000, 25000]
num_nontoxic = [10000, 20000]  # alternative: [10000, 20000, 50000]

# For every size, draw a single set of index labels and flatten the resulting
# (1, size) array to 1-D. The same seed is used for every draw.
toxic_indices = [
    get_sampled_indices(base_toxic, size, num_sets=1, random_state=42).ravel()
    for size in num_toxic
]
nontoxic_indices = [
    get_sampled_indices(base_nontoxic, size, num_sets=1, random_state=42).ravel()
    for size in num_nontoxic
]

In [5]:
out_folder = base_dir / "multilingual_size"
out_folder.mkdir(exist_ok=True, parents=True)

# zip() stops at the shorter list, so unpaired sizes would be dropped silently.
assert len(toxic_indices) == len(nontoxic_indices), (
    "toxic and non-toxic sample sizes must be paired one-to-one"
)

for toxic_samples, nontoxic_samples in zip(toxic_indices, nontoxic_indices):
    num_t = len(toxic_samples)
    num_nt = len(nontoxic_samples)

    # English comes first and is taken from the base frames already in memory.
    toxic_parts = [base_toxic.loc[toxic_samples]]
    nontoxic_parts = [base_nontoxic.loc[nontoxic_samples]]

    # Other languages: the same index labels are selected from every file.
    # NOTE(review): assumes each language file has the same index as the base
    # frames -- confirm. Each file is re-read for every size pair.
    for lang in langs[1:]:
        print(f"Language: {lang} -- {num_t} / {num_nt}")
        toxic_file = base_dir / toxic_pattern.format(lang=lang)
        nontoxic_file = base_dir / nontoxic_pattern.format(lang=lang)

        temp_toxic_df = pd.read_json(toxic_file).loc[toxic_samples]
        temp_nontoxic_df = pd.read_json(nontoxic_file).loc[nontoxic_samples]

        toxic_parts.append(temp_toxic_df)
        nontoxic_parts.append(temp_nontoxic_df)

    # Concatenate once per size instead of growing a frame inside the loop
    # (which re-copies all rows each time and starts from an empty DataFrame).
    toxic_df = pd.concat(toxic_parts)
    nontoxic_df = pd.concat(nontoxic_parts)

    toxic_out_file = out_folder / f"multi_mid_toxicity_gte0.5_clean_{num_t}_each.json"
    nontoxic_out_file = (
        out_folder / f"multi_mid_toxicity_eq0_half_clean_{num_nt}_each.json"
    )

    toxic_df.to_json(toxic_out_file, orient="records")
    nontoxic_df.to_json(nontoxic_out_file, orient="records")

Language: arb_Arab -- 1000 / 10000
Language: hin_Deva -- 1000 / 10000
Language: kor_Hang -- 1000 / 10000
Language: por_Latn -- 1000 / 10000
Language: rus_Cyrl -- 1000 / 10000
Language: arb_Arab -- 3000 / 20000
Language: hin_Deva -- 3000 / 20000
Language: kor_Hang -- 3000 / 20000
Language: por_Latn -- 3000 / 20000
Language: rus_Cyrl -- 3000 / 20000


In [24]:
# Shape of whichever `nontoxic_df` was assigned last in the kernel.
# NOTE(review): with 6 languages and a largest non-toxic size of 20000 the
# multilingual frame would have 120000 rows; confirm that the saved output of
# this cell comes from the current settings.
nontoxic_df.shape

(1200000, 1)