## Setting Up

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

In [2]:
# Notebook-wide configuration.
# SEED is the random_state used whenever rows are shuffled.
SEED = 12
# DATA_DIR is the folder that holds the input TSVs and receives the output CSVs.
DATA_DIR = '../data/'

## Clean and Prep Wiki Data

In [3]:
import pandas as pd

In [4]:
# Read the two tab-separated inputs from DATA_DIR: the comment table and the
# per-annotation toxicity table.
toxicity_annotated_comments = pd.read_csv(
    os.path.join(DATA_DIR, 'toxicity_annotated_comments.tsv'),
    sep='\t',
)
toxicity_annotations = pd.read_csv(
    os.path.join(DATA_DIR, 'toxicity_annotations.tsv'),
    sep='\t',
)

In [5]:
# Collapse the annotations to one row per rev_id holding the mean toxicity,
# then attach that score to the matching comment row (join on rev_id).
annotations_gped = (
    toxicity_annotations
    .groupby('rev_id', as_index=False)
    .agg({'toxicity': 'mean'})
)
all_data = annotations_gped.merge(toxicity_annotated_comments, on='rev_id')

In [6]:
# Swap the NEWLINE_TOKEN / TAB_TOKEN placeholders for plain spaces
# (newline placeholder first, then tab placeholder).
all_data['comment'] = all_data['comment'].apply(
    lambda text: text.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
)

# TODO(nthain): Consider doing regression instead of classification
# A comment is labelled toxic when its mean toxicity is strictly above 0.5.
all_data['is_toxic'] = all_data['toxicity'].gt(0.5)

In [7]:
# Partition the rows by the value of their 'split' column: train, test, dev.
wiki_splits = {}
for split in ('train', 'test', 'dev'):
    wiki_splits[split] = all_data[all_data['split'] == split]

In [8]:
# Save every split to DATA_DIR as wiki_<split>.csv, without the index column.
for split, split_df in wiki_splits.items():
    out_path = os.path.join(DATA_DIR, 'wiki_%s.csv' % split)
    split_df.to_csv(out_path, index=False)

### Prep debiasing data

In [9]:
def augment_with_data(source_df, target_path, target_name, sep = '\t', write = True,
                      seed = None, out_dir = None):
    """Add extra rows to every split, shuffle each split, and optionally save it.

    Args:
        source_df: dict mapping a split name to the DataFrame for that split
            (despite the name, this is a mapping rather than a single frame).
        target_path: path of the delimited file holding the extra rows. It
            must have a 'split' column; rows are routed to the split whose key
            equals that value.
        target_name: value written to the 'sample' column of the extra rows,
            and also embedded in the output file names.
        sep: field delimiter of the file at target_path (tab by default).
        write: when True, save each augmented split as
            'wiki_<target_name>_<split>.csv' in out_dir.
        seed: random_state for the shuffle. None means use the notebook-level
            SEED.
        out_dir: directory for the saved files. None means use the
            notebook-level DATA_DIR.

    Returns:
        dict mapping each split name to its augmented, shuffled DataFrame.
    """
    # Pass the caller's delimiter through; it used to be hard-coded to a tab
    # here, which silently ignored the sep argument.
    target_df = pd.read_csv(target_path, sep = sep)
    target_df['sample'] = target_name

    # Fall back to the notebook-level settings only when they are needed.
    if seed is None:
        seed = SEED
    if write and out_dir is None:
        out_dir = DATA_DIR

    target_splits = {}
    for split in source_df:
        combined = pd.concat([source_df[split],
                              target_df.query('split == @split')])
        # frac = 1 keeps every row and only randomises their order.
        target_splits[split] = combined.sample(frac = 1, random_state = seed)
        if write:
            out_name = 'wiki_%s_%s.csv' % (target_name, split)
            target_splits[split].to_csv(os.path.join(out_dir, out_name), index=False)
    return target_splits
    

In [10]:
# Augment every wiki split with the debiasing rows. The input path is built
# from DATA_DIR so it follows the same setting as the other file paths in
# this notebook. With the default write=True the augmented splits are also
# saved as wiki_debias_<split>.csv.
debias_splits = augment_with_data(
    wiki_splits,
    os.path.join(DATA_DIR, 'toxicity_debiasing_data.tsv'),
    'debias',
)

In [12]:
# (rows, columns) of the un-augmented wiki training split, as a baseline.
wiki_splits['train'].shape

(95692, 9)

In [11]:
# (rows, columns) of the training split after the 'debias' rows were added.
debias_splits['train'].shape

(99157, 9)

### Prep random data

In [13]:
# Augment every wiki split with the rows from the random debiasing file. The
# input path is built from DATA_DIR so it follows the same setting as the
# other file paths in this notebook. With the default write=True the
# augmented splits are also saved as wiki_debias_random_<split>.csv.
random_splits = augment_with_data(
    wiki_splits,
    os.path.join(DATA_DIR, 'toxicity_debiasing_data_random.tsv'),
    'debias_random',
)

In [15]:
# (rows, columns) of the training split after the 'debias_random' rows were added.
random_splits['train'].shape

(99157, 9)