## Dataset

In [1]:
from tqdm.auto import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer

In [7]:
# Name of the pretrained checkpoint; the tokenizer loaded from it is the one
# called inside tokenize_data further down.
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [13]:
# Passed as `max_length` to the tokenizer in tokenize_data, where truncation
# and padding='max_length' are both enabled.
max_seq_len = 65

In [2]:
# Raw training data. Later cells read its `comment_text` column and treat every
# column from the third onward as a label column.
# NOTE(review): presumably the first two columns are an id and the comment text — confirm against the CSV header.
train_df = pd.read_csv('../data/raw/jigsaw/jigsaw-multiling-1st-subset/train.csv')

In [3]:
# Text-cleaning helpers: each takes a string and returns it with one kind of noise removed
def rm_ip_address(text):
    """Return `text` with every dotted-quad sequence removed.

    A match is four groups of one to three digits separated by dots. The
    groups are not range-checked, so e.g. ``999.1.1.1`` is removed too.
    """
    dotted_quad = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    return dotted_quad.sub('', text)

def rm_link(text):
    """Return `text` with URLs removed.

    Anything starting with ``http://``, ``https://`` or ``www.`` is dropped up
    to the next whitespace character.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

# Compiled once instead of on every call. Every range is restricted to
# symbol/emoji code points. The original single span U+24C2..U+1F251 also
# covered CJK ideographs, kana, Hangul and full-width forms, so ordinary
# non-Latin text was deleted along with the emoji; it is replaced here by the
# narrow symbol ranges that lie inside it.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags (iOS)
    '\U00002702-\U000027B0'  # dingbats
    '\U000024C2'             # circled latin capital letter M
    '\U000025A0-\U000026FF'  # geometric shapes, miscellaneous symbols
    '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
    '\U0001F000-\U0001F251'  # tiles, cards, enclosed alphanumerics/ideographs
    ']+',
    flags=re.UNICODE
)


def rm_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are stripped;
    letters of any script (including CJK, kana and Hangul) are preserved.

    Args:
        text: the string to clean.

    Returns:
        A new string with every run of matching characters deleted.
    """
    return _EMOJI_PATTERN.sub('', text)

def rm_middle_dot(text):
    """Return `text` with dots that sit directly between two word characters removed.

    ``f.o.o`` becomes ``foo``; a dot followed by whitespace or the end of the
    string is kept. Digits count as word characters, so ``3.14`` becomes ``314``.
    """
    inner_dot = re.compile(r'(?<=\w)\.(?=\w+)')
    return inner_dot.sub('', text)

def rm_middle_spaces(text):
    """Return `text` with each single whitespace character between two word characters removed.

    Note that this joins adjacent words (``'a b'`` becomes ``'ab'``); runs of two
    or more whitespace characters are left alone. clean_pipeline does not
    call this helper.
    """
    inner_space = re.compile(r'(?<=\w)\s(?=\w+)')
    return inner_space.sub('', text)

def clean_pipeline(text):
    """Run the cleaning helpers over `text` and return the cleaned string.

    The steps are applied in a fixed order: IP-like sequences, links, emoji,
    then dots between word characters.
    """
    cleaned = text
    for step in (rm_ip_address, rm_link, rm_emoji, rm_middle_dot):
        cleaned = step(cleaned)
    return cleaned

In [4]:
def is_toxic(row):
    """Collapse one row of label flags into a single label.

    Args:
        row: an object with a ``.sum()`` method (here a pandas Series holding
            the label columns of one comment).

    Returns:
        1 if the flags sum to a positive value (at least one flag set),
        -1 if they sum to a negative value, and 0 otherwise.

    The positive branch previously tested ``> 1``, which labelled rows with
    exactly one flag set as 0.
    """
    total = row.sum()  # computed once instead of once per branch
    if total > 0:
        return 1
    if total < 0:
        return -1
    return 0

# Derive the target from the label columns (every column after the first two).
# 'is_toxic' is excluded so that re-running this cell does not feed the
# previously computed target back into the sum, and the assert stops the cell
# when train_df has already been trimmed to its final columns.
label_cols = [col for col in train_df.columns[2:] if col != 'is_toxic']
assert label_cols, 'no label columns in train_df - reload the raw CSV before re-running this cell'
train_df['is_toxic'] = train_df[label_cols].progress_apply(is_toxic, axis=1)

  0%|          | 0/223549 [00:00<?, ?it/s]

In [5]:
# Clean every comment, then keep only the cleaned text and the target column.
train_df = train_df.assign(
    comment_clean=train_df['comment_text'].progress_apply(clean_pipeline)
)[['comment_clean', 'is_toxic']]

  0%|          | 0/223549 [00:00<?, ?it/s]

In [11]:
# Stratified 80/10/10 split into train / validation / test.
# A fixed random_state makes the shuffled split reproducible across kernel
# restarts; without it every run produces different splits.
SPLIT_SEED = 42
train, holdout = train_test_split(
    train_df, test_size=0.2, shuffle=True, stratify=train_df['is_toxic'], random_state=SPLIT_SEED
)
# Halve the 20% holdout; a separate name keeps `test` from meaning two different things.
test, val = train_test_split(
    holdout, test_size=0.5, shuffle=True, stratify=holdout['is_toxic'], random_state=SPLIT_SEED
)
len(train), len(val), len(test)

(178839, 22355, 22355)

In [12]:
# Wrap the three pandas splits as a Hugging Face DatasetDict for easy loading later.
# preserve_index=False keeps the shuffled pandas index out of the Arrow table,
# so no '__index_level_0__' column is created and the cell no longer depends on
# that column being present in order to remove it afterwards.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train, preserve_index=False),
    'validation': Dataset.from_pandas(val, preserve_index=False),
    'test': Dataset.from_pandas(test, preserve_index=False)
})

# Saving is disabled here. The Tokenize section's load_from_disk reads this
# same path, so re-enable the line below when the on-disk copy must be refreshed.
# dataset.save_to_disk('../data/interim/toxic-cleaned')

### Tokenize

In [None]:
# Reload the cleaned splits from disk; this rebinds `dataset`, replacing the
# in-memory DatasetDict built in the Dataset section.
# NOTE(review): the matching save_to_disk call is commented out above, so this
# directory must already exist and may not match the current in-memory splits.
dataset = load_from_disk('../data/interim/toxic-cleaned/')

In [14]:
def tokenize_data(examples):
    """Tokenize a batch of cleaned comments and attach a label to every window.

    Reads ``examples['comment_clean']`` and ``examples['is_toxic']`` and uses
    the module-level ``tokenizer`` and ``max_seq_len``. Overflowing tokens are
    requested from the tokenizer, so one comment can yield several output
    rows; each of them receives the label of the comment it came from.

    Returns the tokenizer output with 'overflow_to_sample_mapping' removed
    and a 'labels' list added.
    """
    stripped = [comment.strip() for comment in examples['comment_clean']]

    encoded = tokenizer(
        stripped,
        max_length=max_seq_len,
        truncation=True,
        padding='max_length',
        stride=10,
        return_overflowing_tokens=True,
    )

    # For each output row, the mapping gives the index of the input comment it
    # was produced from; use it to repeat that comment's label.
    source_rows = encoded.pop('overflow_to_sample_mapping')
    encoded['labels'] = [examples['is_toxic'][row] for row in source_rows]

    return encoded

In [15]:
# Tokenize each split with the same settings; the original columns are dropped
# so that only the features returned by tokenize_data remain.
tokenized_dataset = DatasetDict({
    split_name: dataset[split_name].map(
        tokenize_data,
        batched=True,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ('train', 'validation', 'test')
})



  0%|          | 0/179 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

In [16]:
# Show the features and row count of each tokenized split.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 382550
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 47219
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 48330
    })
})

In [17]:
# Persist the tokenized splits locally, then upload the same DatasetDict to the Hub repository.
# NOTE(review): push_to_hub presumably needs an authenticated Hub session with
# write access to this repo — confirm before re-running.
tokenized_dataset.save_to_disk('../data/interim/toxic-cleaned-tokenized')
tokenized_dataset.push_to_hub('affahrizain/jigsaw-toxic-comment')

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]