## Dataset

In [1]:
from tqdm.auto import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
import re
import string

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer

In [2]:
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [3]:
max_seq_len = 65

In [25]:
train_df = pd.read_csv('../data/raw/jigsaw/jigsaw-multiling-1st-subset/train.csv')

In [54]:
train_df[train_df['comment_text'].str.contains('#')].comment_text.values

array(['"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "',
       '"\nFair use rationale for Image:Wonju.jpg\n\nThanks for uploading Image:Wonju.jpg. I notice the image page specifies that the image is being used under fair use but there is no explanation or rationale as to why its use in Wikipedia articles constitutes fair use. In addition to the boilerplate fair use template, you must also write out on the image de

In [84]:
# clean
def rm_ip_address(text):
    return re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)

def rm_link(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def rm_inappropriate_sym(text):
    return re.sub(r'[\:\%\=\~]', ' ', text)

def rm_money(text):
    return re.sub(r'\$\s?((?:\d+[A-z])|((?:\d+[\,\.])+\d+(?=\s))|((?:\d+)))', r'', text)

def space_between_sym(text):    
    return re.sub(r'([\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~])', r' \1 ', text)

def rm_additional_space(text):
    return re.sub(r' +', ' ', text)
    
def rm_email(text):
    return re.sub(r'(?:(\S+)?\@\S+)', r'', text)

def rm_middle_dot(text):
    return re.sub(r'(?<=\w)\.(?=\w+)', '', text)

def rm_middle_spaces(text):
    return re.sub(r'(?<=\w)\s(?=\w+)', '', text)

def clean_pipeline(text):
    no_sym = rm_inappropriate_sym(text)
    no_ip_address = rm_ip_address(no_sym)
    no_link = rm_link(no_ip_address)
    no_emoji = rm_emoji(no_link)
    no_nonascii = rm_nonascii(no_emoji)
    no_email = rm_email(no_nonascii)    
    no_mid_dots = rm_middle_dot(no_email)
    space_between = space_between_sym(no_mid_dots)
    single_space = rm_additional_space(space_between)

    return single_space

In [None]:
def is_toxic(row):
    return 1 if row.sum() > 1 else -1 if row.sum() < 0 else 0

train_df['is_toxic'] = train_df.iloc[:, 2:].progress_apply(is_toxic, axis=1)

In [85]:
train_df['comment_clean'] = train_df.comment_text.progress_apply(clean_pipeline)
# train_df = train_df[['comment_clean', 'is_toxic']]

100%|██████████| 223549/223549 [00:40<00:00, 5537.45it/s]


In [92]:
train, test = train_test_split(train_df, test_size=0.2, shuffle=True, stratify=train_df['is_toxic'])
test, val = train_test_split(test, test_size=0.5, shuffle=True, stratify=test['is_toxic'])
len(train), len(val), len(test)

(178839, 22355, 22355)

In [93]:
# save to HF dataset format for easy load later
dataset = DatasetDict({
    'train': Dataset.from_pandas(train),
    'validation': Dataset.from_pandas(val),
    'test': Dataset.from_pandas(test)
})

dataset = dataset.remove_columns('__index_level_0__')

# dataset.save_to_disk('../data/interim/toxic-cleaned')

### Tokenize

In [None]:
dataset = load_from_disk('../data/interim/toxic-cleaned/')

In [94]:
def tokenize_data(examples):

    texts = [t.strip() for t in examples['comment_clean']]    

    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_seq_len,
        padding='max_length',
        stride=10,
        return_overflowing_tokens=True,
    )

    tokenized['labels'] = []    

    # extract mapping between new and old indices
    ### this operation simply populate labels over the overflowing tokens    
    sample_map = tokenized.pop('overflow_to_sample_mapping')    
    for i in sample_map:
        tokenized['labels'].append(examples['is_toxic'][i])
        
    return tokenized

In [95]:
tokenized_dataset = DatasetDict({
    'train': dataset['train'].map(tokenize_data, batched=True, remove_columns=dataset['train'].column_names),
    'validation': dataset['validation'].map(tokenize_data, batched=True, remove_columns=dataset['validation'].column_names),
    'test': dataset['test'].map(tokenize_data, batched=True, remove_columns=dataset['test'].column_names)
})

100%|██████████| 179/179 [00:51<00:00,  3.50ba/s]
100%|██████████| 23/23 [00:06<00:00,  3.45ba/s]
100%|██████████| 23/23 [00:05<00:00,  3.93ba/s]


In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 382550
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 47219
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 48330
    })
})

In [98]:
tokenized_dataset.set_format('torch')

In [105]:
from collections import Counter

flatten_train = tokenized_dataset['train']['input_ids'].view(-1)
flatten_val = tokenized_dataset['validation']['input_ids'].view(-1)
flatten_test = tokenized_dataset['test']['input_ids'].view(-1)

In [113]:
flatten_all = list(set(flatten_train.tolist() + flatten_val.tolist() + flatten_test.tolist()))

In [115]:
# tokenized_dataset.save_to_disk('../data/interim/toxic-cleaned-tokenized')
tokenized_dataset.push_to_hub('affahrizain/jigsaw-toxic-comment')

Pushing split train to the Hub.
The repository already exists: the `private` keyword argument will be ignored.
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:42<00:00, 42.42s/it]
Pushing split validation to the Hub.
The repository already exists: the `private` keyword argument will be ignored.
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:07<00:00,  7.80s/it]
Pushing split test to the Hub.
The repository already exists: the `private` keyword argument will be ignored.
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it]
