<a href="https://colab.research.google.com/github/deekshakoul/Multilingual-Abuse-Comment-Detection/blob/main/MURIL_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run settings shared by the cells below.
config = {
    'seed': 42,        # random_state for the train/validation split
    'test_size': 0.1,  # fraction of rows held out for validation
    'model': 'google/muril-base-cased',
    'batch_size': 32,
    'num_epochs': 1,
    'max_length': 64,  # tokenizer padding/truncation length
    'adam_lr': 5e-5,
}

In [None]:
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

import re
import unidecode

from transformers import AutoTokenizer

from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler

from torch.utils.data import Dataset, DataLoader,TensorDataset
import torch

**About the data**

As the data is huge, we implement some processing to get sufficient and relevant data only.
* Removed emojis from the text. Comments containing only emojis were, most of the time, non-abusive (label 0), so the emojis were removed and the length of the remaining text was then checked.
* remove any links, digits and punctuation.
* Only used rows whose text length is between 3 and 16 words (inclusive), a range confirmed via the distribution of text lengths.

In [None]:
# Competition CSV files, given as paths relative to the working directory.
train = "ShareChat-IndoML-Datathon-NSFW-CommentChallenge_Train.csv"
test = "ShareChat-IndoML-Datathon-NSFW-CommentChallenge_Test_20_Percent_NoLabel.csv"

Reading train and test data

In [None]:
# Read both files as raw lines (first line included). The encoding is passed
# explicitly so decoding of the multilingual text does not depend on the
# platform default.
# NOTE(review): UTF-8 is assumed for the competition CSVs — confirm.
with open(train, 'r', encoding='utf-8') as f:
    t = f.readlines()
with open(test, 'r', encoding='utf-8') as f:
    tt = f.readlines()

In [None]:
# Build the labelled frame from the raw training lines.
# A line is kept only when it ends in ",0" or ",1"; everything before that
# separator becomes the text. Any other line (blank, or not ending in a label)
# is counted in `skip`.
txts = []
ids = []
labels = []
skip = 0
for i, raw in enumerate(t):
    if i == 0:
        continue  # first line is not data (presumably the CSV header)
    # Strip the line terminator first so the last line of the file is handled
    # the same way whether or not it ends with a newline.
    line = raw.rstrip("\r\n")
    if len(line) >= 2 and line[-2] == "," and line[-1] in ("0", "1"):
        txts.append(line[:-2])
        ids.append(i - 1)
        labels.append(int(line[-1]))
    else:
        skip += 1
df = pd.DataFrame({'ids': ids, 'text': txts, 'label': labels})

In [None]:
def cleaning(text):
    """Normalise one raw text line.

    Steps, in order: lower-case, replace commas selected by the regex below
    with a space, replace newlines with a space, drop digit runs, and trim
    surrounding whitespace.
    """
    lowered = text.lower()
    # A comma is replaced unless an odd number of double quotes follows it,
    # i.e. commas that sit inside a quoted field are left alone.
    without_separators = re.sub(
        ',(?!(?=[^"]*"[^"]*(?:"[^"]*"[^"]*)*$))', " ", lowered
    )
    single_line = without_separators.replace("\n", " ")
    without_digits = re.sub(r'[0-9]+', '', single_line)
    return without_digits.strip()

In [None]:
# Parse the raw test lines (the first line is skipped): the leading
# comma-separated field is the row id, and the whole line goes through
# cleaning().
# The cleaned texts are stored as `test_text`, the name read when the test
# dataset is built, instead of overwriting the `test` file path.
test_text = []
tids = []
for line in tt[1:]:
    tids.append(int(line.split(",")[0]))
    test_text.append(cleaning(line))

In [None]:
# Cleaned version of every training text.
df['final'] = df['text'].apply(cleaning)

Emoji removal using the `UNICODE_EMOJI` mapping from the `emoji` package

In [None]:
from emoji import UNICODE_EMOJI
def remove_emojis(s):
    """Return *s* without the characters listed in ``UNICODE_EMOJI['en']``."""
    emoji_table = UNICODE_EMOJI['en']
    kept = [ch for ch in s if ch not in emoji_table]
    return ''.join(kept)

In [None]:
# 'f' is the cleaned text with emoji characters removed; 'len' is its
# whitespace-separated word count.
df['f'] = df['final'].apply(remove_emojis)
df['len'] = df['f'].apply(lambda s: len(s.split()))

Selection of data based on their text lengths

In [None]:
# Keep rows whose word count lies in [3, 16].
length_ok = df['len'].between(3, 16)
df = df[length_ok]

In [None]:
# Shuffled hold-out split, stratified on the label column; the seed and the
# validation fraction come from config.
df_train, df_valid = train_test_split(
    df,
    test_size=config['test_size'],
    random_state=config['seed'],
    shuffle=True,
    stratify=df['label'].values,
)

# Discard the pre-split row index, then persist both splits.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
for frame, out_path in ((df_train, 'df_train.csv'), (df_valid, 'df_valid.csv')):
    frame.to_csv(out_path, index=False)

# Preprocessing 

Main tool is tokenizer
    
    - split text in words: tokens 
    - tokens into numbers - tensor
    - add additional inputs that our model needs(special tokens)
 
Ex. 
```
encoded_input = tokenizer("Hello, I'm a single sentence!")
print(encoded_input)

{'input_ids': [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102], 
     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
```  
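* input_ids - indices corresponding to each token
* token_type_ids - segment index of each token (used to tell the two sentences of a pair apart)
* attention_mask - 1 for tokens the model should attend to, 0 for padding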

Decoder example - 
```
tokenizer.decode(encoded_input["input_ids"]) 
Outputs - "[CLS] Hello, I'm a single sentence! [SEP]"
```

The tokenizer automatically added some special tokens that the model expects. If you don't want any additional tokens set add_special_tokens=False and add special tokens on your own.

If you have several sentences you want to process, you can do this efficiently by sending them as a list to the tokenizer.

**Note - If you plan on using a pretrained model, it’s important to use the associated pretrained tokenizer.**

In [None]:
# Load the tokenizer for the checkpoint named in config['model'].
checkpoint = config['model']
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# DATASET and DATALOADER

Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset to enable easy access to the samples

Follow this tutorial to create custom dataset on [pytorch](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files)

I have created a custom dataset class with the necessary three functions that are : \_\_init__, \_\_len__, and \_\_getitem__. 

In [None]:
class BERTDataset(Dataset):
    """Dataset over texts that are tokenized once, at construction time.

    ``target`` may be ``None`` (unlabelled data); in that case the items
    carry no ``'labels'`` entry.
    """

    def __init__(self, text, target, tokenizer):
        self.max_length = config['max_length']
        # Every sample is padded or truncated to max_length.
        self.dict = tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        self.target = target

    def __len__(self):
        return len(self.dict['input_ids'])

    def __getitem__(self, ids):
        # The three tokenizer outputs are always present, as long tensors.
        item = {
            key: torch.tensor(self.dict[key][ids], dtype=torch.long)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if self.target is not None:
            item['labels'] = torch.tensor(self.target[ids], dtype=torch.long)
        return item

In [None]:
# Plain Python lists of cleaned texts and labels for each split.
train_text = df_train['final'].to_list()
train_target = df_train['label'].to_list()
eval_text = df_valid['final'].to_list()
eval_target = df_valid['label'].to_list()
# NOTE: `test_text` is not built here (this notebook has no `df_test` frame).

In [None]:
# Wrap each split in a BERTDataset; the test split is built without labels.
train_dataset = BERTDataset(train_text, train_target, tokenizer)
eval_dataset = BERTDataset(eval_text, eval_target, tokenizer)
test_dataset = BERTDataset(test_text, None, tokenizer)

# Only the training loader shuffles its samples.
batch_size = config['batch_size']
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

As the dataset is huge, the tokenization can take a lot of RAM, so further training of the model wouldn't be possible in a Kaggle kernel. I have therefore saved the tokenization result too. [Code is taken from this [Kaggle post](https://www.kaggle.com/harveenchadha/tokenize-train-data-using-bert-tokenizer/notebook).] If you also want to save the tokenization and then reload the dataset, I have included that code in a different notebook: [reloading the saved tokenization](https://www.kaggle.com/deekoul/custom-dataset-dataloader-to-load-bert-tokenizer/).

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=config['max_length']):
    """Tokenize *texts* in slices of *chunk_size* and merge the results.

    Each slice is padded/truncated to *maxlen* (its default is read from
    ``config`` once, when this function is defined).

    Returns a dict with the keys ``'input_ids'``, ``'token_type_ids'`` and
    ``'attention_mask'``; each value is a list extended with the tokenizer
    output of every slice, in input order.
    """
    fields = ('input_ids', 'token_type_ids', 'attention_mask')
    encoded = {name: [] for name in fields}

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk_encoding = tokenizer(
            texts[start:start + chunk_size],
            max_length=maxlen,
            padding='max_length',
            truncation=True,
        )
        for name in fields:
            encoded[name].extend(chunk_encoding[name])

    return encoded

In [None]:
# Tokenize the emoji-free text column of each split and attach the labels
# under the 'label' key.
train_texts = list(df_train['f'].values)
token_train = fast_encode(train_texts, tokenizer)
token_train['label'] = list(df_train['label'].values)

valid_texts = list(df_valid['f'].values)
token_valid = fast_encode(valid_texts, tokenizer)
token_valid['label'] = list(df_valid['label'].values)

In [None]:
# Display the keys stored for each tokenized split.
(token_train.keys(), token_valid.keys())

SAVING TOKENIZATION.

Please check the post on how to reload it in train_dataloader and valid_dataloader.

In [None]:
import numpy as np
# Persist the tokenized splits. The payloads are dicts rather than arrays;
# presumably np.load needs allow_pickle=True to read them back — verify.
for out_path, tokens in (('token_train.npy', token_train),
                         ('token_valid.npy', token_valid)):
    np.save(out_path, tokens)

Instantiated the model

In [None]:
# `device` is read by model.to() here and by the training and evaluation
# code below. Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Two-class sequence classifier on top of the checkpoint in config['model'].
model = AutoModelForSequenceClassification.from_pretrained(config['model'], num_labels=2)
model.to(device)

In [None]:
# Disabled debugging snippet, kept as a string literal: it would print the
# shape of the 'labels' tensor of the first training batch.
'''
for i in train_dataloader:
    print(i['labels'].shape)
    break

'''

In [None]:
# One scheduler step per training batch: a "linear" schedule without warm-up
# over every batch of every epoch.
num_training_steps = config['num_epochs'] * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=config['adam_lr'])
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Fine-tune for config['num_epochs'] epochs and validate after each one.
# NOTE: evaluate() must already be defined when this cell is run.
progress_bar = tqdm(range(num_training_steps))
for epoch in range(config['num_epochs']):
    print("epoch - ", epoch)
    # evaluate() switches the model to eval mode, so training mode has to be
    # re-enabled at the start of every epoch, not only once before the loop.
    model.train()
    tr_loss = 0.0  # loss summed over the batches of this epoch
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        tr_loss += loss.item()
    print('training done with loss- ', tr_loss)
    # check validation
    a_, b_ = evaluate(model, eval_dataloader)
progress_bar.close()

In [None]:
def evaluate(model, eval_dataloader):
    """Run *model* over *eval_dataloader* and report F1 and the summed loss.

    The model is put in eval mode and is left in eval mode on return. The F1
    score and the loss summed over all batches are printed.

    Returns:
        ``(predictions, labels)``: the argmax predictions and the ``'labels'``
        entries of every batch, concatenated in loader order.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.metrics import f1_score

    model.eval()
    vloss_final = 0.0
    batch_predictions = []
    batch_labels = []
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        batch_predictions.append(torch.argmax(outputs.logits, dim=-1))
        batch_labels.append(batch['labels'])
        # Every batch contributes to the loss, including the first one.
        vloss_final += outputs.loss.item()

    if not batch_predictions:
        raise ValueError("eval_dataloader yielded no batches")

    predict_ = torch.cat(batch_predictions)
    truth = torch.cat(batch_labels)
    print("validation done")
    # scikit-learn takes (y_true, y_pred) and needs host-side arrays, so the
    # tensors are copied to the CPU for scoring only.
    f1 = f1_score(truth.cpu().numpy(), predict_.cpu().numpy())
    print(f1, vloss_final)
    return predict_, truth