<a href="https://colab.research.google.com/github/esh04/Detoxifier/blob/main/Detox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import torch

# Preprocessing

In [2]:
# Load the train and test splits from CSV files under ../dataset
# (paths are relative to the notebook's working directory).
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

In [3]:
# Preview the first rows of the training frame as loaded.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Replace missing comments with the placeholder "unknown"; clean_text calls
# string methods on every value, so NaN entries must not reach it.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained in-place
# mutation, which emits a FutureWarning on recent pandas and does not modify
# the parent DataFrame at all once Copy-on-Write is enabled.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")

In [10]:
# cleaning the data
import re
import string

# Patterns are compiled once. Raw-string literals keep the regex escapes
# (\[, \S, \w, \d) from being parsed as invalid *string* escapes, which newer
# Python versions warn about.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_TAG_RE = re.compile(r'<.*?>+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_HAS_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Normalise one comment string.

    The steps run in this order (the order matters, e.g. URLs must be removed
    before punctuation is stripped):

    1. lower-case the text;
    2. remove ``[...]`` spans (non-greedy, within a single line);
    3. remove ``http(s)://...`` and ``www....`` URLs;
    4. remove ``<...>`` tags;
    5. delete every ``string.punctuation`` character (no space is inserted);
    6. turn newlines into single spaces;
    7. remove every word that contains a digit.

    Whitespace is not collapsed, so removed spans can leave double spaces.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _TAG_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    text = text.replace('\n', ' ')
    text = _HAS_DIGIT_RE.sub('', text)
    return text

In [11]:
# Store the cleaned version of every comment in a new `text` column,
# for the training and the test frame alike.
train['text'] = train['comment_text'].apply(clean_text)
test['text'] = test['comment_text'].apply(clean_text)

In [12]:
# Preview the frame again; it now carries the cleaned `text` column.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour im seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying to edit war its j...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i cant make any real suggestions on impr...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...


In [14]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [15]:
# Names of the six binary target columns.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Collapse the per-label columns into a single list-valued `labels` column
# (one list of six values per row).
train['labels'] = train[label_columns].apply(lambda row: list(row), axis=1)

# `id` and the individual label columns are no longer needed; drop them in place.
train.drop(columns=['id'] + label_columns, inplace=True)


Unnamed: 0,comment_text,text,tokenized_text,labels
0,Explanation\nWhy the edits made under my usern...,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,daww he matches this background colour im seem...,"[daww, he, matches, this, background, colour, ...","[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...",hey man im really not trying to edit war its j...,"[hey, man, im, really, not, trying, to, edit, ...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...",more i cant make any real suggestions on impr...,"[more, i, cant, make, any, real, suggestions, ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...",you sir are my hero any chance you remember wh...,"[you, sir, are, my, hero, any, chance, you, re...","[0, 0, 0, 0, 0, 0]"


In [16]:
# Display the training frame after the label columns were merged into `labels`.
train

Unnamed: 0,comment_text,text,tokenized_text,labels
0,Explanation\nWhy the edits made under my usern...,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,daww he matches this background colour im seem...,"[daww, he, matches, this, background, colour, ...","[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...",hey man im really not trying to edit war its j...,"[hey, man, im, really, not, trying, to, edit, ...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...",more i cant make any real suggestions on impr...,"[more, i, cant, make, any, real, suggestions, ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...",you sir are my hero any chance you remember wh...,"[you, sir, are, my, hero, any, chance, you, re...","[0, 0, 0, 0, 0, 0]"
...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",and for the second time of asking when your vi...,"[and, for, the, second, time, of, asking, when...","[0, 0, 0, 0, 0, 0]"
159567,You should be ashamed of yourself \n\nThat is ...,you should be ashamed of yourself that is a ...,"[you, should, be, ashamed, of, yourself, that,...","[0, 0, 0, 0, 0, 0]"
159568,"Spitzer \n\nUmm, theres no actual article for ...",spitzer umm theres no actual article for pro...,"[spitzer, umm, theres, no, actual, article, fo...","[0, 0, 0, 0, 0, 0]"
159569,And it looks like it was actually you who put ...,and it looks like it was actually you who put ...,"[and, it, looks, like, it, was, actually, you,...","[0, 0, 0, 0, 0, 0]"


In [22]:
# Run configuration.
MAX_LEN = 320                               # sequence length handed to MultiLabelDataset
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32    # batch sizes for the two DataLoaders
EPOCHS = 2                                  # passes over the training loader
LEARNING_RATE = 1e-05

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
print(DEVICE)

cpu


In [23]:
class MultiLabelDataset(Dataset):
    """Dataset that tokenizes one comment per item for multi-label classification.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must expose a ``text`` column and, unless ``new_data`` is true, a
        ``labels`` column holding one list of label values per row.
    tokenizer
        Object with an ``encode_plus`` method that returns a mapping with
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` (the call
        below uses the Hugging Face tokenizer keyword arguments).
    max_len : int
        Every encoded sequence is padded or truncated to exactly this length.
    new_data : bool, default False
        When true the frame is treated as unlabeled and items carry no
        ``targets`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.new_data = new_data
        self.max_len = max_len

        if not new_data:
            self.targets = self.data.labels

    def __len__(self):
        """Number of rows in the frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded tensors (and targets, if labeled) for row ``index``.

        ``index`` is positional (0 .. len-1), which is what a DataLoader
        sampler produces, so the frame does not need a freshly reset index.
        """
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Fixed-length output: pad short texts and cut long ones. Without
            # truncation, over-long comments yield tensors of differing
            # length that the default collate function cannot stack.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        out = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }

        if not self.new_data:
            out['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)

        return out

In [20]:
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook afterwards.
# The recorded run of this cell resolved transformers 4.23.1.
%pip install transformers


Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m187.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.6/197.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp310-cp310-macosx_10_11_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m957.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting filelock
  Using cache

In [26]:
from transformers import DistilBertTokenizer, DistilBertModel
import nltk

# NLTK's punkt data and word_tokenize stay available for plain word splitting.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# MultiLabelDataset calls tokenizer.encode_plus(...). A bare function such as
# word_tokenize has no such method, so indexing the dataset would raise
# AttributeError. The dataset therefore needs a Hugging Face tokenizer object;
# the DistilBertTokenizer imported in this cell is used.
# NOTE(review): 'distilbert-base-uncased' is assumed because clean_text
# lower-cases the input -- confirm it matches the model that gets trained.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


[nltk_data] Downloading package punkt to /Users/eshk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Fraction of the rows that goes to the training split; the rest is validation.
train_size = 0.8

# Sample the training rows with a fixed seed so the split is reproducible.
train_df = train.sample(frac=train_size, random_state=123)
# Order matters: the validation rows are "everything not sampled", identified
# through train_df's ORIGINAL index, so this line must run before that index
# is reset on the next line.
val_df = train.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)


# Report the shapes of the full frame and of the two splits.
print("Orig Dataset: {}".format(train.shape))
print("Training Dataset: {}".format(train_df.shape))
print("Validation Dataset: {}".format(val_df.shape))


# Wrap both splits as labeled datasets (new_data keeps its default of False).
training_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)
val_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)

Orig Dataset: (159571, 4)
Training Dataset: (127657, 4)
Validation Dataset: (31914, 4)


In [30]:
# DataLoader settings: only the training loader shuffles its samples.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False)

# Batch iterators over the two datasets.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [None]:
def train(epoch):
    """Train the global ``model`` on ``training_loader`` for ``EPOCHS`` epochs.

    Each batch is moved to ``DEVICE``, passed through the model, and optimised
    with binary cross-entropy on the raw outputs (treated as logits). The loss
    is printed every 5000 steps together with the epoch currently running.

    Parameters
    ----------
    epoch
        Unused. The function iterates over all ``EPOCHS`` itself; the
        parameter is kept only so existing ``train(...)`` calls keep working.

    Notes
    -----
    Relies on the module-level names ``model``, ``optimizer``,
    ``training_loader``, ``tqdm``, ``EPOCHS`` and ``DEVICE``.
    NOTE(review): defining this function rebinds the name ``train``, which
    earlier cells use for the training DataFrame -- re-running those cells
    afterwards needs the DataFrame to be reloaded.
    """
    for current_epoch in range(EPOCHS):
        model.train()

        for step, data in tqdm(enumerate(training_loader, 0)):
            ids = data['ids'].to(DEVICE, dtype=torch.long)
            attention_mask = data['mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
            targets = data['targets'].to(DEVICE, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, attention_mask, token_type_ids)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

            if step % 5000 == 0:
                # Report the epoch that is actually running, not the unused argument.
                print(f'Epoch: {current_epoch}, Loss:  {loss.item()}')

            loss.backward()
            optimizer.step()

In [None]:
# Per-batch probability tensors produced by the most recent test() call.
all_test_pred = []

def test(epoch):
    """Run the global ``model`` over ``test_loader`` and collect probabilities.

    For every batch the model outputs are passed through a sigmoid and the
    resulting tensor is appended to the module-level ``all_test_pred`` list,
    which is emptied first so repeated calls do not accumulate duplicates.

    Parameters
    ----------
    epoch
        Unused; kept so existing calls keep working.

    Returns
    -------
    torch.Tensor or None
        The probabilities of the LAST batch only (``None`` if the loader
        yields no batches). The predictions for the whole loader are in
        ``all_test_pred``.
        NOTE(review): returning only the last batch looks unintended --
        confirm what callers expect before changing it.

    Notes
    -----
    Relies on the module-level names ``model``, ``test_loader``, ``tqdm`` and
    ``DEVICE``.
    NOTE(review): defining this function rebinds the name ``test``, which
    earlier cells use for the test DataFrame.
    """
    model.eval()
    all_test_pred.clear()
    probas = None  # stays None when the loader is empty

    with torch.inference_mode():
        for data in tqdm(test_loader):
            ids = data['ids'].to(DEVICE, dtype=torch.long)
            attention_mask = data['mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)

            outputs = model(ids, attention_mask, token_type_ids)
            probas = torch.sigmoid(outputs)
            all_test_pred.append(probas)

    return probas
# The argument is ignored by test(); `model` is passed as in the original call.
probas = test(model)

# MASK

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import re

# load tokenizer and model weights
# Both globals are (re)bound here: from this cell on, `tokenizer` and `model`
# refer to the 'SkolkovoInstitute/roberta_toxicity_classifier' checkpoint,
# loaded as a RoBERTa sequence classifier.
tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')

# # # prepare the input
# batch = tokenizer.encode('you are amazing', return_tensors='pt')
# print(batch)
# print(model(batch))
# output = torch.nn.functional.softmax(model(batch).logits, dim = -1)
# # inference
# print(output)

# batch = tokenizer.encode('these clowns are useless', return_tensors='pt')
# print(batch)

# # print(model.predict('you are of no use at all'))

# print(model(batch))
# output = torch.nn.functional.softmax(model(batch).logits, dim = -1)
# print(float(output[0][1]))

# batch = tokenizer.encode('are', return_tensors='pt')
# print(batch)


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[   0, 6968,   32, 2770,    2]])
SequenceClassifierOutput(loss=None, logits=tensor([[ 4.6567, -4.9115]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[9.9993e-01, 6.9912e-05]], grad_fn=<SoftmaxBackward0>)
tensor([[    0, 29902, 21068,    29,    32, 23584,     2]])
SequenceClassifierOutput(loss=None, logits=tensor([[-3.7082,  3.5778]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
0.9993153810501099
tensor([[   0, 1322,    2]])


In [None]:
def _toxicity_score(sentence):
    """Return the classifier's class-1 (toxic) probability for ``sentence``.

    Uses the module-level ``tokenizer`` and ``model``.
    """
    # Inference only: no autograd graph is needed for scoring.
    with torch.no_grad():
        batch = tokenizer.encode(sentence, return_tensors='pt')
        probs = torch.nn.functional.softmax(model(batch).logits, dim=-1)
    return float(probs[0][1])


def mask(sentence, threshold=0.25, scorer=None):
    """Greedily replace words by ``[MASK]`` until the sentence scores as non-toxic.

    Every non-letter character is first replaced by a space and the sentence
    is padded with one space on each side, then split on single spaces. In
    each round every remaining word is masked in turn, the candidate sentence
    is scored, and the candidate with the lowest score is kept. Rounds repeat
    until the best score drops below ``threshold``.

    The search also stops when a round brings no improvement (for example
    when every word is already masked), so the call always terminates.

    Parameters
    ----------
    sentence : str
        Input text.
    threshold : float, default 0.25
        Toxicity score below which masking stops.
    scorer : callable, optional
        Maps a sentence string to a toxicity score. Defaults to scoring with
        the module-level ``tokenizer`` and ``model``.

    Returns
    -------
    str
        The space-joined sentence with the chosen words replaced by
        ``[MASK]``. Punctuation and digits appear as spaces. If no candidate
        ever improves on the initial bound of 1, the cleaned sentence is
        returned without any mask.

    Notes
    -----
    At least one round always runs, so a word is normally masked even when
    the input is already below ``threshold``.
    """
    if scorer is None:
        scorer = _toxicity_score

    cleaned = re.sub(r'[^a-zA-Z]', ' ', sentence)
    tokens = (' ' + cleaned + ' ').split(' ')

    masked_sentence = ' '.join(tokens)
    min_tox = 1
    while True:
        best_tokens = None
        for i, token in enumerate(tokens):
            # Empty slots (from consecutive spaces) hold no word, and
            # re-masking a masked slot cannot change the sentence.
            if token == '' or token == '[MASK]':
                continue

            candidate = tokens.copy()
            candidate[i] = '[MASK]'
            candidate_sentence = ' '.join(candidate)

            toxic_score = scorer(candidate_sentence)
            if min_tox > toxic_score:
                min_tox = toxic_score
                masked_sentence = candidate_sentence
                best_tokens = candidate

        # No candidate beat the best score so far: further rounds would
        # repeat the same work forever, so stop with the best result found.
        if best_tokens is None:
            break

        tokens = best_tokens
        if min_tox < threshold:
            break

    return masked_sentence




In [None]:
# Mask an example sentence, then score the masked result with the classifier.
sent = mask("hey loser , try this get a fucking life and stay out of mine , which you know nothing about ")
batch = tokenizer.encode(sent, return_tensors ='pt')
output = torch.nn.functional.softmax(model(batch).logits,dim = -1)
# Index 1 of the softmax output is the value mask() treats as the toxicity score.
print(output[0][1])
print(sent)

tensor(0.0528, grad_fn=<SelectBackward0>)
 hey [MASK]   try this get a [MASK] life and stay out of mine   which you know nothing about  
