In [None]:
!pip install git+https://github.com/huggingface/optimum

In [None]:
!pip install git+https://github.com/huggingface/accelerate

In [None]:
!pip install transformers --upgrade

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import transformers
from optimum.bettertransformer import BetterTransformer
from accelerate import Accelerator, notebook_launcher # main interface, distributed launcher
from accelerate.utils import set_seed # reproducability across devices
import nltk

# Download the NLTK resources needed by the tokenizer and POS tagger used below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import os

# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the next cell reads it.
dir = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Each file overwrites the previous one, so the last file visited wins.
        dir = os.path.join(dirname, filename)

# Fail here with a clear message instead of later inside pd.read_csv(None).
if dir is None:
    raise FileNotFoundError("No input file found under /kaggle/input")

In [None]:
# Only these two columns are loaded from the input CSV.
read = ["comment_id", "comment_text"]
# `dir` is the input path discovered by the os.walk loop in the setup cell.
df_comments = pd.read_csv(dir, usecols=read)

In [None]:
def pos_tag(sentence):
    """Tokenize `sentence` with NLTK and return NLTK's POS tags for the tokens."""
    return nltk.pos_tag(nltk.word_tokenize(sentence))

# Tag every comment once; the resulting lists feed the question heuristic below.
df_comments['pos_tags'] = df_comments.comment_text.apply(pos_tag)

In [None]:
def is_question(tags):
    """Return True when the first three POS tags look like verb / pronoun / verb.

    `tags` is a sequence of (token, tag) pairs. The result is True only when
    there are at least three pairs and the tags of the first three start with
    'VB', 'PR' and 'VB' respectively; otherwise it is False.
    """
    leading_prefixes = ('VB', 'PR', 'VB')
    # all() stops at the first mismatch, so later tags are only inspected
    # when the earlier ones matched.
    return len(tags) > 2 and all(
        tags[position][1].startswith(prefix)
        for position, prefix in enumerate(leading_prefixes)
    )

In [None]:
# A comment counts as a question when any one of three signals fires.
question_openers = ("What", "When", "Where", "Which", "Who", "Whom", "Whose", "Why", "How", "Could", "Should", "Would", "Can")

# 1) verb / pronoun / verb opening according to the POS tags
tagged_as_question = df_comments.pos_tags.apply(is_question)
# 2) text starts with one of the (case-sensitive) opener words
starts_with_opener = df_comments.comment_text.str.startswith(question_openers)
# 3) text contains a literal question mark
has_question_mark = df_comments.comment_text.str.contains("?", regex=False)

is_question_cond = tagged_as_question | starts_with_opener | has_question_mark

In [None]:
# Store the combined question signal on the frame, cast to bool.
df_comments["questions"] = is_question_cond.astype(bool)

In [None]:
# Base URL of the Detoxify GitHub release downloads.
DOWNLOAD_URL = "https://github.com/unitaryai/detoxify/releases/download/"

# Checkpoint URL for each model variant, built from its release-relative path.
MODEL_URLS = {
    variant: DOWNLOAD_URL + relative_path
    for variant, relative_path in (
        ("original", "v0.1-alpha/toxic_original-c1212f89.ckpt"),
        ("unbiased", "v0.3-alpha/toxic_debiased-c7548aa0.ckpt"),
        ("multilingual", "v0.4-alpha/multilingual_debiased-0b549669.ckpt"),
        ("original-small", "v0.1.2/original-albert-0e1d6498.ckpt"),
        ("unbiased-small", "v0.1.2/unbiased-albert-c8519128.ckpt"),
    )
}

In [None]:
def get_model_and_tokenizer(
    model_type, model_name, tokenizer_name, num_classes, state_dict, huggingface_config_path=None
):
    """Instantiate a transformers model and its tokenizer from class names.

    Args:
        model_type: identifier used as config/tokenizer source when no local
            config path is given.
        model_name: name of the model class to look up on `transformers`.
        tokenizer_name: name of the tokenizer class to look up on `transformers`.
        num_classes: forwarded to the model as `num_labels`.
        state_dict: weights forwarded to `from_pretrained`.
        huggingface_config_path: optional path; when given it replaces
            `model_type` as the source and `local_files_only` is set to True.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    # A local config path, when supplied, takes precedence over the model type
    # and also switches both loaders to local-files-only mode.
    source = huggingface_config_path or model_type
    local_only = huggingface_config_path is not None

    model = getattr(transformers, model_name).from_pretrained(
        pretrained_model_name_or_path=None,
        config=source,
        num_labels=num_classes,
        state_dict=state_dict,
        local_files_only=local_only,
    )

    tokenizer_class = getattr(transformers, tokenizer_name)
    tokenizer = tokenizer_class.from_pretrained(source, local_files_only=local_only)

    return model, tokenizer

In [None]:
def load_checkpoint(model_type="original", checkpoint=None, device="cpu", huggingface_config_path=None):
    """Load a checkpoint and build the model, tokenizer and class names from it.

    Args:
        model_type: key into MODEL_URLS; used only when `checkpoint` is None.
        checkpoint: optional path to a local checkpoint file. When None the
            checkpoint for `model_type` is downloaded instead.
        device: passed as `map_location` when loading the checkpoint.
        huggingface_config_path: forwarded to get_model_and_tokenizer.

    Returns:
        A `(model, tokenizer, class_names)` tuple, where `class_names` are the
        checkpoint's class names with a few legacy names renamed.

    Raises:
        KeyError: if `checkpoint` is None and `model_type` is not in MODEL_URLS.
        ValueError: if the loaded checkpoint lacks "config" or "state_dict".
    """
    if checkpoint is None:
        checkpoint_path = MODEL_URLS[model_type]
        loaded = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=device)
    else:
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        loaded = torch.load(checkpoint, map_location=device)

    # Validate whichever source was used, so a malformed checkpoint produces
    # this message rather than a bare KeyError further down.
    if "config" not in loaded or "state_dict" not in loaded:
        raise ValueError(
            "Checkpoint needs to contain the config it was trained "
            "with as well as the state dict"
        )

    class_names = loaded["config"]["dataset"]["args"]["classes"]
    # Map legacy class names onto the names used by the rest of the notebook.
    change_names = {
        "toxic": "toxicity",
        "identity_hate": "identity_attack",
        "severe_toxic": "severe_toxicity",
    }
    class_names = [change_names.get(cl, cl) for cl in class_names]

    model, tokenizer = get_model_and_tokenizer(
        **loaded["config"]["arch"]["args"],
        state_dict=loaded["state_dict"],
        huggingface_config_path=huggingface_config_path,
    )

    return model, tokenizer, class_names

In [None]:
# Called with defaults: the "original" entry of MODEL_URLS, loaded with map_location "cpu".
model, tokenizer, class_names = load_checkpoint()

In [None]:
# Replace the model with its BetterTransformer conversion from Optimum;
# keep_original_model=False is passed, so only the converted model is referenced from here on.
model = BetterTransformer.transform(model, keep_original_model=False)

In [None]:
print('Number of test sentences: {:,}\n'.format(df_comments.shape[0]))

sentences = df_comments.comment_text.values

# Tokenize every comment in one batched call. Each row is padded to, and
# explicitly truncated at, 512 tokens, so the result is already a pair of
# (num_sentences, 512) tensors. `padding="max_length"` replaces the deprecated
# `pad_to_max_length=True`, and `truncation=True` makes the length cap explicit.
encoded = tokenizer(
    sentences.tolist(),
    add_special_tokens=True,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

input_ids = encoded["input_ids"]
attention_masks = encoded["attention_mask"]

batch_size = 32



In [None]:
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
import pickle
# Pair each row of token ids with its attention mask and iterate in the
# original row order, so predictions line up with df_comments.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    dataset=prediction_data,
    batch_size=batch_size,
    sampler=prediction_sampler,
)

In [None]:
def training_loop(model, dataloader, mixed_precision:str="fp16", seed:int=42, batch_size:int=32):
    """Run inference over `dataloader` and pickle the per-batch sigmoid scores.

    Despite the name, no training happens here: the model is put in eval mode
    and only forward passes are executed.

    Args:
        model: model called as `model(input_ids, token_type_ids=None,
            attention_mask=mask)`; the first element of its output is used
            as the logits.
        dataloader: yields `(input_ids, attention_mask)` batches.
        mixed_precision: precision mode handed to `Accelerator` when CUDA is
            available; on other devices full precision ("no") is used.
        seed: seed passed to `set_seed`.
        batch_size: unused; kept so existing positional callers keep working.

    Side effects:
        Writes the list of per-batch numpy arrays to
        /kaggle/working/preds.pickle (main process only).
    """
    set_seed(seed)

    # fp16 autocast needs a GPU; fall back to full precision elsewhere so that
    # CPU runs behave as they did before this argument was honoured.
    precision = mixed_precision if torch.cuda.is_available() else "no"
    accelerator = Accelerator(mixed_precision=precision)

    # prepare() also places the model and the batches on accelerator.device.
    model, dataloader = accelerator.prepare(model, dataloader)
    model.eval()

    predictions = []

    for b_input_ids, b_input_mask in dataloader:
        with torch.no_grad():
            outputs_toxic = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
            )

        logits_toxic = outputs_toxic[0]

        # gather_for_metrics drops samples duplicated to even out the last
        # batch across processes; with one process it matches gather().
        gathered = accelerator.gather_for_metrics(logits_toxic)
        predictions.append(torch.sigmoid(gathered.float()).cpu().detach().numpy())

    # Only one process writes the file, and the handle is closed deterministically.
    if accelerator.is_main_process:
        with open("/kaggle/working/preds.pickle", "wb") as preds_file:
            pickle.dump(predictions, preds_file)
    

In [None]:
%%time
# Positional arguments for training_loop: (model, dataloader, mixed_precision, seed, batch_size).
args = (model, prediction_dataloader, "fp16", 42, 32)
# Run training_loop through Accelerate's notebook launcher with a single process.
notebook_launcher(training_loop, args, num_processes=1)

In [None]:
# Read back the per-batch score arrays written by training_loop. The file is
# produced by this notebook itself; do not point this at untrusted pickles.
with open("/kaggle/working/preds.pickle", "rb") as preds_file:
    predictions = pickle.load(preds_file)
flat_predictions = np.concatenate(predictions, axis=0)
name = "negative"
# One score column per class name, in the order reported by the checkpoint.
df_comments[class_names] = flat_predictions

# Drop comments that start with "@" and any row with a missing value.
df_comments = df_comments[~df_comments.comment_text.str.startswith("@")].dropna()

# A comment is flagged when all four score conditions hold.
# Assumes class_names contains toxicity, obscene, insult and identity_attack.
cond = (
    (df_comments.toxicity > df_comments.toxicity.median())
    & (df_comments.obscene < 0.15)
    & (df_comments.insult > df_comments.insult.quantile(0.9))
    & (df_comments.identity_attack > df_comments.identity_attack.quantile(0.99))
)
df_comments[name] = cond

# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
df_comments.drop(columns=class_names).reset_index().to_csv("preds.csv")