In [None]:
# Install Optimum directly from its GitHub repository; it provides the
# `optimum.bettertransformer` import used further down.
# No tag or commit is pinned, so the installed revision depends on when this cell runs.
!pip install git+https://github.com/huggingface/optimum

In [None]:
# Install Accelerate directly from its GitHub repository; it provides the
# `Accelerator` and `notebook_launcher` imports used further down.
# No tag or commit is pinned, so the installed revision depends on when this cell runs.
!pip install git+https://github.com/huggingface/accelerate

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from optimum.bettertransformer import BetterTransformer
from accelerate import Accelerator, notebook_launcher # main interface, distributed launcher
from accelerate.utils import set_seed # reproducability across devices

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) walks every file under the input directory and keeps the last path found in `dir`
import os

# Path of the input file to load. NOTE: `dir` shadows the built-in of the same
# name; the name is kept because the CSV-loading cell reads it.
dir = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Each file visited overwrites the previous one, so the last file wins.
        dir = os.path.join(dirname, filename)

# Fail here with an actionable message instead of handing `None` to the loader.
if dir is None:
    raise FileNotFoundError("No input files found under /kaggle/input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Only the comment id and its text are needed; every other column is skipped.
read = ["comment_id", "comment_text"]
# Rows without text are dropped at load time: the POS tagging, the `.str`
# filters and the tokenizer further down all operate on `comment_text` as text.
# The index is deliberately not reset, so surviving rows keep their original labels.
df_comments = pd.read_csv(dir, usecols=read).dropna(subset=["comment_text"])

In [None]:

import nltk
# Fetch the NLTK data packages this notebook relies on
# (only needs to run once per environment).
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

In [None]:
def pos_tag(sentence):
    """Tokenise `sentence` with NLTK and return its part-of-speech tags.

    Returns whatever `nltk.pos_tag` yields for the tokens produced by
    `nltk.word_tokenize(sentence)`.
    """
    return nltk.pos_tag(nltk.word_tokenize(sentence))

# Tag every entry of the 'comment_text' column and keep the result per row.
df_comments['pos_tags'] = df_comments['comment_text'].map(pos_tag)

In [None]:
def is_question(tags):
    """Heuristically decide whether a POS-tagged sentence opens like a question.

    The sentence is treated as a question when its first three tags follow the
    pattern "auxiliary, pronoun, verb" (e.g. "Do you know", "Can you help").

    Args:
        tags: Sequence of (token, tag) pairs; only the tag at index 1 of each
            of the first three pairs is inspected.

    Returns:
        True when the opening pattern matches, False otherwise (including when
        fewer than three tokens are present).
    """
    if len(tags) < 3:
        return False

    first_tag, second_tag, third_tag = (pair[1] for pair in tags[:3])

    # Auxiliaries show up either as verb forms (VB*) or as modals (MD);
    # accepting only VB* misses openers such as "Will you ..." / "Can you ...".
    opens_with_auxiliary = first_tag.startswith(('VB', 'MD'))

    return (
        opens_with_auxiliary
        and second_tag.startswith('PR')
        and third_tag.startswith('VB')
    )

In [None]:
# Sentence openers that mark a comment as a question regardless of its POS tags.
_QUESTION_OPENERS = (
    "What", "When", "Where", "Which", "Who", "Whom", "Whose", "Why", "How",
    "Could", "Should", "Would", "Can",
)

# Three independent signals; a comment is flagged when any one of them fires.
_pos_signal = df_comments.pos_tags.apply(is_question)
_opener_signal = df_comments.comment_text.str.startswith(_QUESTION_OPENERS)
_question_mark_signal = df_comments.comment_text.str.contains("?", regex=False)

is_question_cond = _pos_signal | _opener_signal | _question_mark_signal

In [None]:
# Store the combined question mask on the frame as a boolean `questions` column.
df_comments["questions"] = is_question_cond.astype(bool)

In [None]:
import torch
import transformers

# Common prefix of every Detoxify release asset.
DOWNLOAD_URL = "https://github.com/unitaryai/detoxify/releases/download/"

# Release tag and file name of each checkpoint, relative to DOWNLOAD_URL.
_CHECKPOINT_ASSETS = {
    "original": "v0.1-alpha/toxic_original-c1212f89.ckpt",
    "unbiased": "v0.3-alpha/toxic_debiased-c7548aa0.ckpt",
    "multilingual": "v0.4-alpha/multilingual_debiased-0b549669.ckpt",
    "original-small": "v0.1.2/original-albert-0e1d6498.ckpt",
    "unbiased-small": "v0.1.2/unbiased-albert-c8519128.ckpt",
}

# Model type -> full checkpoint download URL.
MODEL_URLS = {
    model_key: DOWNLOAD_URL + asset_path
    for model_key, asset_path in _CHECKPOINT_ASSETS.items()
}

In [None]:
def get_model_and_tokenizer(
    model_type, model_name, tokenizer_name, num_classes, state_dict, huggingface_config_path=None
):
    """Build a `transformers` model and its tokenizer from class names.

    Args:
        model_type: Identifier used as config/tokenizer source when no local
            config path is supplied.
        model_name: Name of the model class to look up on `transformers`.
        tokenizer_name: Name of the tokenizer class to look up on `transformers`.
        num_classes: Passed to the model as `num_labels`.
        state_dict: Weights handed to the model's `from_pretrained`.
        huggingface_config_path: Optional local path; when truthy it replaces
            `model_type` as the source, and when not None both loaders are
            called with `local_files_only=True`.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    # Resolve once what both loaders share.
    source = huggingface_config_path or model_type
    offline_only = huggingface_config_path is not None

    model_cls = getattr(transformers, model_name)
    model = model_cls.from_pretrained(
        pretrained_model_name_or_path=None,
        config=source,
        num_labels=num_classes,
        state_dict=state_dict,
        local_files_only=offline_only,
    )

    tokenizer_cls = getattr(transformers, tokenizer_name)
    tokenizer = tokenizer_cls.from_pretrained(
        source,
        local_files_only=offline_only,
        # TODO: may be needed to let it work with Kaggle competition
        # model_max_length=512,
    )

    return model, tokenizer

In [None]:
def load_checkpoint(model_type="original", checkpoint=None, device="cpu", huggingface_config_path=None):
    """Load a Detoxify checkpoint and build the model and tokenizer it describes.

    Args:
        model_type: Key into `MODEL_URLS`; only consulted when `checkpoint` is None.
        checkpoint: Path to a local checkpoint file. When None, the checkpoint
            registered for `model_type` is fetched through `torch.hub`.
        device: Passed as `map_location` when the checkpoint is deserialised.
        huggingface_config_path: Forwarded unchanged to `get_model_and_tokenizer`.

    Returns:
        A `(model, tokenizer, class_names)` tuple; `class_names` comes from the
        checkpoint's config with legacy label names mapped to the standardised ones.

    Raises:
        ValueError: If a local checkpoint lacks the "config" or "state_dict" entry.
        KeyError: If `checkpoint` is None and `model_type` is not in `MODEL_URLS`.
    """
    if checkpoint is None:
        checkpoint_path = MODEL_URLS[model_type]
        loaded = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=device)
    else:
        # NOTE(security): torch.load unpickles the file, so only point this at
        # checkpoints from a trusted source.
        loaded = torch.load(checkpoint, map_location=device)
        if "config" not in loaded or "state_dict" not in loaded:
            # Adjacent literals instead of a backslash continuation, which would
            # embed the next line's indentation in the message.
            raise ValueError(
                "Checkpoint needs to contain the config it was trained "
                "with as well as the state dict"
            )
    class_names = loaded["config"]["dataset"]["args"]["classes"]
    # standardise class names between models
    change_names = {
        "toxic": "toxicity",
        "identity_hate": "identity_attack",
        "severe_toxic": "severe_toxicity",
    }
    class_names = [change_names.get(cl, cl) for cl in class_names]
    model, tokenizer = get_model_and_tokenizer(
        **loaded["config"]["arch"]["args"],
        state_dict=loaded["state_dict"],
        huggingface_config_path=huggingface_config_path,
    )

    return model, tokenizer, class_names

In [None]:
# Load with every default: the "original" model type, no local checkpoint
# (so the weights come from MODEL_URLS), mapped onto the CPU.
model, tokenizer, class_names = load_checkpoint()

In [None]:
# Convert the loaded model with Optimum's BetterTransformer and rebind `model` to
# the result; keep_original_model=False asks the transform not to keep the original.
# NOTE(review): re-running this cell feeds the already-converted model back in —
# confirm the transform tolerates that, or restart the kernel before re-running.
model = BetterTransformer.transform(model, keep_original_model=False)

In [None]:
import torch
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df_comments.shape[0]))

# Raw comment strings, in DataFrame row order.
sentences = df_comments.comment_text.values

# Every comment is padded or truncated to exactly this many tokens.
MAX_SEQ_LENGTH = 512

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # Calling the tokenizer will:
    #   (1) Tokenize the sentence.
    #   (2) Add the model's special tokens.
    #   (3) Map tokens to their IDs.
    #   (4) Pad or truncate the sentence to `max_length`.
    #   (5) Create the attention mask that separates real tokens from padding.
    # Padding and truncation are requested explicitly rather than through the
    # legacy `pad_to_max_length` flag and implicit truncation.
    encoded_dict = tokenizer(
        sent,                              # Sentence to encode.
        add_special_tokens=True,           # Add the model's special tokens.
        max_length=MAX_SEQ_LENGTH,         # Target length for pad & truncate.
        padding="max_length",              # Pad short sentences up to max_length.
        truncation=True,                   # Cut long sentences down to max_length.
        return_attention_mask=True,        # Construct attn. masks.
        return_tensors="pt",               # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Set the batch size.
batch_size = 32



In [None]:
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
import pickle
# Pair each row of token ids with its attention mask.
prediction_data = TensorDataset(input_ids, attention_masks)

# Iterate in order so the predictions line up with the DataFrame rows.
prediction_sampler = SequentialSampler(prediction_data)

prediction_dataloader = DataLoader(
    dataset=prediction_data,
    batch_size=batch_size,
    sampler=prediction_sampler,
)

In [None]:
# Prediction on test set
def training_loop(model, dataloader, mixed_precision:str="fp16", seed:int=42, batch_size:int=32,
                  output_path:str="/kaggle/working/preds.pickle"):
    """Run inference over `dataloader` and pickle the per-batch sigmoid scores.

    Despite its name this function performs no training: the model is put in
    evaluation mode and every forward pass runs without gradient tracking.

    Args:
        model: Model called as `model(input_ids, token_type_ids=None,
            attention_mask=mask)`; the first element of its output is used as logits.
        dataloader: Iterable yielding `(input_ids, attention_mask)` batches.
        mixed_precision: Accepted for signature compatibility; not read by the body.
        seed: Accepted for signature compatibility; not read by the body.
        batch_size: Accepted for signature compatibility; not read by the body
            (batching is whatever `dataloader` already does).
        output_path: File the list of per-batch NumPy arrays is pickled to.

    Returns:
        None. The predictions are only written to `output_path`.
    """
    accelerator = Accelerator()

    model = accelerator.prepare(model)

    # Put model on accelerator device
    model = model.to(accelerator.device)

    # Wrap dataloader with accelerator.prepare()
    dataloader = accelerator.prepare(dataloader)

    # Put model in evaluation mode
    model.eval()

    predictions = []

    # Predict
    for b_input_ids, b_input_mask in dataloader:
        # No gradients are needed for prediction; skipping them saves memory
        # and speeds up the forward pass.
        with torch.no_grad():
            outputs_toxic = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask)

        logits_toxic = outputs_toxic[0]

        # Gather the logits, squash them to [0, 1] and keep them as NumPy arrays.
        predictions.append(torch.sigmoid(accelerator.gather(logits_toxic)).cpu().detach().numpy())

    # The context manager guarantees the file is flushed and closed before
    # anything else tries to read it back.
    with open(output_path, "wb") as handle:
        pickle.dump(predictions, handle)
    

In [None]:
%%time
# Positional arguments for training_loop, in order:
# model, dataloader, mixed_precision, seed, batch_size.
args = (model, prediction_dataloader, "fp16", 42, 32)

# Run the prediction loop through Accelerate's notebook launcher in a single process.
notebook_launcher(training_loop, args=args, num_processes=1)

In [None]:
# Read back the per-batch predictions written by training_loop. The file is
# produced by this notebook; pickle.load must never be pointed at untrusted files.
with open("/kaggle/working/preds.pickle", "rb") as handle:
    predictions = pickle.load(handle)

# One row of class scores per comment, in DataFrame row order.
flat_predictions = np.concatenate(predictions, axis=0)
name = "negative"
df_comments[class_names] = flat_predictions

# Discard comments that start with "@" and any row with a missing value.
df_comments = df_comments[~df_comments.comment_text.str.startswith("@")].dropna()

# A comment is "negative" when it is above-median toxic, scores low on obscenity,
# and sits in the top 10% for insult and the top 1% for identity attack.
cond = (
    (df_comments.toxicity > df_comments.toxicity.median())
    & (df_comments.obscene < 0.15)
    & (df_comments.insult > df_comments.insult.quantile(0.9))
    & (df_comments.identity_attack > df_comments.identity_attack.quantile(0.99))
)
df_comments[name] = cond

# Export without the raw class scores. `columns=` replaces the positional axis
# argument, which pandas 2.0 no longer accepts.
df_comments.drop(columns=class_names).reset_index().to_csv("preds.csv")