In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats
import gc
import re
import operator 
import sys
from sklearn import metrics
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score
%load_ext autoreload
%autoreload 2
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
from apex import amp
import shutil
import pandas as pd
import numpy as np

In [16]:
device=torch.device('cpu')

In [2]:
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
EPOCHS = 1
Data_dir="/Volumes/T/Research/jigsaw-toxic-comment-classification-challenge"
Input_dir = "/Volumes/T/Research/input"
WORK_DIR = "/Volumes/T/Research/working/"
num_to_load=12000                         #Train size to match time limit
valid_size= 1500                          #Validation Size
TOXICITY_COLUMN = 'target'

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

I0301 21:02:56.320324 4535795136 file_utils.py:41] PyTorch version 1.4.0 available.


In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", cache_dir=None,do_lower_case=True)

I0301 21:02:58.793411 4535795136 tokenization_utils.py:501] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/gal/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [7]:
train_df = pd.read_csv(os.path.join(Data_dir,"train.csv")).sample(num_to_load+valid_size,random_state=SEED)
print('loaded %d records' % len(train_df))
# Make sure all comment_text values are strings
train_df['comment_text'] = train_df['comment_text'].astype(str) 

loaded 13500 records


In [5]:
# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)

In [9]:
sequences = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)

HBox(children=(IntProgress(value=0, max=13500), HTML(value='')))


1225


In [10]:
train_df=train_df.fillna(0)
y_columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_df = train_df.drop(['comment_text'],axis=1)

In [11]:
train_df.query("toxic == 1").head()

I0301 14:25:29.149832 4525993408 utils.py:141] NumExpr defaulting to 4 threads.


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
9463,191a1c6bd40ad470,1,0,1,0,0,0
117821,756081e7bbf6216e,1,0,0,0,0,0
111499,5479062a0abe121b,1,0,0,0,0,0
79041,d387cd75bf52c12b,1,0,0,0,1,0
109484,4995db47e6514d04,1,0,0,0,0,0


In [12]:
X = sequences[:num_to_load]                
y = train_df[y_columns].values[:num_to_load]
X_val = sequences[num_to_load:]                
y_val = train_df[y_columns].values[num_to_load:]

In [13]:
test_df=train_df.tail(valid_size).copy()
train_df=train_df.head(num_to_load)

In [14]:
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), torch.tensor(y,dtype=torch.float))

In [17]:
lr=2e-5
batch_size = 32
accumulation_steps=2
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",cache_dir=None,num_labels=len(y_columns))
model.zero_grad()
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
train = train_dataset

num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

<torch._C.Generator at 0x122ed7ad0>

I0301 14:26:31.100279 4525993408 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/gal/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
I0301 14:26:31.101588 4525993408 configuration_utils.py:292] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512

In [19]:
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=lr,
#                      warmup=0.05,
#                      t_total=num_train_optimization_steps
                 )

# model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
model = model.train()

tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
    tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    for i,(x_batch, y_batch) in tk0:
#        optimizer.zero_grad()
        y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
        loss =  F.binary_cross_entropy_with_logits(y_pred[0],y_batch.to(device))
#         with amp.scale_loss(loss, optimizer) as scaled_loss:
#             scaled_loss.backward()
        loss.backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[0][:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=375), HTML(value='')))




In [20]:
torch.save(model.state_dict(), "/Volumes/T/Research/jigsaw-toxic-comment-classification-challenge/bert_model.model")

In [34]:
torch.sigmoid(model(torch.tensor(sequences[:10]))[0])

tensor([[0.0294, 0.0160, 0.0173, 0.0182, 0.0193, 0.0199],
        [0.1336, 0.0261, 0.0475, 0.0260, 0.0395, 0.0217],
        [0.0950, 0.0246, 0.0359, 0.0203, 0.0358, 0.0204],
        [0.1342, 0.0274, 0.0438, 0.0242, 0.0341, 0.0219],
        [0.1069, 0.0192, 0.0471, 0.0246, 0.0442, 0.0293],
        [0.1214, 0.0201, 0.0374, 0.0255, 0.0392, 0.0235],
        [0.1254, 0.0221, 0.0377, 0.0177, 0.0367, 0.0199],
        [0.0304, 0.0142, 0.0164, 0.0134, 0.0188, 0.0165],
        [0.6106, 0.0485, 0.1708, 0.0327, 0.1708, 0.0449],
        [0.1658, 0.0196, 0.0541, 0.0179, 0.0631, 0.0259]],
       grad_fn=<SigmoidBackward>)

In [47]:
train_df.iloc[:10]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
12345,20bc45f5014c1eca,0,0,0,0,0,0
55587,948580166b8b8745,0,0,0,0,0,0
16877,2c8519885db5c0bb,0,0,0,0,0,0
140039,ed64d530a34aa973,0,0,0,0,0,0
125777,a0dd897eafbada2c,0,0,0,0,0,0
159512,ff2f5695bb9aa845,0,0,0,0,0,0
121612,8aa46e67645e4b7b,0,0,0,0,0,0
7588,142d9e745d03c9ef,0,0,0,0,0,0
9463,191a1c6bd40ad470,1,0,1,0,0,0
103236,287b2d375ed7188a,0,0,0,0,0,0


In [43]:
idx = np.where(train_df["toxic"] == 1)[:10]

In [44]:
idx

array([  8,  21,  32,  42,  45,  47,  87, 100, 113, 114])

In [45]:
train_df.iloc[idx]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
9463,191a1c6bd40ad470,1,0,1,0,0,0
117821,756081e7bbf6216e,1,0,0,0,0,0
111499,5479062a0abe121b,1,0,0,0,0,0
79041,d387cd75bf52c12b,1,0,0,0,1,0
109484,4995db47e6514d04,1,0,0,0,0,0
33891,5a66ffe0f912b06c,1,0,0,0,0,0
88581,ecf6701bdac3352e,1,0,1,0,1,0
28455,4b5547ff5014eb02,1,0,0,0,0,0
59578,9f8b36a0b661dae7,1,0,1,0,1,0
73304,c4218f9d6b3d4c47,1,0,1,0,1,0


In [46]:
torch.sigmoid(model(torch.tensor(sequences[idx]))[0])

tensor([[0.5461, 0.0567, 0.2555, 0.0468, 0.2135, 0.0399],
        [0.1323, 0.0235, 0.0506, 0.0261, 0.0424, 0.0230],
        [0.1317, 0.0224, 0.0439, 0.0244, 0.0447, 0.0203],
        [0.1724, 0.0215, 0.0482, 0.0303, 0.0505, 0.0235],
        [0.2026, 0.0223, 0.0473, 0.0207, 0.0522, 0.0208],
        [0.1375, 0.0217, 0.0456, 0.0252, 0.0404, 0.0201],
        [0.7897, 0.0943, 0.4186, 0.0803, 0.4548, 0.0911],
        [0.1198, 0.0209, 0.0605, 0.0231, 0.0465, 0.0222],
        [0.7657, 0.0990, 0.4380, 0.0560, 0.4226, 0.0851],
        [0.5396, 0.0469, 0.1876, 0.0386, 0.1947, 0.0436]],
       grad_fn=<SigmoidBackward>)

In [6]:
founta_df = pd.read_csv("/Users/gal/Dropbox (Irit Gat Viks)/gal/classes/causal_inference/project/abusive_dataset/hatespeech_features_fixed.csv", index_col=0)
founta_labels_df = pd.read_csv("/Users/gal/Dropbox (Irit Gat Viks)/gal/classes/causal_inference/project/abusive_dataset/hatespeech_labels.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
founta_df.head()

Unnamed: 0_level_0,text,hashtags,symbols,urls,mentions,created_at,timestamp,favorites,retweets,is_quote_status,...,user_lists,user_timestamp,user_favourites,user_statuses,user_lang,user_is_translator,user_geo_enabled,user_location,user_verified,sampling_method
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
849667487180259329,RT @Youngvickmane3: Horse face hoe stop playin...,0,0.0,0,1,Wed Apr 05 16:58:09 +0000 2017,1491411489659,0,0,False,...,0,Thu Mar 18 23:12:57 +0000 2010,33.0,30,en,False,False,,False,Boosted
850490912954351616,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,1,0.0,1,0,Fri Apr 07 23:30:09 +0000 2017,1491607809662,0,0,True,...,13,Sun Sep 13 17:59:48 +0000 2015,26453.0,276839,en,False,True,,False,Boosted
848791766853668864,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",0,0.0,1,1,Mon Apr 03 06:58:21 +0000 2017,1491202701663,0,0,True,...,0,Thu Oct 28 21:43:26 +0000 2010,886.0,4470,en,False,False,United States,False,Boosted
848306464892604416,RT @JakellDaGOAT: Not being homophobic here......,0,0.0,1,1,Sat Apr 01 22:49:56 +0000 2017,1491086996659,0,0,True,...,12,Tue Aug 05 19:46:15 +0000 2014,19703.0,54616,en,False,True,ATL HOE,False,Boosted
850010509969465344,RT @MailOnline: The Nazi death gas so horrific...,0,0.0,1,1,Thu Apr 06 15:41:12 +0000 2017,1491493272665,0,0,False,...,6,Thu Apr 24 10:57:21 +0000 2014,3701.0,6549,en,False,False,Zimbabwe,False,Boosted


In [7]:
founta_sequences = convert_lines(founta_df["text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)

HBox(children=(IntProgress(value=0, max=459427), HTML(value='')))


0


In [8]:
import torch.nn
state_dict = torch.load("/Volumes/T/Research/jigsaw-toxic-comment-classification-challenge/bert_model.model")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",cache_dir=None,num_labels=6)
model.load_state_dict(state_dict=state_dict)

I0301 21:08:42.719597 4535795136 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/gal/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
I0301 21:08:42.720719 4535795136 configuration_utils.py:292] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512

<All keys matched successfully>

In [None]:
SIZE = 10000
founta_pred = torch.sigmoid(model(torch.tensor(founta_sequences[:SIZE]))[0])

In [None]:
founta_pred_s = pd.DataFrame(founta_pred, index=founta_df.index[:SIZE])
joint = founta_labels_df.join(founta_pred_s)
joint

In [None]:
joint.groupby("label").describe()