In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from typing import *

import torch
import torch.optim as optim

In [2]:
from fastai import *
from fastai.vision import *
from fastai.text import *
from fastai.callbacks import *

In [3]:
%%bash
pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2


You are using pip version 19.0.3, however version 20.2.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [4]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

config = Config(
    testing=False,
    bert_model_name="bert-base-uncased",
    max_lr=3e-5,
    epochs=4,
    use_fp16=True,
    bs=8,
    discriminative=False,
    max_seq_len=256,
)

In [5]:
from pytorch_pretrained_bert import BertTokenizer
bert_tok = BertTokenizer.from_pretrained(
    config.bert_model_name,
)

100%|██████████| 231508/231508 [00:00<00:00, 2571216.93B/s]


In [6]:
def _join_texts(texts:Collection[str], mark_fields:bool=False, sos_token:Optional[str]=BOS):
    """Borrowed from fast.ai source"""
    if not isinstance(texts, np.ndarray): texts = np.array(texts)
    if is1d(texts): texts = texts[:,None]
    df = pd.DataFrame({i:texts[:,i] for i in range(texts.shape[1])})
    text_col = f'{FLD} {1} ' + df[0].astype(str) if mark_fields else df[0].astype(str)
    if sos_token is not None: text_col = f"{sos_token} " + text_col
    for i in range(1,len(df.columns)):
        #text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i]
        text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i].astype(str)
    return text_col.values

In [7]:
class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length"""
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["[SEP]"]

In [8]:
from sklearn.model_selection import train_test_split

import pandas as pd
train, test = [pd.read_csv('/kaggle/input/sentiment-actual-detection/'+ fname) for fname in ["train.csv", "val.csv"]]
val = test # we won't be using a validation set but you can easily create one using train_test_split
train = train.dropna()
test = test.dropna()
val = val.dropna()

In [9]:
if config.testing:
    train = train.head(1024)
    val = val.head(1024)
    test = test.head(1024)

In [10]:
fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))

In [11]:
fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), pre_rules=[], post_rules=[])

In [12]:
label_cols = ['tyre agedot code positive', 'advisoragent service negative', 'discount not applied positive', 
                                     'discount not applied negative', 'response time negative', 'wait time negative', 
                                     'mobile fitter negative', 'refund timescale negative', 'length of fitting positive', 
                                     'no stock negative', 'refund not actioned positive', 'booking confusion positive', 
                                     'delivery punctuality negative', 'tyre age/dot code negative', 'no stock positive', 
                                     "mobile fitter didn't arrive negative", 'facilities negative', 'value for money negative', 
                                     'discounts negative', 'change of time positive', 'failed payment positive', 
                                     'incorrect tyres sent positive', 'refund timescale positive', 'failed payment negative',
                                     'call wait time positive', 'wait time positive', 'damage negative', 'tyre quality positive', 
                                     'ease of booking positive', 'change of time negative', "mobile fitter didn't arrive positive",
                                     'change of date negative', 'advisor/agent service negative', 'extra charges negative', 
                                     'location positive', 'late notice negative', 'discounts positive', 'garage service negative', 
                                     'tyre quality negative', 'response time positive', 'booking confusion negative',
                                     'delivery punctuality positive', 'advisoragent service positive', 'refund positive', 
                                     'refund negative', 'ease of booking negative', 'garage service positive', 'location negative', 
                                     'balancing negative', 'facilities positive', 'call wait time negative', 'advisor/agent service positive',
                                     'tyre agedot code negative', 'mobile fitter positive', 'late notice positive', 
                                     'incorrect tyres sent negative', 'value for money positive', 'extra charges positive', 
                                     'balancing positive', 'length of fitting negative', 'refund not actioned negative', 'change of date positive']

# databunch = TextDataBunch.from_df(".", train, val, test,
#                   tokenizer=fastai_tokenizer,
#                   vocab=fastai_bert_vocab,
#                   include_bos=False,
#                   include_eos=False,
#                   text_cols="comment_text",
#                   label_cols=label_cols,
#                   bs=config.bs,
#                   collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
#              )

Alternatively, we can pass our own list of Preprocessors to the databunch (this is effectively what is happening behind the scenes)

In [13]:
class BertTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
        super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class BertNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=Vocab(list(bert_tok.vocab.keys())), **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for BERT
    We remove sos/eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original BERT model.
    """
    return [BertTokenizeProcessor(tokenizer=tokenizer),
            NumericalizeProcessor(vocab=vocab)]

In [14]:
class BertDataBunch(TextDataBunch):
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
                tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
                label_cols:IntsOrStrs=0, label_delim:str=None, **kwargs) -> DataBunch:
        "Create a `TextDataBunch` from DataFrames."
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                        TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes)
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

In [15]:
# this will produce a virtually identical databunch to the code above
databunch = BertDataBunch.from_df(".", train, val, test,
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  text_cols="text",
                  label_cols=label_cols,
                  bs=config.bs,
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
             )

# Model

In [16]:
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=62)

100%|██████████| 407873900/407873900 [00:09<00:00, 43058459.54B/s]


In [17]:
loss_func = nn.BCEWithLogitsLoss()

In [18]:
from fastai.callbacks import *

learner = Learner(
    databunch, bert_model,
    loss_func=loss_func,
)
if config.use_fp16: learner = learner.to_fp16()

In [19]:
#learner.lr_find()

In [20]:
learner.recorder.plot()

AttributeError: 'Learner' object has no attribute 'recorder'

In [21]:
learner.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,time
0,0.082287,0.085299,01:33
1,0.061824,0.067664,01:34
2,0.048695,0.06222,01:35
3,0.048452,0.061246,01:32


In [22]:
learner.validate()

[0.06124604]

In [23]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    """
    the get_preds method does not yield the elements in order by default
    we borrow the code from the RNNLearner to resort the elements into their correct order
    """
    preds = learner.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in databunch.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    return preds[reverse_sampler, :]

In [None]:
test_preds = get_preds_as_nparray(DatasetType.Test)
print(test_preds.shape)

In [None]:
val_labels = val[label_cols]
val_label_array = np.zeros((0,62))
for (key,value) in val_labels.iterrows():
    
    val_array = np.expand_dims(np.array(list(value)),0)
    val_label_array = np.append(val_label_array,val_array, axis = 0)

print(val_label_array.shape)
error = 0
for index in range(val_label_array.shape[0]):
    mse = sum((test_preds[index,:]-val_label_array[index,:])**2)/ sum(val_label_array[index,:]**2)
    #print(mse)
    error +=mse
print('The error in predictions:{}'.format(error/val_label_array.shape[0]))

In [None]:
sample_submission = pd.read_csv(DATA_ROOT / "sample_submission.csv")
if config.testing: sample_submission = sample_submission.head(test.shape[0])
sample_submission[label_cols] = test_preds
sample_submission.to_csv("predictions.csv", index=False)