In [2]:
import torch
import pandas as pd
import numpy as np
# One shared seed for NumPy and PyTorch so reruns start from the same RNG state.
SEED = 10
np.random.seed(SEED)
# Last expression of the cell: manual_seed returns the torch Generator, which the notebook echoes.
torch.manual_seed(SEED)

<torch._C.Generator at 0x29749ef8c50>

In [3]:
# Report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

# Prepare Data

In [6]:
# Load the sentences to classify. This is the same file that params['data']['path_to_dataset']
# points at; the predicted labels are later written into this frame.
combined_texts = pd.read_csv("../DataLabeling/combined_texts.csv", encoding='utf-8')

In [7]:
# Preview the first rows of the loaded frame.
combined_texts.head()

Unnamed: 0,sentence,source
0,The son of a Louisiana man whose father was sh...,news
1,"Cameron Sterling, the son of Alton Sterling, w...",news
2,Alton Sterling was killed by Baton Rouge polic...,news
3,Baton Rouge police said in a statement that po...,news
4,The press conference on Wednesday is Cameron's...,news


In [8]:
# Sentence-type name -> integer class id.
# The id of each name is its position in the list below (0..12), so the list
# order must not be changed without retraining the classifier that emits these ids.
SENT_TYPES = {
    type_name: class_id
    for class_id, type_name in enumerate([
        'present_continuous',
        'to_be_future',
        'past_continuous',
        'past_simple',
        'can',
        'future_simple',
        'to_be_present',
        'used_to',
        'to_be_past',
        'present_simple',
        'other',
        'present_perfect',
        'could',
    ])
}

In [47]:
# Single configuration object read by the data-preparation, model and inference cells.
params = dict(
    model=dict(
        # every sentence is padded/truncated to this many tokens
        max_seq_length=128,
        # name handed to the Hugging Face Auto* loaders
        model_name='distilbert-base-uncased',
        # width of the classifier output; SENT_TYPES defines 13 class ids
        num_classes=13,
    ),
    data=dict(
        text_field_name='sentence',
        # only referenced from a commented-out line in the data-preparation code
        label_field_name='type',
        path_to_dataset='../DataLabeling/combined_texts.csv',
        # where the raw predicted scores are dumped with np.savetxt
        path_to_test_pred_scores='data/pred_present_simple.txt',
    ),
    training=dict(
        learn_rate=1e-5,
        num_epochs=3,
        accum_steps=2,
        # also used as the inference DataLoader batch size
        batch_size=64,
        # the best checkpoint is resumed from <log_dir>/checkpoints/best.pth
        log_dir='logdir',
    ),
)

In [83]:
def prepare_infer_data(params):
    """Build the inference loaders for the dataset CSV named in ``params``.

    Reads ``params['data']['path_to_dataset']``, writes an unmodified copy of it
    to ``test.csv`` in the working directory, and wraps the text column
    (``params['data']['text_field_name']``) in a ``TextInferDataset``.

    Args:
        params: nested configuration dict with ``data``, ``model`` and
            ``training`` sections.

    Returns:
        dict with a single key ``"test"`` holding a non-shuffling DataLoader,
        so batches follow the row order of the CSV.
    """
    data_cfg = params['data']

    frame = pd.read_csv(data_cfg['path_to_dataset'], encoding='utf-8')
    # keep a copy of exactly what is being fed to the model
    frame.to_csv('test.csv', encoding='utf-8', index=False)

    sentences = frame[data_cfg["text_field_name"]].values.tolist()
    model_cfg = params["model"]
    infer_dataset = TextInferDataset(
        texts=sentences,
        max_seq_length=model_cfg["max_seq_length"],
        model_name=model_cfg["model_name"],
    )

    set_global_seed(SEED)

    infer_loader = DataLoader(
        dataset=infer_dataset,
        batch_size=params["training"]["batch_size"],
        shuffle=False,
    )
    return {"test": infer_loader}

In [84]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from catalyst.utils import set_global_seed
import logging
from transformers import AutoConfig, AutoModel
import torch.nn as nn

In [85]:
class TextInferDataset(Dataset):
    """Label-free text dataset that tokenizes one text per item for inference.

    Each item is the tokenizer output for a single text, padded/truncated to
    ``max_seq_length``, with the token ids stored under ``"features"`` (the key
    name the Catalyst runner in this notebook reads) instead of ``"input_ids"``.
    """

    def __init__(
        self,
        texts,
        max_seq_length = 128,
        model_name = "distilbert-base-uncased",
    ):
        """
        Args:
            texts: indexable collection of raw text strings
            max_seq_length: number of tokens every item is padded/truncated to
            model_name: name passed to ``AutoTokenizer.from_pretrained``
        """
        self.texts = texts
        self.max_seq_length = max_seq_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

        # Special-token ids, taken from the tokenizer's own attributes so they
        # resolve for whatever `model_name` is loaded instead of assuming the
        # BERT-style "[SEP]"/"[CLS]"/"[PAD]" vocabulary entries exist.
        # They are stored for callers; this class does not read them itself.
        self.sep_vid = self.tokenizer.sep_token_id
        self.cls_vid = self.tokenizer.cls_token_id
        self.pad_vid = self.tokenizer.pad_token_id

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at ``index``.

        Returns:
            The tokenizer output with two 1-D tensors of length
            ``max_seq_length``: ``"features"`` (token ids) and
            ``"attention_mask"``.
        """
        # encoding the text
        x = self.texts[index]

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict = self.tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_seq_length,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )

        # for Catalyst, there needs to be a key called features
        output_dict["features"] = output_dict["input_ids"].squeeze(0)
        del output_dict["input_ids"]

        # The tensors come back with a leading axis of size 1. Drop it from the
        # mask as well, so a collated batch is (bs, seq_len) for both tensors
        # rather than (bs, seq_len) ids next to a (bs, 1, seq_len) mask.
        output_dict["attention_mask"] = output_dict["attention_mask"].squeeze(0)

        return output_dict

# Run inference

In [86]:
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import (
    CheckpointCallback,
    InferCallback,
)
from catalyst.utils import prepare_cudnn, set_global_seed

In [87]:
# Build the {"test": DataLoader} dict that is passed to runner.infer as `loaders`.
test_loaders = prepare_infer_data(params)

In [40]:
# iter(test_loaders['test']).next()

In [88]:
class BertForSequenceClassification(nn.Module):
    """
    Pretrained encoder followed by a two-layer classification head.

    Pared-down take on the HuggingFace class of the same name;
    see transformers/modeling_distilbert.py in the transformers repository.
    The head reads ``config.dim`` and ``config.seq_classif_dropout``, so the
    loaded config has to provide both attributes.
    """

    def __init__(
        self, pretrained_model_name, num_classes = None, dropout = 0.3
    ):
        """
        Args:
            pretrained_model_name: name passed to ``AutoConfig`` / ``AutoModel``
            num_classes: number of output scores per sequence
            dropout: accepted for interface compatibility but not used; the
                dropout rate is taken from ``config.seq_classif_dropout``
        """
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_classes
        )
        self.model = AutoModel.from_pretrained(
            pretrained_model_name, config=encoder_config
        )

        # classification head: dim -> dim -> num_classes
        hidden_dim = encoder_config.dim
        self.pre_classifier = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(encoder_config.seq_classif_dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute unnormalized class scores (logits) for the input sequences.

        Args:
            features (torch.Tensor): ids of each token, size [bs, seq_length]
            attention_mask (torch.Tensor): binary tensor selecting the tokens
                that take part in self-attention, size [bs, seq_length];
                required despite the ``None`` default
            head_mask (torch.Tensor): 1.0 keeps a head, size [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            torch.Tensor of size [bs, num_classes] with the class logits
            (no softmax is applied)
        """
        assert attention_mask is not None, "attention mask is none"

        encoder_outputs = self.model(
            input_ids=features, attention_mask=attention_mask, head_mask=head_mask
        )

        # element 0 is the hidden state (bs, seq_len, dim); the vector at the
        # first token position stands in for the whole sequence -> (bs, dim)
        first_token_state = encoder_outputs[0][:, 0]

        projected = nn.functional.relu(self.pre_classifier(first_token_state))  # (bs, dim)
        return self.classifier(self.dropout(projected))  # (bs, num_classes)

In [89]:
# Release cached, unused GPU memory before building the model.
torch.cuda.empty_cache()
# Build the classifier from the architecture settings in `params`. The fine-tuned weights
# come from the checkpoint resumed below (see the "Loading checkpoint" line in the cell output).
model = BertForSequenceClassification(
    pretrained_model_name=params["model"]["model_name"],
    num_classes=params["model"]["num_classes"],
)
# The runner reads the "features" and "attention_mask" entries of each batch as model inputs.
# NOTE(review): presumably passed as keyword arguments matching forward() - confirm in the Catalyst docs.
runner = SupervisedRunner(input_key=("features", "attention_mask"))

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    runner.infer(
        model=model,
        loaders=test_loaders,
        callbacks=[
            # Resume from the best checkpoint stored under the training log dir.
            CheckpointCallback(
                resume=f"{params['training']['log_dir']}/checkpoints/best.pth"
            ),
            # NOTE(review): appears to collect the model outputs; the next cell reads them via
            # runner.callbacks[0].predictions["logits"] - confirm the callback ordering.
            InferCallback(),
        ],
        verbose=True,
    )

=> Loading checkpoint logdir/checkpoints/best.pth
loaded state checkpoint logdir/checkpoints/best.pth (global epoch 3, epoch 3, stage train)
1/1 * Epoch (test): 100% 220/220 [01:47<00:00,  2.05it/s]


In [90]:
# Take the logits gathered during runner.infer and dump them as text to the path set in
# params['data']['path_to_test_pred_scores'] (its directory has to exist already).
predicted_scores = runner.callbacks[0].predictions["logits"]
np.savetxt(params["data"]["path_to_test_pred_scores"], predicted_scores)

In [91]:
# Number of score rows; probs_to_prediction needs this to equal the number of rows in combined_texts.
len(predicted_scores)

14075

In [92]:
# Add an empty string column that will hold the predicted sentence type.
combined_texts['type'] = ''

In [93]:
# Scratch check: write a placeholder into the 'type' cell of the first row, then preview.
# A single .iloc call with a (row, column) position pair writes into `combined_texts` itself.
# The chained form `combined_texts['type'].iloc[0] = ...` assigns through an intermediate
# Series, which triggers SettingWithCopyWarning and leaves the frame unchanged when pandas
# Copy-on-Write is enabled.
combined_texts.iloc[0, combined_texts.columns.get_loc('type')] = 'some'
combined_texts.head()

Unnamed: 0,sentence,source,type
0,The son of a Louisiana man whose father was sh...,news,some
1,"Cameron Sterling, the son of Alton Sterling, w...",news,
2,Alton Sterling was killed by Baton Rouge polic...,news,
3,Baton Rouge police said in a statement that po...,news,
4,The press conference on Wednesday is Cameron's...,news,


In [94]:
def probs_to_prediction(dataframe, predictions, SENT_TYPES=SENT_TYPES):
    """Label each row of ``dataframe`` with its highest-scoring sentence type and save it.

    Args:
        dataframe: pandas DataFrame with one row per entry of ``predictions``;
            its 'type' column is created or overwritten in place.
        predictions: sequence of per-row score vectors, one score per class id.
        SENT_TYPES: mapping of type name -> class id, used to turn the argmax
            position of each score vector back into a type name.

    Returns:
        None. The labelled frame is written to 'predicted_dataframe.csv'
        (UTF-8, without the index) in the current working directory.

    Raises:
        ValueError: if ``dataframe`` and ``predictions`` differ in length, so a
            mismatch cannot silently leave a stale or missing output file.
        KeyError: if an argmax position is not a class id in ``SENT_TYPES``.
    """
    if len(dataframe) != len(predictions):
        raise ValueError(
            f"dataframe has {len(dataframe)} rows but predictions has "
            f"{len(predictions)} entries"
        )

    # Invert using the stored class ids (not the dict's iteration position), so the
    # lookup stays correct even when the mapping is not listed in id order.
    id_to_type = {class_id: type_name for type_name, class_id in SENT_TYPES.items()}

    labels = [id_to_type[int(np.argmax(scores))] for scores in predictions]

    # Assign the whole column at once: this writes into the frame itself, whereas
    # per-row `dataframe['type'].iloc[i] = ...` goes through a temporary Series.
    dataframe['type'] = labels
    dataframe.to_csv('predicted_dataframe.csv', encoding='utf-8', index=False)

In [95]:
# Fill combined_texts['type'] from the predicted scores; the call also writes predicted_dataframe.csv.
probs_to_prediction(combined_texts, predicted_scores)

# Look at predicted data

In [96]:
# Reload the saved predictions, keep the rows labelled present_simple, and save that subset.
pred_data = pd.read_csv("predicted_dataframe.csv", encoding='utf-8')
predicted_present_simple = pred_data.loc[pred_data['type'].eq('present_simple')]
predicted_present_simple.to_csv(
    'predicted_present_simple.csv',
    encoding='utf-8',
    index=False,
)

In [97]:
# Number of sentences whose predicted type is present_simple.
len(predicted_present_simple)

843