In [None]:
import tempfile
import warnings
from pathlib import Path
from typing import Dict, Iterable, List, Tuple, TypeVar

import numpy as np
import pandas as pd
import torch
from allennlp.common.util import JsonDict
from allennlp.data import (
    DataLoader,
    DatasetReader,
    Instance,
    TextFieldTensors,
    Vocabulary,
)
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import Field, LabelField, TextField
from allennlp.data.token_indexers import (
    PretrainedTransformerIndexer,
    SingleIdTokenIndexer,
    TokenIndexer,
)
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import (
    PretrainedTransformerTokenizer,
)
from allennlp.models import Model
from allennlp.modules import Seq2VecEncoder, TextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder, BertPooler
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.token_embedders.pretrained_transformer_embedder import (
    PretrainedTransformerEmbedder,
)
from allennlp.nn import util
from allennlp.predictors import Predictor
from allennlp.training.gradient_descent_trainer import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.optimizers import HuggingfaceAdamWOptimizer
from allennlp.training.trainer import Trainer
from allennlp.training.util import evaluate


# There's a warning when you call `forward_on_instances` that you don't need
# to worry about right now, so we silence it.
# Note: this filter is global, so every other warning category is hidden too.
warnings.filterwarnings("ignore")

# Alias used only in type annotations for arguments that are pandas DataFrames.
# (A TypeVar named after the class would be an unconstrained type variable,
# i.e. it would accept any type rather than meaning "DataFrame".)
PandasDataFrameType = pd.DataFrame

# Directory containing the Excel workbooks read below, relative to the
# notebook's working directory.
input_data_path = Path("../../../../data")

In [None]:
# Load the raw input workbook into `df` and preview seven randomly chosen rows.
df = pd.read_excel(input_data_path / "input.xlsx")
df.sample(7)

In [None]:
# Boolean mask over `df`: True where the `texts` column contains the literal
# substring "amazon" (case-insensitive, no regex interpretation).
with_amazon = df['texts'].str.contains("amazon", case=False, regex=False)

In [None]:
# Rows whose text mentions "amazon"; the trailing expression shows (rows, columns).
df_with_amazon = df[with_amazon]
df_with_amazon.shape

In [None]:
# Complement of the mask above: rows whose text does not mention "amazon".
df_without_amazon = df[~with_amazon]
df_without_amazon.shape

In [None]:
def focus_on_keyword(text, keyword, size=5):
    """Extract a window of words around every occurrence of ``keyword``.

    The text is split on whitespace. A word matches when it contains
    ``keyword`` as a case-insensitive substring. Each match contributes one
    snippet line, terminated by a newline.

    Args:
        text: The string to search.
        keyword: Substring to look for inside each whitespace-separated word.
        size: Number of words to keep on each side of a match.

    Returns:
        All snippets concatenated, one per line; ``""`` when nothing matches.
    """
    # Split once and lowercase the keyword once instead of on every iteration.
    words = text.split()
    needle = keyword.lower()

    snippets = []
    for position, word in enumerate(words):
        if needle in word.lower():
            start = max(0, position - size)
            # For matches near the start of the text the window end is pushed
            # out to ``2 * size + 1`` so the snippet keeps its full width.
            end = max(size * 2 + 1, position + size + 1)
            snippets.append(" ".join(words[start:end]) + "\n")
    return "".join(snippets)

In [None]:
# Add a `snippets` column holding the keyword-centred excerpts of each text.
# `.assign` returns a new frame, so nothing is written into a filtered view of
# `df`, and re-running the cell simply recomputes the column. Mapping over the
# `texts` Series avoids building a row object for every record.
df_with_amazon = df_with_amazon.assign(
    snippets=df_with_amazon['texts'].map(lambda text: focus_on_keyword(text, "amazon"))
)
df_with_amazon.sample(7)

In [None]:
# Load the hand-assigned labels. The first column of each workbook becomes the
# index (index_col=0); the merges below join on that index.
# NOTE(review): presumably this index matches the row index of `df` — confirm.
df_train_validate_labels = pd.read_excel(input_data_path / "train-validate-labels.xlsx", index_col=0)
df_test_labels = pd.read_excel(input_data_path / "test-labels.xlsx", index_col=0)

In [None]:
# Translates the numeric codes in the `label` column into readable class names.
label_mapping = {1: "company", 2: "not company"}

In [None]:
# Attach the text columns to every labelled train/validate row (left join on index).
# Labelled rows with no match in `df_with_amazon` keep NaN in the joined columns.
df_train_validate = pd.merge(df_train_validate_labels, df_with_amazon, how='left', left_index=True, right_index=True)
# Insert the readable label as the second column; codes missing from
# `label_mapping` map to NaN.
df_train_validate.insert(1, "mapped_label", df_train_validate["label"].map(label_mapping))
print(df_train_validate.shape)
df_train_validate.sample(7)

In [None]:
# Same preparation as the train/validate frame, applied to the held-out test labels:
# left join on index, then insert the readable label as the second column.
df_test = pd.merge(df_test_labels, df_with_amazon, how='left', left_index=True, right_index=True)
df_test.insert(1, "mapped_label", df_test["label"].map(label_mapping))
print(df_test.shape)
df_test.sample(7)

In [None]:
# Index labels of every row that already carries a manual label
# (train/validate first, then test).
seen_indices = [*df_train_validate.index.values, *df_test.index.values]
# Everything mentioning "amazon" that has not been labelled yet.
df_unseen = df_with_amazon.loc[
    ~df_with_amazon.index.isin(seen_indices)
]
print(df_unseen.shape)
df_unseen.sample(7)

In [None]:
# Training split: a random 80% of the labelled rows; random_state fixes the draw.
df_train = df_train_validate.sample(frac=0.8, random_state=42)
print(df_train.shape)
df_train.sample(7)

In [None]:
# Validation split: every labelled row that was not drawn into `df_train`.
# Dropping by index keeps the original row order and avoids indexing `.loc`
# with a set, which pandas >= 2.0 rejects with a TypeError.
df_validate = df_train_validate.drop(index=df_train.index)
print(df_validate.shape)
df_validate.sample(7)

In [None]:
# exist_ok=True lets this cell be re-run in the same kernel without the
# registry complaining that the name is already taken.
@DatasetReader.register('classification-df', exist_ok=True)
class ClassificationDfReader(DatasetReader):
    """Dataset reader that builds classification instances from a pandas DataFrame.

    Instead of a file path, ``_read`` receives a DataFrame and yields one
    instance per row with a ``text`` field and a ``label`` field.

    Args:
        text_header: Name of the column holding the text to classify.
        label_header: Name of the column holding the label.
        tokenizer: Tokenizer applied to each text. Defaults to whitespace
            tokenization when omitted.
        token_indexers: Indexers attached to the text field. Defaults to a
            single-id indexer under the ``"tokens"`` key when omitted.
        max_tokens: If truthy, token sequences are cut to this many tokens.
        **kwargs: Forwarded unchanged to ``DatasetReader.__init__``.
    """

    def __init__(
        self,
        text_header: str,
        label_header: str,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        max_tokens: int = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.text_header = text_header
        self.label_header = label_header
        # Fall back to sensible defaults so a reader built without these
        # arguments does not fail with AttributeError on first use.
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def _make_text_field(self, text: str) -> TextField:
        """Tokenize ``text``, apply the optional length cap and wrap it in a TextField."""
        tokens = self.tokenizer.tokenize(text)
        if self.max_tokens:
            # Plain slice from the front.
            # NOTE(review): if the tokenizer appends a closing special token,
            # truncation here may drop it — confirm this is acceptable.
            tokens = tokens[: self.max_tokens]
        return TextField(tokens, self.token_indexers)

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        """Build an instance from raw text; the label field is added only for a truthy label."""
        fields: Dict[str, Field] = {'text': self._make_text_field(text)}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

    def _read(self, dataframe: PandasDataFrameType) -> Iterable[Instance]:
        """Yield one labelled instance per row of ``dataframe``.

        Columns are read by attribute access on ``itertuples`` rows, so
        ``text_header`` and ``label_header`` must be valid Python identifiers
        (pandas renames other column names to positional ones).
        """
        for row in dataframe.itertuples():
            text = getattr(row, self.text_header)
            label = getattr(row, self.label_header)
            fields: Dict[str, Field] = {
                "text": self._make_text_field(text),
                "label": LabelField(label),
            }
            yield Instance(fields)


In [None]:
class SimpleClassifier(Model):
    """Text classifier: embed tokens, pool them into one vector, apply a linear layer.

    The number of output classes is the size of the vocabulary's ``"labels"``
    namespace. Accuracy is tracked whenever gold labels are supplied.
    """

    def __init__(
        self, vocab: Vocabulary, embedder: TextFieldEmbedder, encoder: Seq2VecEncoder
    ):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        # Projection from the pooled encoding to one logit per label.
        self.classifier = torch.nn.Linear(
            encoder.get_output_dim(), vocab.get_vocab_size("labels")
        )
        self.accuracy = CategoricalAccuracy()

    def forward(
        self,
        text: TextFieldTensors,
        label: torch.Tensor = None
    ) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Returns a dict with ``'probs'`` (softmax over the last dimension of
        the logits) and, when ``label`` is given, ``'loss'`` (cross-entropy
        between logits and labels).
        """
        token_embeddings = self.embedder(text)
        token_mask = util.get_text_field_mask(text)
        pooled = self.encoder(token_embeddings, token_mask)
        logits = self.classifier(pooled)

        output = {'probs': torch.nn.functional.softmax(logits, dim=-1)}
        if label is None:
            # Prediction mode: no metric update, no loss.
            return output

        self.accuracy(logits, label)
        output['loss'] = torch.nn.functional.cross_entropy(logits, label)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Report the running accuracy, optionally resetting the accumulator."""
        accuracy_value = self.accuracy.get_metric(reset)
        return {"accuracy": accuracy_value}

In [None]:
def build_dataset_reader(*args, **kwargs) -> DatasetReader:
    """Create a ``ClassificationDfReader``, forwarding every argument unchanged."""
    reader = ClassificationDfReader(*args, **kwargs)
    return reader

In [None]:
def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    """Build a vocabulary from the given instances, announcing the step on stdout."""
    print("Building the vocabulary")
    vocabulary = Vocabulary.from_instances(instances)
    return vocabulary

In [None]:
def build_model(vocab: Vocabulary, model_name: str = "bert-base-uncased") -> Model:
    """Assemble a ``SimpleClassifier`` on top of a pretrained transformer.

    Args:
        vocab: Vocabulary passed to the classifier.
        model_name: Name of the pretrained transformer used for both the token
            embedder and the pooler, so the two cannot drift apart.

    Returns:
        The assembled classifier.
    """
    print("Building the model")
    # The embedder key must match the key used for the token indexers
    # ("bert" in this notebook's training call).
    embedder = BasicTextFieldEmbedder(
        {"bert": PretrainedTransformerEmbedder(model_name=model_name)}
    )
    encoder = BertPooler(pretrained_model=model_name)
    return SimpleClassifier(vocab, embedder, encoder)

In [None]:
def build_data_loaders(
    train_data: List[Instance],
    dev_data: List[Instance],
    batch_size: int = 8,
) -> Tuple[DataLoader, DataLoader]:
    """Wrap the training and validation instances in data loaders.

    Args:
        train_data: Training instances; batches are shuffled.
        dev_data: Validation instances; order is preserved.
        batch_size: Number of instances per batch for both loaders.

    Returns:
        ``(train_loader, dev_loader)``.
    """
    train_loader = SimpleDataLoader(train_data, batch_size, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, batch_size, shuffle=False)
    return train_loader, dev_loader

In [None]:
def build_trainer(
    model: Model,
    serialization_dir: str,
    train_loader: DataLoader,
    dev_loader: DataLoader,
) -> Trainer:
    """Create a gradient-descent trainer that runs for five epochs.

    Only parameters with ``requires_grad`` set are handed to the AdamW
    optimizer, which is constructed with its default settings.
    """
    trainable = [
        (name, param)
        for name, param in model.named_parameters()
        if param.requires_grad
    ]
    adamw = HuggingfaceAdamWOptimizer(trainable)  # type: ignore
    return GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=5,
        optimizer=adamw,
    )

In [None]:
def read_data(
    reader: DatasetReader,
    train_df: pd.DataFrame = None,
    validate_df: pd.DataFrame = None,
) -> Tuple[List[Instance], List[Instance]]:
    """Read training and validation instances with ``reader``.

    Args:
        reader: Dataset reader whose ``read`` accepts a DataFrame.
        train_df: Training rows. Defaults to the notebook-level ``df_train``.
        validate_df: Validation rows. Defaults to the notebook-level
            ``df_validate``.

    Returns:
        ``(training_instances, validation_instances)`` as lists.
    """
    # Fall back to the notebook globals so existing single-argument calls
    # behave exactly as before.
    if train_df is None:
        train_df = df_train
    if validate_df is None:
        validate_df = df_validate
    training_data = list(reader.read(train_df))
    validation_data = list(reader.read(validate_df))
    return training_data, validation_data

In [None]:
def run_training_loop(text_header, label_header, tokenizer, token_indexers, max_tokens):
    """Build reader, vocabulary, model and loaders, train, and return ``(model, reader)``.

    The five arguments are passed positionally to ``build_dataset_reader``.
    """
    reader = build_dataset_reader(text_header, label_header, tokenizer, token_indexers, max_tokens)

    train_instances, dev_instances = read_data(reader)

    # The vocabulary covers both splits.
    vocabulary = build_vocab(train_instances + dev_instances)
    classifier = build_model(vocabulary)

    loaders = build_data_loaders(train_instances, dev_instances)
    for loader in loaders:
        loader.index_with(vocabulary)
    train_loader, dev_loader = loaders

    # Training artefacts go to a throw-away directory that is removed once the
    # block exits; only the in-memory model is handed back to the caller.
    with tempfile.TemporaryDirectory() as scratch_dir:
        trainer = build_trainer(classifier, scratch_dir, train_loader, dev_loader)
        print("Starting training")
        trainer.train()
        print("Finished training")

    return classifier, reader

In [None]:
# exist_ok=True lets this cell be re-run in the same kernel without the
# registry complaining that the name is already taken.
@Predictor.register("context-classifier", exist_ok=True)
class ContextClassifierPredictor(Predictor):
    """Predictor that classifies a single piece of text.

    Input JSON is expected to carry the text under the ``"sentence"`` key.
    """

    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` in the expected JSON shape and return the model output."""
        return self.predict_json({"sentence": sentence})

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Turn the ``"sentence"`` entry into an unlabelled instance via the dataset reader."""
        sentence = json_dict["sentence"]
        return self._dataset_reader.text_to_instance(sentence)

In [None]:
def predictions(predictor, vocab, dataframe):
    """Lazily yield, per row, the snippet followed by a ``(label, probability)`` pair per class.

    The text sent to the predictor comes from the row's ``texts`` column and
    the snippet from its ``snippets`` column. Label names are looked up in
    the ``"labels"`` namespace of ``vocab``.
    """
    for record in dataframe.itertuples():
        class_probs = predictor.predict(record.texts)["probs"]
        labelled = [
            (vocab.get_token_from_index(class_index, "labels"), probability)
            for class_index, probability in enumerate(class_probs)
        ]
        yield [record.snippets, *labelled]

In [None]:
# Train the classifier. The model reads the full `texts` column (not `snippets`)
# and the readable `mapped_label` column. The tokenizer and indexer both use
# "bert-base-uncased"; the "bert" indexer key must match the embedder key in
# `build_model`. Token sequences are cut to 512 tokens by the reader.
model, dataset_reader = run_training_loop(
    text_header="texts",
    label_header="mapped_label",
    tokenizer=PretrainedTransformerTokenizer(model_name="bert-base-uncased"),
    token_indexers={"bert": PretrainedTransformerIndexer(model_name="bert-base-uncased")},
    max_tokens=512
)

In [None]:
# Now we can evaluate the model on a new dataset.
test_data = list(dataset_reader.read(df_test))
data_loader = SimpleDataLoader(test_data, batch_size=8)
data_loader.index_with(model.vocab)
results = evaluate(model, data_loader)
# Show the evaluation metrics as the cell output instead of discarding them.
results

In [None]:
# Reuse the trained model's vocabulary and wrap model + reader in a predictor
# for single-text inference.
vocab = model.vocab
predictor = ContextClassifierPredictor(model, dataset_reader)
# p = predictions(predictor, vocab, df_unseen)

In [None]:
# next(p)

In [None]:
def apply_predictions(predictor, vocab, text):
    """Classify ``text`` and return ``(predicted_label, probability_of_that_label)``.

    The label is the entry of the ``"labels"`` vocabulary namespace at the
    position of the highest probability.
    """
    class_probs = predictor.predict(text)['probs']
    best = np.argmax(class_probs)
    return vocab.get_token_from_index(best, "labels"), class_probs[best]

In [None]:
# Predict a label and its probability for every unlabelled text in one pass.
unseen_predictions = [
    apply_predictions(predictor, vocab, text) for text in df_unseen['texts']
]
# `.assign` builds a new frame, so nothing is written into a slice of
# `df_with_amazon`, and re-running the cell simply recomputes both columns.
# The lists are in row order, so they are assigned positionally.
df_unseen = df_unseen.assign(
    predicted_labels=[label for label, _ in unseen_predictions],
    confidence=[confidence for _, confidence in unseen_predictions],
)
df_unseen.sample(7)

In [None]:
# Write `df_unseen` to "output.xlsx"; the path is relative to the current
# working directory.
df_unseen.to_excel("output.xlsx")