# Flair RNN 

This notebook contains the attempt of the text classification using the Flair NLP Framework [link|https://github.com/flairNLP/flair] which internally uses the pytorch ml library.

In [1]:
import torch
torch.cuda.device(0)

<torch.cuda.device at 0x7f932436d290>

In [2]:
import os
import random
import pandas as pd
import numpy as np
import parent_modules
import preprocessor
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN


%load_ext autoreload
%load_ext nb_black
%autoreload 2

from definitions import *

TESTING = True

<IPython.core.display.Javascript object>

> Data loading and transformation for the correct usage.

In [3]:
datasets = {
    "posts": pd.read_csv(
        os.path.join(DATA_DIR, "posts.tsv"), sep="\t|\t ", header=None
    ),
    "test": pd.read_csv(os.path.join(DATA_DIR, "test.csv"), header=None),
    "train": pd.read_csv(os.path.join(DATA_DIR, "train.csv"), header=None),
    "users": pd.read_csv(os.path.join(DATA_DIR, "users.csv")),
}
datasets["posts"].columns = ["post_id", "user_id", "post"]
datasets["test"].columns = ["post_id", "label"]
datasets["train"].columns = ["post_id", "label"]

train_ids = datasets["train"]["post_id"]
test_ids = datasets["test"]["post_id"]
train_posts = datasets["posts"][datasets["posts"].post_id.isin(list(train_ids))].post
datasets["train"].insert(2, "post", list(train_posts))
tests_posts = datasets["posts"][datasets["posts"].post_id.isin(list(test_ids))].post
datasets["test"].insert(2, "post", list(tests_posts))

flair_full_train = datasets["train"].copy(deep=True)
flair_full_train["label"] = "__label__" + datasets["train"]["label"].astype(str)



  This is separate from the ipykernel package so we can avoid doing imports until


<IPython.core.display.Javascript object>

> Data splitting and Preprocessing.

In [4]:
# clean posts & find weights
flair_full_train["post"] = flair_full_train["post"].apply(
    lambda text: preprocessor.clean(text)
)
datasets["test"]["post"] = flair_full_train["post"].apply(
    lambda text: preprocessor.clean(text)
)

datasets["test"]["post"] = datasets["test"].post.apply(lambda post: post.strip())
datasets["test"]["post"] = datasets["test"]["post"].replace([None, ""], "NaN")

classes_counts = (
    flair_full_train["label"].value_counts().where(lambda cls: cls > 0).dropna()
)

# Apply Find the weights
total_entries = sum(classes_counts)
class_unordered_weight = {}
for label, cls_entries in zip(classes_counts.index, classes_counts):
    class_num = bytes(label.split("__label__")[1], "utf-8")
    class_unordered_weight[class_num] = np.round(
        (1 / cls_entries) * total_entries / 2, 4
    )

class_weight = OrderedDict()
for class_num in range(15):
    class_num = bytes(str(class_num), "utf-8")
    class_weight[class_num] = class_unordered_weight[class_num]

# split trainset to dev and train
if TESTING:
    big_train, small_train = train_test_split(
        flair_full_train,
        test_size=0.5,
        random_state=np.random.RandomState(12),
        stratify=flair_full_train["label"],
    )
    flair_train, flair_test = train_test_split(
        small_train,
        test_size=0.2,
        random_state=np.random.RandomState(12),
        stratify=small_train["label"],
    )

else:
    flair_train, flair_test = train_test_split(
        flair_full_train,
        test_size=0.2,
        random_state=np.random.RandomState(12),
        stratify=flair_full_train["label"],
    )

<IPython.core.display.Javascript object>

> Save the new csv files which will be loaded to the corpus of the flair.

In [5]:
flair_dev, flair_test = train_test_split(
    flair_test,
    test_size=0.5,
    random_state=np.random.RandomState(12),
    stratify=flair_test["label"],
)


# save as_csv
flair_train.to_csv(
    os.path.join(FLAIR_DATA_DIR, "flair_train.csv"),
    sep="\t",
    index=False,
    header=False,
    columns=["label", "post"],
)
flair_dev.to_csv(
    os.path.join(FLAIR_DATA_DIR, "flair_dev.csv"),
    sep="\t",
    index=False,
    header=False,
    columns=["label", "post"],
)
flair_test.to_csv(
    os.path.join(FLAIR_DATA_DIR, "flair_test.csv"),
    sep="\t",
    index=False,
    header=False,
    columns=["label", "post"],
)

<IPython.core.display.Javascript object>

In [5]:
class_weight

OrderedDict([(b'0', 1.1758),
             (b'1', 11.5366),
             (b'2', 2.781),
             (b'3', 4.7218),
             (b'4', 6.207),
             (b'5', 36.725),
             (b'6', 70.3245),
             (b'7', 367.25),
             (b'8', 54.1844),
             (b'9', 58.5),
             (b'10', 13.1946),
             (b'11', 20.4028),
             (b'12', 22.1087),
             (b'13', 31.9348),
             (b'14', 20.2776)])

<IPython.core.display.Javascript object>

## Flair Load Embeddings

### Instractions
> In case you haven't download the used embeddings then click on the below links and place them in the *data/flair_files/* folder

#### Twitter Embeddings
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim.vectors.npy
2. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim

#### News Forward English
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt

#### News Backward English
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt

#### Glove
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy
2. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim



In [6]:
from flair.embeddings import (
    StackedEmbeddings,
    DocumentRNNEmbeddings,
    WordEmbeddings,
    FlairEmbeddings,
)
from flair.datasets import ClassificationCorpus
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.data import Sentence
from flair.visual.training_curves import Plotter
from torch.utils.data.sampler import WeightedRandomSampler


output_folder = os.path.join(FLAIR_OUTPUT_DIR, "rnn_flair_basic")
new_model_folder = os.path.join(FLAIR_OUTPUT_DIR, "flair_training_model")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


<IPython.core.display.Javascript object>

### Custom Loss Function 

#### Focal Loss - Dense Loss function for imbalanced Datasets

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)):
            self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list):
            self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim() > 2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)  # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))  # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1, target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0, target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1 - pt) ** self.gamma * logpt
        if self.size_average:
            return loss.mean()
        else:
            return loss.sum()

<IPython.core.display.Javascript object>

### Word Embeddings using Word2Vec and Fasttext embeddings.

In [7]:
word_embeddings = [
    WordEmbeddings(os.path.join(FLAIR_EMDG_DIR, "twitter.gensim")),
    WordEmbeddings(os.path.join(FLAIR_EMDG_DIR, "glove.gensim")),
    FlairEmbeddings(
        os.path.join(FLAIR_EMDG_DIR, "lm-news-english-forward-1024-v0.2rc.pt")
    ),
    FlairEmbeddings(
        os.path.join(FLAIR_EMDG_DIR, "lm-news-english-backward-1024-v0.2rc.pt")
    ),
]

corpus = ClassificationCorpus(
    FLAIR_DATA_DIR,
    test_file="flair_test.csv",
    dev_file="flair_dev.csv",
    train_file="flair_train.csv",
)

2020-06-23 20:05:16,628 Reading data from /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir
2020-06-23 20:05:16,629 Train: /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir/flair_train.csv
2020-06-23 20:05:16,630 Dev: /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir/flair_dev.csv
2020-06-23 20:05:16,630 Test: /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir/flair_test.csv


<IPython.core.display.Javascript object>

### RNN Model Design
> Define the RNN Model parameters to along with the already constructed layers of the Flair Embeddings.

In [8]:
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    rnn_layers=2,
    dropout=0.2,
    reproject_words=True,
    rnn_type="GRU",
    bidirectional=True,
    reproject_words_dimension=256,
)
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    loss_weights=class_weight,
    multi_label=False,
)
# Uncomment the below line if you want to run the network using the FocalLoss as Loss function.
# classifier.loss_fuction = FocalLoss(alpha=0.4, gamma=0.5)

2020-06-23 20:05:33,852 Computing label dictionary. Progress:


100%|██████████| 5932/5932 [00:03<00:00, 1876.26it/s]

2020-06-23 20:05:37,208 [b'12', b'0', b'4', b'2', b'3', b'10', b'11', b'5', b'13', b'14', b'1', b'9', b'8', b'6', b'7']





<IPython.core.display.Javascript object>

### Model training  with basic parameters.

In [None]:
training_models_path = os.path.join(FLAIR_OUTPUT_DIR, "flair_training_models")
trainer = ModelTrainer(classifier, corpus)
## Failed attempt to use WeightedRandomSampler to the
# trainer.train(
#     FLAIR_OUTPUT_DIR,
#     learning_rate=0.1,
#     mini_batch_size=64,
#     patience=5,
#     max_epochs=20,
#     embeddings_storage_mode="gpu",
#     sampler=WeightedRandomSampler(
#     list(class_weight.values()), len(class_weight.keys())
#     ),
# )
trainer.train(
    training_models_path,
    learning_rate=0.1,
    mini_batch_size=64,
    patience=5,
    max_epochs=10,
    embeddings_storage_mode="gpu",
    #     sampler=WeightedRandomSampler(
    #         list(class_weight.values()), len(class_weight.keys())
    #     ),
)

In [None]:
## Plot training results

2020-06-23 00:17:38,174 ----------------------------------------------------------------------------------------------------
2020-06-23 00:17:38,192 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('/home/giannhs/PycharmProjects/data_challenge/data/flair_emdg_dir/twitter.gensim')
      (list_embedding_1): WordEmbeddings('/home/giannhs/PycharmProjects/data_challenge/data/flair_emdg_dir/glove.gensim')
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_3): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (de

## Plot training results

In [None]:
from flair.visual.training_curves import Plotter
import os

flair_plt = Plotter()
training_res_path = os.path.join(FLAIR_OUTPUT_DIR, "loss.tsv")
print(training_res_path)

### Training curves

In [None]:
flair_plt.plot_training_curves(training_res_path)

### Learning curve

In [None]:
flair_plt.plot_learning_rate(training_res_path)

### Load already trained model

In [None]:
training_models_path = os.path.join(FLAIR_OUTPUT_DIR, "flair_training_models")
classifier_gru = TextClassifier.load(
    os.path.join(training_models_path, "best-model.pt")
)

### Model prediction

In [None]:
predictions = classifier_gru.predict(list(datasets["test"].post), multi_class_prob=True)

In [None]:
predictions[0].annotation_layers["class"]

In [None]:
score_per_class = [
    {
        f"class_{class_pred.value}": class_pred.score
        for class_pred in pred.annotation_layers["class"]
    }
    for pred in predictions
]
predictions_scores_df = pd.DataFrame.from_dict(score_per_class)
predictions_scores_df.index = datasets["test"].post_id
predictions_scores_df.head()


In [None]:
predictions_scores_df = predictions_scores_df[[f"class_{i}" for i in range(15)]]
predictions_scores_df.head()

In [None]:
predictions_scores_df.to_csv(
    os.path.join(FLAIR_PREDICTION_OUTPUTS, "gru_prediction.csv"),
    index=True,
    header=True,
    index_label="id",
)