In [1]:
import torch
# Report which CUDA device will be used. Querying device 0 raises on a machine
# without a usable GPU, so fall back to a readable message instead of failing
# the very first cell of the notebook.
cuda_device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "CUDA not available - running on CPU"
)
cuda_device_name

'GeForce 940MX'

In [1]:
import os
import random
import pandas as pd
import numpy as np
import parent_modules

from sklearn.model_selection import train_test_split

%load_ext autoreload
%load_ext nb_black
%autoreload 2

from definitions import *

# Load the four raw tables into one dict keyed by table name.
#
# posts.tsv: the previous separator pattern "\t|\t " was meant to accept a tab
# optionally followed by a space, but regex alternation tries the bare "\t"
# first, so the space was never consumed and stayed glued to the start of the
# next field. r"\t ?" consumes it. Regex separators are only supported by the
# python parser, so it is requested explicitly instead of relying on pandas'
# silent fallback (which emits a ParserWarning).
datasets = {
    "posts": pd.read_csv(
        os.path.join(DATA_DIR, "posts.tsv"),
        sep=r"\t ?",
        engine="python",
        header=None,
    ),
    "test": pd.read_csv(os.path.join(DATA_DIR, "test.csv"), header=None),
    "train": pd.read_csv(os.path.join(DATA_DIR, "train.csv"), header=None),
    "users": pd.read_csv(os.path.join(DATA_DIR, "users.csv")),
}
# posts/test/train are read without a header row, so name their columns here.
datasets["posts"].columns = ["post_id", "user_id", "post"]
datasets["test"].columns = ["post_id", "label"]
datasets["train"].columns = ["post_id", "label"]

# print(datasets["posts"].applymap(lambda x: str(x).strip()).head())


def attach_posts(labelled, posts):
    """Return the post text for every row of ``labelled``, matched on ``post_id``.

    The lookup is done by key, not by position, so the result is correct
    regardless of the row order of either frame. The returned Series shares
    ``labelled``'s index.

    Parameters
    ----------
    labelled : DataFrame with a ``post_id`` column.
    posts : DataFrame with ``post_id`` and ``post`` columns.

    Raises
    ------
    ValueError
        If some ``post_id`` in ``labelled`` has no row in ``posts``.
    """
    # Keep the first occurrence of each id so the lookup index is unique.
    post_by_id = posts.drop_duplicates(subset="post_id").set_index("post_id")["post"]
    unknown = ~labelled["post_id"].isin(post_by_id.index)
    if unknown.any():
        raise ValueError(
            f"{int(unknown.sum())} post_id value(s) have no matching row in posts"
        )
    return labelled["post_id"].map(post_by_id)


train_ids = datasets["train"]["post_id"]
test_ids = datasets["test"]["post_id"]

# Previously the posts were filtered with isin() and pasted in positionally,
# which pairs labels with the wrong text whenever train.csv / test.csv are not
# in the same post_id order as posts.tsv.
train_posts = attach_posts(datasets["train"], datasets["posts"])
datasets["train"].insert(2, "post", train_posts)
tests_posts = attach_posts(datasets["test"], datasets["posts"])
datasets["test"].insert(2, "post", tests_posts)

# Build the labelled frame on a deep copy so datasets["train"] keeps its raw
# labels; every label gets the "__label__" prefix that is written to the split
# files further down.
prefixed_labels = "__label__" + datasets["train"]["label"].astype(str)
flair_full_train = datasets["train"].copy(deep=True)
flair_full_train["label"] = prefixed_labels

# Two stratified cuts, both seeded with 12:
#   1) 80% train / 20% hold-out
#   2) the hold-out is halved into dev and test (10% of the full set each)
flair_train, flair_holdout = train_test_split(
    flair_full_train,
    stratify=flair_full_train["label"],
    test_size=0.2,
    random_state=np.random.RandomState(12),
)
flair_dev, flair_test = train_test_split(
    flair_holdout,
    stratify=flair_holdout["label"],
    test_size=0.5,
    random_state=np.random.RandomState(12),
)


# save as_csv
# Persist the three splits as headerless "<label>\t<post>" files, one per split.
# NOTE(review): with pandas' default quoting, a post containing a tab, newline
# or double quote is written wrapped in quotes - confirm the downstream reader
# copes with that.
os.makedirs(FLAIR_DATA_DIR, exist_ok=True)  # a fresh checkout may lack the folder

flair_splits = {
    "flair_train.csv": flair_train,
    "flair_dev.csv": flair_dev,
    "flair_test.csv": flair_test,
}
for split_filename, split_frame in flair_splits.items():
    split_frame.to_csv(
        os.path.join(FLAIR_DATA_DIR, split_filename),
        sep="\t",
        index=False,
        header=False,
        columns=["label", "post"],
    )



<IPython.core.display.Javascript object>

In [2]:
# Preview the first rows of the held-out test split; labels carry the "__label__" prefix.
flair_test.head()

Unnamed: 0,post_id,label,post
5434,6768,__label__1,@PaulineHansonOz @SamClench @newscomauHQ #coro...
10350,12924,__label__2,US death toll is way less than estimated. This...
2828,3510,__label__0,Please check out https://t.co/NebOqVR5Ib @TheR...
7336,9118,__label__0,Riverside County Public Health recommend all c...
12526,15663,__label__0,Another member of Government sent to Covid Cov...


<IPython.core.display.Javascript object>

## Load Flair Embeddings

### Instructions
> If you haven't downloaded the embeddings used below yet, download them from the links below and place them in the *data/flair_files/* folder.

#### Twitter Embeddings
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim.vectors.npy
2. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim

#### News Forward English
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt

#### News Backward English
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt

#### Glove
1. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy
2. https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim



In [2]:
from flair.embeddings import (
    StackedEmbeddings,
    DocumentLSTMEmbeddings,
    WordEmbeddings,
    FlairEmbeddings,
)
from flair.datasets import ClassificationCorpus
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.data import Sentence

# Folder of the basic RNN model; "best-model.pt" is loaded from here further down.
output_folder = os.path.join(FLAIR_OUTPUT_DIR, "rnn_flair_basic")
# Second model folder; not referenced by any other cell visible in this notebook.
new_model_folder = os.path.join(FLAIR_OUTPUT_DIR, "flair_training_model")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


<IPython.core.display.Javascript object>

In [4]:
# Forward and backward Flair language-model embeddings, loaded from local files
# under FLAIR_EMDG_DIR.
# Disabled alternatives: WordEmbeddings for "twitter.gensim" / "glove.gensim"
# from the same folder, or the hosted WordEmbeddings("glove") with
# FlairEmbeddings("news-forward-fast") / FlairEmbeddings("news-backward-fast").
flair_lm_files = (
    "lm-news-english-forward-1024-v0.2rc.pt",
    "lm-news-english-backward-1024-v0.2rc.pt",
)
word_embeddings = [
    FlairEmbeddings(os.path.join(FLAIR_EMDG_DIR, lm_file))
    for lm_file in flair_lm_files
]

# Corpus built from the three split files written to FLAIR_DATA_DIR earlier.
corpus = ClassificationCorpus(
    FLAIR_DATA_DIR,
    train_file="flair_train.csv",
    dev_file="flair_dev.csv",
    test_file="flair_test.csv",
)

2020-06-15 01:34:32,443 Reading data from /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir
2020-06-15 01:34:32,443 Train: /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir/flair_train.csv
2020-06-15 01:34:32,444 Dev: /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir/flair_dev.csv
2020-06-15 01:34:32,445 Test: /home/giannhs/PycharmProjects/data_challenge/data/flair_data_dir/flair_test.csv


<IPython.core.display.Javascript object>

In [5]:
# Document-level encoder on top of the stacked word embeddings: words are first
# re-projected to 256 dimensions, then fed to an RNN with a 512-unit hidden state.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Scans the corpus once to collect the label set.
label_dictionary = corpus.make_label_dictionary()

# Every post carries exactly one label (one `label` column, stratified on it),
# so this is single-label multiclass classification. multi_label=True would
# train independent per-class outputs and can predict zero or several labels
# for a post.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)


2020-06-15 01:34:35,605 Computing label dictionary. Progress:


  """
100%|██████████| 11898/11898 [00:06<00:00, 1819.79it/s]

2020-06-15 01:34:42,259 [b'0', b'4', b'10', b'2', b'3', b'8', b'1', b'14', b'5', b'12', b'13', b'7', b'11', b'9', b'6']





<IPython.core.display.Javascript object>

### Model training with basic parameters

In [None]:
# Train into output_folder, the same folder the "Load already trained model"
# cell reads "best-model.pt" from. Training into FLAIR_OUTPUT_DIR itself left
# the checkpoint where that cell does not look.
trainer = ModelTrainer(classifier, corpus)
trainer.train(output_folder, max_epochs=10)

2020-06-15 01:34:47,192 ----------------------------------------------------------------------------------------------------
2020-06-15 01:34:47,193 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2048, out_features=256, bias=True)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  

Process Process-9:
Process Process-12:
Process Process-10:
Process Process-11:
Process Process-14:
Process Process-13:
Traceback (most recent call last):
Traceback (most recent call last):


2020-06-15 01:35:15,602 ----------------------------------------------------------------------------------------------------


  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/giannhs/min

2020-06-15 01:35:15,607 Exiting from training early.


Traceback (most recent call last):
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()


2020-06-15 01:35:15,608 Saving model ...


  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/giannhs/miniconda3/envs/data_challenge/lib/python3.7/multiprocessing/util.py", line 201, in __ca

2020-06-15 01:35:15,943 Done.
2020-06-15 01:35:15,945 ----------------------------------------------------------------------------------------------------
2020-06-15 01:35:15,947 Testing using best model ...


### Load already trained model

In [None]:
# Restore the saved checkpoint from the model folder, replacing any in-memory classifier.
best_model_path = os.path.join(output_folder, "best-model.pt")
classifier = TextClassifier.load(best_model_path)

In [None]:
# Sanity check: show the class of the object returned by TextClassifier.load.
type(classifier)

In [None]:
# Display the loaded classifier's `decoder` attribute.
classifier.decoder

### Model evaluation

In [None]:
# Smoke-test the classifier on the first ten posts of the official test set.
# predict() is called once per post; whatever it returns is printed, and the
# labels found on the Sentence afterwards are collected.
predictions = []
sample_posts = datasets["test"]["post"].iloc[:10].tolist()
for post_text in sample_posts:
    tagged_sentence = Sentence(post_text)
    res = classifier.predict(tagged_sentence)
    print(res)
    predictions.append(tagged_sentence.labels)

print(predictions)