# Experiment 6: Performance on different datasets (IMDB and Yelp)

In [None]:
from data_classes.TextLightningDataModule import TextLightningDataModule
from models.ClassifierSystem import LightningClassifier
from data_classes.pretrained_embeddings import get_pretrained_embeddings
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from pytorch_lightning.callbacks import ModelCheckpoint
import pandas as pd

In [None]:
# ---------------------------------------------------------------------------
# Experiment 6 configuration. Every name below is a plain module-level
# setting that the following cells read.
# ---------------------------------------------------------------------------

# Datasets and pretrained embeddings.
# Each dataset entry is (dataset name, number of target classes).
datasets = [("IMDB", 2), ("Yelp", 5)]
embeddings = ["Glove", "FastText", "Word2Vec"]
max_vectors = 20000  # passed as max_vectors when loading pretrained embeddings
dim = 300  # embedding dimensionality
# Truncation value handed to the data module (evaluates to 580).
# NOTE(review): the 234 / 173 constants look like length statistics of the
# corpus (mean + 2 * std?) - confirm against the data analysis.
trunc = 234 + 2 * 173

# Model architecture.
model_types = ["LSTM", "GRU"]
embedding_level = "word"
num_layers = 1
output_layer_type = "linear"
hidden_size = 100

# Optimisation and stopping criteria.
lr = 1e-3
batch_size = 64
num_workers = 0
max_epochs = None  # no explicit epoch limit; max_steps is the configured bound
max_steps = 20000
monitor = "Val Loss"  # metric name watched by early stopping and checkpointing
patience = 6
advanced_metrics = True

# Output locations: TensorBoard logs, CSV logs and model checkpoints.
log_file = "exp6"
log_file_csv = "exp6_csv"
model_ckpt_path = "model_ckpt"


In [None]:
# Registries filled by the loading cell:
#   data    - data modules keyed by "<dataset>-<embedding>"
#   vocab   - vocabulary per embedding name
#   vectors - pretrained vectors per embedding name
data, vocab, vectors = {}, {}, {}

In [None]:
# Load every pretrained embedding exactly once. The call only receives the
# embedding name, max_vectors and dim - nothing dataset-specific - so doing it
# inside the dataset loop would just repeat the same load for each dataset.
for embedding in embeddings:
    vocab[embedding], vectors[embedding] = get_pretrained_embeddings(
        embedding=embedding, max_vectors=max_vectors, dim=dim)

# Build one data module per (dataset, embedding) pair, stored under the key
# "<dataset>-<embedding>". The class count in `datasets` is not needed here.
for dataset, _ in datasets:
    for embedding in embeddings:
        data[dataset + "-" + embedding] = TextLightningDataModule(
            vocab[embedding], dataset=dataset, batch_size=batch_size,
            num_workers=num_workers, trunc=trunc)


In [None]:
# Train and test one classifier for every combination of recurrent cell,
# attention variant, dataset and pretrained embedding.
for model_type in model_types:
    for attention_type in [None, "last_hidden_layer", "self"]:
        for dataset, num_class in datasets:
            for embedding in embeddings:
                # Run identifier shared by both loggers and the checkpoint
                # directory; a missing attention type is spelled "none".
                attention_label = "none" if attention_type is None else attention_type
                name = "-".join([log_file, model_type, attention_label,
                                 dataset, str(num_class), embedding])

                tb_logger = TensorBoardLogger(log_file, name=name)
                csv_logger = CSVLogger(log_file_csv, name=name)
                checkpoint_callback = ModelCheckpoint(
                    dirpath=model_ckpt_path + "/" + name,
                    monitor=monitor,
                    filename=name + "-{epoch:02d}",
                )

                # The configured patience applies to IMDB only; every other
                # dataset stops after a single non-improving check.
                if dataset == "IMDB":
                    actual_patience = patience
                else:
                    actual_patience = 1
                early_stopping = EarlyStopping(
                    monitor=monitor, patience=actual_patience)

                trainer = Trainer(
                    max_epochs=max_epochs,
                    max_steps=max_steps,
                    gpus=1,
                    auto_select_gpus=True,
                    callbacks=[early_stopping, checkpoint_callback],
                    logger=[tb_logger, csv_logger],
                )
                classifier = LightningClassifier(
                    embedding_level=embedding_level,
                    num_class=num_class,
                    vocab=vocab[embedding],
                    vectors=vectors[embedding],
                    embedding_size=dim,
                    learning_rate=lr,
                    model_type=model_type,
                    output_layer_type=output_layer_type,
                    advanced_metrics=advanced_metrics,
                    attention_type=attention_type,
                    num_layers=num_layers,
                )

                # Fit on the matching data module, then test with the "best"
                # checkpoint as tracked by the checkpoint callback.
                trainer.fit(classifier, data[dataset + "-" + embedding])
                trainer.test(ckpt_path="best")


In [None]:
import os

# Gather the CSV metrics of every run into one table, tagging each row with
# the configuration that produced it, and write the result to results/.
df_list = []
for model_type in model_types:
    for attention_type in [None, "last_hidden_layer", "self"]:
        # A missing attention type is spelled "none" in run names and output.
        attention_label = attention_type if attention_type is not None else "none"
        for dataset, num_class in datasets:
            for embedding in embeddings:
                # Must match the run name used when the loggers were created.
                name = "-".join([log_file, model_type, attention_label,
                                 dataset, str(num_class), embedding])
                # Read from the configured CSV log root rather than a
                # hard-coded copy of it, so the two cannot drift apart.
                # NOTE(review): only "version_0" is read; if a run was logged
                # more than once, later versions are presumably ignored here.
                filename = log_file_csv + "/" + name + "/version_0/metrics.csv"
                df = pd.read_csv(filename)
                df["Model Type"] = model_type
                df["Dataset"] = dataset
                df["Number of Classes"] = num_class
                df["Attention Type"] = attention_label
                df["Embedding"] = embedding
                df_list.append(df)
global_df = pd.concat(df_list)
# Create the output directory first so the export does not fail after all
# runs have been read.
os.makedirs("results", exist_ok=True)
global_df.to_csv("results/exp6_all.csv")