# Experiment 5: Performance on different datasets

In [None]:
from data_classes.TextLightningDataModule import TextLightningDataModule
from models.ClassifierSystem import LightningClassifier
from data_classes.pretrained_embeddings import get_pretrained_embeddings
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import pandas as pd


In [None]:
# --- Embedding / data configuration ---------------------------------------
# The dataset name and its class count are not fixed here: the training cell
# iterates over (dataset, num_class) pairs itself.
embedding = "Glove"
max_vectors = 20_000
dim = 300
# Truncation value handed to the data modules.
# NOTE(review): 234 + 2 * 173 looks like "mean + 2 * std" of some length
# statistic — confirm where these numbers come from.
trunc = 234 + 2 * 173


# --- Training configuration -----------------------------------------------
# max_epochs and max_steps are both forwarded to the Trainer.
max_epochs = None
max_steps = 16_000
# Early-stopping patience and the logged metric it watches.
patience = 6
monitor = "Val Loss"
lr = 0.001
batch_size = 128
num_workers = 0
advanced_metrics = False

# --- Classifier configuration ---------------------------------------------
embedding_level = "word"
num_layers = 1
output_layer_type = "linear"
hidden_size = 100
# NOTE(review): num_layers and hidden_size are defined here but are not passed
# to LightningClassifier in this notebook's training cell — confirm intent.

# --- Logging ---------------------------------------------------------------
# Root directories for the TensorBoard and CSV loggers, respectively.
log_file = "exp5"
log_file_csv = "exp5_csv"



In [None]:
# Fetch the pretrained embeddings selected in the settings cell; the result
# is unpacked into `vocab` and `vectors`, which the data modules and the
# classifier receive later.
vocab, vectors = get_pretrained_embeddings(
    embedding=embedding,
    max_vectors=max_vectors,
    dim=dim,
)


In [None]:
# Registry of data modules, keyed by dataset name.
data = dict()

In [None]:
# Build the IMDB data module and register it under its dataset name.
dataset = "IMDB"
data[dataset] = TextLightningDataModule(
    vocab,
    dataset=dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    trunc=trunc,
)

In [None]:
# Build the Yelp data module and register it under its dataset name.
dataset = "Yelp"
data[dataset] = TextLightningDataModule(
    vocab,
    dataset=dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    trunc=trunc,
)

In [None]:
# Train and test one classifier per (model type, dataset) combination.
for model_type in ["LSTM", "GRU"]:
    for (dataset, num_class) in [("IMDB", 2), ("Yelp", 5)]:
        # Run identifier shared by both loggers, e.g. "exp5-LSTM-IMDB-2".
        name = f"{log_file}-{model_type}-{dataset}-{num_class}"
        logger_tensor = TensorBoardLogger(log_file, name=name)
        logger_csv = CSVLogger(log_file_csv, name=name)

        # IMDB uses the configured patience; every other dataset uses 1.
        if dataset == "IMDB":
            actual_patience = patience
        else:
            actual_patience = 1

        trainer = Trainer(
            max_epochs=max_epochs,
            max_steps=max_steps,
            gpus=1,
            auto_select_gpus=True,
            callbacks=[
                EarlyStopping(monitor=monitor, patience=actual_patience),
            ],
            logger=[logger_tensor, logger_csv],
        )
        classifier = LightningClassifier(
            embedding_level=embedding_level,
            num_class=num_class,
            vocab=vocab,
            vectors=vectors,
            embedding_size=dim,
            learning_rate=lr,
            model_type=model_type,
            output_layer_type=output_layer_type,
            advanced_metrics=advanced_metrics,
        )

        # Fit on the matching data module, then test using the "best" checkpoint.
        trainer.fit(classifier, data[dataset])
        trainer.test(ckpt_path="best")


In [None]:
# Gather the metrics CSV written by the CSVLogger for every
# (model type, dataset) run, tag each row with the run's configuration, and
# save the combined table for this experiment.
import os

df_list = []
for model_type in ["LSTM", "GRU"]:
    for (dataset, num_class) in [("IMDB", 2), ("Yelp", 5)]:
        # Must match the run name given to the loggers in the training cell.
        name = log_file + "-" + model_type + "-" + dataset + "-" + str(num_class)
        # Read from the CSV logger's root directory instead of a hard-coded
        # copy of its name, so the two cannot drift apart.
        # NOTE: only version_0 is read. If training is re-run without clearing
        # the log directory, the logger is expected to write to a new
        # version_N folder and this would pick up stale results — verify.
        filename = os.path.join(log_file_csv, name, "version_0", "metrics.csv")
        df = pd.read_csv(filename)
        df["Model Type"] = model_type
        df["Dataset"] = dataset
        df["Number of Classes"] = num_class
        df_list.append(df)
global_df = pd.concat(df_list)

# The output file is named after this experiment's log prefix ("exp5") so it
# cannot overwrite the results file of another experiment.
results_dir = "results"
# to_csv does not create missing parent directories.
os.makedirs(results_dir, exist_ok=True)
global_df.to_csv(os.path.join(results_dir, log_file + "_all.csv"))