In [1]:
import os
import time

import torch
from torch.utils.data import DataLoader, random_split
from torchtext.data import to_map_style_dataset

from reviews_dataset import reviews
from trainer import Trainer



In [5]:
def train(trainer: Trainer, dataset_path: str, EPOCHS = 10, LR = 5, BATCH_SIZE = 64):
    """Train ``trainer`` on the reviews dataset and report validation/test accuracy.

    The training split returned by ``reviews(root=dataset_path)`` is divided
    95% / 5% into train and validation subsets. After every epoch the
    validation accuracy is printed; when it falls below the best value seen so
    far, ``trainer.scheduler.step()`` is called. Once all epochs are done the
    test split is evaluated and its accuracy is printed.

    Args:
        trainer: Object providing ``train(dataloader, epoch)``,
            ``evaluate(dataloader)``, ``collate_batch`` and ``scheduler``.
        dataset_path: Passed as ``root`` to ``reviews``.
        EPOCHS: Number of training epochs.
        LR: Not read by this function; kept so existing calls keep working.
            NOTE(review): the learning rate appears to be fixed when the
            Trainer is constructed -- confirm.
        BATCH_SIZE: Batch size used for the train, validation and test loaders.

    Returns:
        The value returned by ``trainer.evaluate`` on the test dataloader.
    """
    train_iter, test_iter = reviews(root=dataset_path)
    train_dataset = to_map_style_dataset(train_iter)
    test_dataset = to_map_style_dataset(test_iter)

    # Hold out 5% of the training samples for validation.
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = random_split(
        train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=trainer.collate_batch)
    # Evaluation loaders are not shuffled: sample order is irrelevant when
    # only counting correct predictions.
    valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                  shuffle=False, collate_fn=trainer.collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=False, collate_fn=trainer.collate_batch)

    best_accu = None  # best validation accuracy observed so far
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        trainer.train(train_dataloader, epoch)
        accu_val = trainer.evaluate(valid_dataloader)
        if best_accu is not None and best_accu > accu_val:
            # Validation accuracy regressed: advance the scheduler.
            trainer.scheduler.step()
        else:
            best_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)

    # Actually run the test evaluation announced by the message below.
    print('Checking the results of test dataset.')
    accu_test = trainer.evaluate(test_dataloader)
    print('test accuracy {:8.3f}'.format(accu_test))
    return accu_test

def evaluate(trainer, dataloader: DataLoader):
    """Return the fraction of samples in ``dataloader`` that ``trainer.model`` classifies correctly.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must unpack as ``(label, text, offsets)``; the prediction for a
    sample is the argmax over dimension 1 of ``trainer.model(text, offsets)``.

    Args:
        trainer: Object exposing a callable ``model`` with an ``eval()`` method.
        dataloader: Iterable of ``(label, text, offsets)`` batches.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when ``dataloader`` yields
        no samples (instead of raising ``ZeroDivisionError``).
    """
    # NOTE(review): a later cell defines another function named ``evaluate``
    # with a different signature; after that cell runs, this name is rebound.
    trainer.model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = trainer.model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    if total_count == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    return total_acc / total_count

def evaluate(model, df_test):
    """Score ``model`` on ``df_test`` and return a one-row summary DataFrame.

    Calls ``predict(model, df_test)``, computes accuracy, per-class recall,
    per-class F1 and a confusion matrix, plots the confusion matrix with the
    display labels ``['neg', 'neu', 'pos']``, and returns the rounded metrics.

    Args:
        model: Passed through to ``predict``.
        df_test: Passed through to ``predict``.

    Returns:
        A ``pd.DataFrame`` with columns ``dataset``, ``accuracy``, ``recall``,
        ``f1_score`` and ``training samples``.

    NOTE(review): this function reads the notebook-level names ``dataset`` and
    ``df_train`` and relies on ``predict``, ``pd``, ``np`` and the metric
    helpers being defined/imported elsewhere -- confirm they exist before
    calling. It also reuses the name ``evaluate`` of an earlier definition in
    this notebook and therefore replaces it.
    """
    # Assumes predict() returns (predictions, true labels) in that order, as
    # the variable names suggest -- TODO confirm.
    preds, y_test = predict(model, df_test)

    # The metric functions take (y_true, y_pred). With the arguments swapped,
    # "recall" would actually be per-class precision and the confusion matrix
    # would be transposed.
    accuracy = accuracy_score(y_test, preds)
    recall = recall_score(y_test, preds, average=None)
    f1 = f1_score(y_test, preds, average=None)
    cm = confusion_matrix(y_test, preds)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['neg','neu','pos'])
    disp.plot()

    return pd.DataFrame({'dataset':[dataset], 'accuracy':[np.round(accuracy, 3)], 'recall':[np.round(recall, 3)], 'f1_score':[np.round(f1,3)], 'training samples':[len(df_train)]})

In [None]:
# Names of the entries in ../FinalDatasets/. Sorted because os.listdir returns
# entries in arbitrary order, which would make re-runs non-reproducible.
datasets = sorted(os.listdir("../FinalDatasets/"))

In [6]:
# Hyperparameters
EPOCHS = 10  # number of training epochs
LR = 5  # learning rate handed to the Trainer
BATCH_SIZE = 64  # batch size for training


dataset_path = 'ProcessedDatasets/all_reviews.json'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = reviews(root=dataset_path, split='train')

trainer = Trainer(dataset, device, LR)
# Pass the hyperparameters explicitly so that editing the constants above
# changes the run instead of silently falling back to train()'s defaults.
train(trainer, dataset_path, EPOCHS=EPOCHS, LR=LR, BATCH_SIZE=BATCH_SIZE)

# reviews_labels = {1: "Negative",
#                     2: "Neutral",
#                     3: "Positive"}

# ex_text_str1 = "Super mega proszek, bardzo dobry"
# ex_text_str2 = "Totalny badziew szkoda pieniędzy, beznadziejny, porażka"
# ex_text_str3 = "Nie domywa ale ładnie pachnie. Zostają smugi"
# ex_text_str4 = "Nie rozpuszczają się, dobrze domywają, ładnie pachną, ale bardzo drogie"

# print("This is a %s review" % reviews_labels[trainer.predict(ex_text_str1)])
# print("This is a %s review" % reviews_labels[trainer.predict(ex_text_str2)])
# print("This is a %s review" % reviews_labels[trainer.predict(ex_text_str3)])
# print("This is a %s review" % reviews_labels[trainer.predict(ex_text_str4)])


-----------------------------------------------------------
| end of epoch   1 | time:  1.53s | valid accuracy    0.860 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  1.37s | valid accuracy    0.871 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  1.33s | valid accuracy    0.879 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  1.55s | valid accuracy    0.889 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  1.44s | valid accuracy    0.907 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  1.41s |

In [None]:
# Score the trained model on the test split. No dataloader exists at notebook
# scope, so one is built here from the same dataset file used for training.
_, test_iter = reviews(root=dataset_path)
test_dataloader = DataLoader(to_map_style_dataset(test_iter), batch_size=BATCH_SIZE,
                             collate_fn=trainer.collate_batch)
# trainer.evaluate is called directly because the notebook-level name
# `evaluate` is defined twice with different signatures.
trainer.evaluate(test_dataloader)