In [16]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import transformers
%load_ext autoreload
%autoreload 2

transformers.logging.set_verbosity_error()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from utils.seed import set_seed

# Seed value for this run, fixed before any data loading or model construction.
# NOTE(review): which RNGs set_seed covers is defined in utils.seed — confirm there.
SEED = 17
set_seed(SEED)

# Load the data

In [18]:
from utils.data_processing import load_msrp_txt

# Read both MSRP splits with the same loader: train first, then test.
# NOTE(review): the cell output reports "Skipping line N: expected 5 fields,
# saw 6", so some rows are dropped while parsing — confirm that is acceptable.
train_df, test_df = (
    load_msrp_txt(split_path)
    for split_path in ('msr_paraphrase_train.txt', 'msr_paraphrase_test.txt')
)

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

In [19]:
# Wrap each split's sentence pairs and labels in a SentencesDataset.

from dataset import SentencesDataset


def _as_sentences_dataset(frame):
    """Build a SentencesDataset from the `s1`, `s2` and `label` columns of `frame`."""
    first_sentences = frame.s1.tolist()
    second_sentences = frame.s2.tolist()
    labels = frame.label.tolist()
    return SentencesDataset(first_sentences, second_sentences, labels)


train_ds = _as_sentences_dataset(train_df)
test_ds = _as_sentences_dataset(test_df)

# Train

In [20]:
import torch
from models import tBERT
from transformers import AdamW

# PARAMETERS

# --- training ---
batch_size = 16   # batch size used for both DataLoaders
num_epochs = 10   # handed to the Trainer
# Prefer the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- LDA ---
n_topics = 70     # forwarded to tBERT as n_topics

# --- embedding ---
max_length = 64   # forwarded to tBERT as max_length

# --- optimizer ---
lr = 5e-5         # learning rate given to AdamW

In [21]:
# Shuffle only the training batches. The test loader is used purely for
# evaluation, so it keeps the dataset order to make evaluation runs repeatable.
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

In [22]:
from utils.data_processing import create_sentences_corpus

# The corpus is built from the training loader only and handed to tBERT
# together with the topic count, max_length and the target device.
corpus = create_sentences_corpus(train_dataloader)
model = tBERT(
    corpus,
    n_topics=n_topics,
    max_length=max_length,
    device=device,
)

# NOTE(review): this AdamW is the one imported from `transformers`, which
# upstream has deprecated in favour of `torch.optim.AdamW`. The defaults
# differ, so migrating would need the hyper-parameters to be set explicitly.
optimizer = AdamW(model.parameters(), lr=lr)

In [23]:
from trainer import Trainer

# Bundle the model, its optimizer and the training data into a Trainer.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    num_epochs=num_epochs,
    device=device,
)

In [24]:
# Run training with the settings given to the Trainer. The 2470-step progress
# bar in the output is consistent with 247 training batches x 10 epochs.
trainer.train()

  0%|          | 0/2470 [00:00<?, ?it/s]

In [25]:
# Score the trained model on the training split.
# NOTE(review): presumably an F1 score, going by the variable name — confirm in Trainer.evaluate.
f1_train = trainer.evaluate(train_dataloader)

  0%|          | 0/247 [00:00<?, ?it/s]

In [26]:
# Score the trained model on the held-out test split.
# NOTE(review): presumably an F1 score, going by the variable name — confirm in Trainer.evaluate.
f1_test = trainer.evaluate(test_dataloader)

  0%|          | 0/103 [00:00<?, ?it/s]

In [27]:
# Report both scores, one per line, using the self-documenting `=` f-string form.
print(f'{f1_train=}', f'{f1_test=}', sep='\n')

f1_train=0.9200925431571453
f1_test=0.8300597779675493
