In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas as pd
import time
from src.models.autoencoder import Autoencoder
from src.data.dataset import CustomDataset
from src.data.utils import collate_batch, get_vocab
from torch import nn
from torchmetrics import Precision, Recall, F1Score, Accuracy
from torch.utils.data import DataLoader,Dataset, WeightedRandomSampler
from torchtext.data.utils import get_tokenizer

### 1. Data

In [3]:
df_train = pd.read_csv("../data/processed/train.csv")
df_val = pd.read_csv("../data/processed/val.csv")
df_test = pd.read_csv("../data/processed/test.csv")

### 2. Dataset

In [5]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [6]:
train_dataset = CustomDataset(
    df_train["text"].values.tolist(), df_train["toxic"].values.tolist(), tokenizer
)
val_dataset = CustomDataset(
    df_val["text"].values.tolist(), df_val["toxic"].values.tolist(), tokenizer
)
test_dataset = CustomDataset(
    df_test["text"].values.tolist(), df_test["toxic"].values.tolist(), tokenizer
)

### 3. Vocabulary

In [7]:
#vocab = get_vocab(train_dataset, tokenizer)
vocab = torch.load("../data/vocab.pt")

### 4. Training

In [8]:
def train(model, dataloader, optimizer, criterion, epoch):
    model.train()
    total_loss, total_count = 0, 0
    log_interval = 1000
    start_time = time.time()

    for idx, (text, label, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        reconstructued = model(text, offsets)
        loss = criterion(reconstructued, model.embed(text, offsets))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_loss += loss.item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| loss {:8.3f}".format(
                    epoch, idx, len(dataloader), total_loss / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()
    return model


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label, offsets) in enumerate(dataloader):
            reconstructued = model(text, offsets)
            loss = criterion(reconstructued, model.embed(text, offsets))
            total_loss += loss.item()
            total_count += label.size(0)
    return total_loss / total_count

In [12]:
vocab_size = len(vocab)
emsize = 128
model = Autoencoder(vocab_size, emsize)

train_sampler = train_dataset.get_sampler()
train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=lambda x: collate_batch(batch=x, vocab=vocab, tokenizer=tokenizer), sampler=train_sampler)

val_sampler = val_dataset.get_sampler()
val_dataloader = DataLoader(val_dataset, batch_size=32, collate_fn=lambda x: collate_batch(batch=x, vocab=vocab, tokenizer=tokenizer), sampler=val_sampler)

test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=lambda x: collate_batch(batch=x, vocab=vocab, tokenizer=tokenizer))

EPOCHS = 10
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=5.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    model = train(model, train_dataloader, optimizer, criterion, epoch)
    accu_val = evaluate(model, val_dataloader, criterion)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |  1000/ 5589 batches | loss    0.000
| epoch   1 |  2000/ 5589 batches | loss    0.000
| epoch   1 |  3000/ 5589 batches | loss    0.001
| epoch   1 |  4000/ 5589 batches | loss    0.001
| epoch   1 |  5000/ 5589 batches | loss    0.001
-----------------------------------------------------------
| end of epoch   1 | time: 22.55s | valid accuracy    0.000 
-----------------------------------------------------------
| epoch   2 |  1000/ 5589 batches | loss    0.000
| epoch   2 |  2000/ 5589 batches | loss    0.000
| epoch   2 |  3000/ 5589 batches | loss    0.000
| epoch   2 |  4000/ 5589 batches | loss    0.001
| epoch   2 |  5000/ 5589 batches | loss    0.001
-----------------------------------------------------------
| end of epoch   2 | time: 20.27s | valid accuracy    0.000 
-----------------------------------------------------------
| epoch   3 |  1000/ 5589 batches | loss    0.000
| epoch   3 |  2000/ 5589 batches | loss    0.000
| epoch   3 |  3000/ 5589 batches | lo

### 5. Evaluation

In [14]:
print('Checking the results of test dataset.')
accu_test = evaluate(model, test_dataloader, criterion)
print('test accuracy {:8.8f}'.format(accu_test))

Checking the results of test dataset.
test accuracy 0.00012621


### 6. Export

In [31]:
#torch.save(model.state_dict(), "./autoencoder.pt")