In [2]:
import os 
import sys
import torch
import fasttext
import fasttext.util

In [6]:
def load_fasttext():
    ft_path = '../data/fasttext'
    ft_fname = os.path.join(ft_path, 'cc.en.300.bin')
    if not os.path.exists(ft_fname):
        print("Downloading fasttext model")
        temp_fname = fasttext.util.download_model(
            "en", if_exists='ignore')
        os.rename(temp_fname, ft_fname)
        os.rename(temp_fname + '.gz', ft_fname + '.gz')

    print("Loading fasttext model")
    return fasttext.load_model(ft_fname)

fasttext_model = load_fasttext()

Loading fasttext model




In [3]:
import json

train_data = []
with open('/content/tasks/data/semgraph2/train.jsonl', 'r') as json_file:
    json_list = list(json_file)
    for json_str in json_list:
      result = json.loads(json_str)
      train_data.append(result)
print(train_data[0]['text'])

val_data = []
with open('/content/tasks/data/semgraph2/val.jsonl', 'r') as json_file:
    json_list = list(json_file)
    for json_str in json_list:
      result = json.loads(json_str)
      val_data.append(result)
print(val_data[0]['text'])

A man is on a surfboard surfing the waves and catching good air
Polymeal nutrition increases cardiovascular mortality


In [8]:
import string

def tokenize(text):
  text.translate(str.maketrans('', '', string.punctuation))
  return text.split()

train_words = [tokenize(example['text']) for example in train_data]
val_words = [tokenize(example['text']) for example in val_data]

In [11]:
def get_fasttext(fasttext_model, words):
    embeddings = [[fasttext_model[word] for word in sentence] 
                  for sentence in words]
    return embeddings

train_fast_embeddings = get_fasttext(fasttext_model, train_words)
val_fast_embeddings = get_fasttext(fasttext_model, val_words)

In [12]:
output_fast_train = zip(train_fast_embeddings, train_words)
output_fast_val = zip(val_fast_embeddings, val_words)

In [14]:
import pickle

with open("./dataset/output_fast_train", "wb") as f:
    pickle.dump(output_fast_train, f)
with open("./dataset/output_fast_val", "wb") as f:
    pickle.dump(output_fast_val, f)

In [4]:
import torch

from dataset import SemgraphEdgeDataset
from dataset import SemgraphNodeDataset

from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_data_cls(task):
    if task == 'semgraph_node':
        return SemgraphNodeDataset
    if task == 'semgraph_edge':
        return SemgraphEdgeDataset


def generate_batch(batch):
    x = torch.cat([item[0].unsqueeze(0) for item in batch], dim=0)
    y = torch.cat([item[1].unsqueeze(0) for item in batch], dim=0)

    x, y = x.to(device), y.to(device, dtype=torch.long)
    return (x, y)


def get_data_loader(dataset_cls, representations,
                    pca_size, mode, batch_size, shuffle,
                    pca=None, classes=None, words=None):
    data_set = dataset_cls(representations, pca_size,
                          mode, pca=pca, classes=classes, words=words)
    dataloader = DataLoader(data_set, batch_size=batch_size,
                            shuffle=shuffle, collate_fn=generate_batch)
    print(data_set.words)
    return dataloader, data_set.pca, data_set.classes, data_set.words


def get_data_loaders(task, representations, pca_size, batch_size):
    dataset_cls = get_data_cls(task)

    trainloader, pca, classes, words = get_data_loader(
        dataset_cls, representations, pca_size,
        'train', batch_size=batch_size, shuffle=True)
    devloader, _, classes, words = get_data_loader(
        dataset_cls, representations, pca_size,
        'val', batch_size=batch_size, shuffle=False, pca=pca,
        classes=classes, words=words)
    return trainloader, devloader, devloader.dataset.n_classes, devloader.dataset.n_words

trainloader, devloader, n_classes, n_words = get_data_loaders("semgraph_edge", "fast", 600, 64)

None
None


In [5]:
from model import MLP, TransparentDataParallel

def get_model(n_classes, n_words):
    mlp = MLP(
        "semgraph_edge", embedding_size=600, n_classes=n_classes, hidden_size=128,
        nlayers=1, dropout=0.3, representation="fast", n_words=n_words)

    if torch.cuda.device_count() > 1:
        mlp = TransparentDataParallel(mlp)
    return mlp.to(device)
  
model = get_model(n_classes, n_words)

In [6]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from train_info import TrainInfo

def train(trainloader, devloader, model, eval_batches, wait_iterations):
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(device=device)

    with tqdm(total=wait_iterations) as pbar:
        mode_train_info = TrainInfo(pbar, wait_iterations, eval_batches)
        while not mode_train_info.finish:
            train_epoch(trainloader, devloader, model,
                        optimizer, criterion, mode_train_info)

    model.recover_best()

In [7]:
def _evaluate(evalloader, model):
  #criterion = nn.CrossEntropyLoss().to(device=device)
  dev_loss, dev_acc = 0, 0
  for x, y in evalloader:
    loss, acc = model.eval_batch(x, y)
    dev_loss += loss
    dev_acc += acc

  n_instances = len(evalloader.dataset)
  return {
    'loss': dev_loss / n_instances,
    'acc': dev_acc / n_instances
  }

def evaluate(evalloader, model):
  model.eval()
  with torch.no_grad():
    result = _evaluate(evalloader, model)
  model.train()
  return result

def train_epoch(trainloader, devloader, model, optimizer, criterion, mode_train_info):
  for x, y in trainloader:
    loss = model.train_batch(x, y, optimizer)
    mode_train_info.new_batch(loss)

    if mode_train_info.eval:
      dev_results = evaluate(devloader, model)

      if mode_train_info.is_best(dev_results):
        model.set_best()
      elif mode_train_info.finish:
        mode_train_info.print_progress(dev_results)
        return

      mode_train_info.print_progress(dev_results)

In [9]:
def eval_all(model, trainloader, devloader):
  train_results = evaluate(trainloader, model)
  dev_results = evaluate(devloader, model)

  print('Final loss. Train: %.4f Dev: %.4f', (train_results['loss'], dev_results['loss']))
  print('Final acc. Train: %.4f Dev: %.4f', (train_results['acc'], dev_results['acc']))
  return train_results, dev_results


def save_results(model, train_results, dev_results, results_fname):
  results = {'n_classes': model.n_classes,
             'embedding_size': model.embedding_size,
             'hidden_size': model.hidden_size,
             'nlayers': model.nlayers,
             'dropout_p': model.dropout_p,
             'train_loss': train_results['loss'],
             'dev_loss': dev_results['loss'],
             'train_acc': train_results['acc'],
             'dev_acc': dev_results['acc'],
            }
  with open(results_fname, "w") as write_file:
    json.dump(results, write_file, indent=4)

def save_checkpoints(model, train_results, dev_results):
  model.save("checkpoints")
  results_fname = "checkpoints" + '/results.json'
  save_results(model, train_results, dev_results, results_fname)


  print('Final loss. Train: %.4f Dev: %.4f' (train_results['loss'], dev_results['loss']))
  print('Final acc. Train: %.4f Dev: %.4f' (train_results['acc'], dev_results['acc']))


In [8]:
train(trainloader, devloader, model, 100, 2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints(model, train_results, dev_results)

Training loss: 1.0296 Dev loss: 1.6740 acc: 0.5630:  71%|███████▏  | 4573/6400 [02:19<00:59, 30.47it/s]