In [43]:
import os 
import sys
import torch
import fasttext
import fasttext.util

In [44]:
def load_fasttext(ft_path='../data/fasttext'):
    """Load the English fastText model, downloading it first if it is missing.

    Args:
        ft_path: Directory that holds (or will hold) ``cc.en.300.bin``.
            Defaults to the location this notebook has always used.

    Returns:
        Whatever ``fasttext.load_model`` returns for the cached model file.
    """
    ft_fname = os.path.join(ft_path, 'cc.en.300.bin')
    if not os.path.exists(ft_fname):
        print("Downloading fasttext model")
        # The target directory may not exist on a fresh checkout; os.rename
        # would fail without it.
        os.makedirs(ft_path, exist_ok=True)
        temp_fname = fasttext.util.download_model(
            "en", if_exists='ignore')
        os.rename(temp_fname, ft_fname)
        # With if_exists='ignore' an already-present model can be reused
        # without fetching an archive, so only move the .gz when it exists.
        gz_fname = temp_fname + '.gz'
        if os.path.exists(gz_fname):
            os.rename(gz_fname, ft_fname + '.gz')

    print("Loading fasttext model")
    return fasttext.load_model(ft_fname)

# Load the model once at notebook level; later cells pass `fasttext_model`
# to get_fasttext().
fasttext_model = load_fasttext()

Loading fasttext model




In [1]:
import json

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    records = []
    # Stream the file line by line instead of materialising every line first;
    # JSON is read as UTF-8 regardless of the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            if json_str.strip():  # tolerate blank / trailing empty lines
                records.append(json.loads(json_str))
    return records


train_data = load_jsonl('/content/tasks/data/semgraph2/train.jsonl')
print(train_data[0]['text'])

val_data = load_jsonl('/content/tasks/data/semgraph2/val.jsonl')
print(val_data[0]['text'])

A nun looking at a camera
Polymeal nutrition increases cardiovascular mortality


In [2]:
def flatten_list(_2d_list):
    """Flatten exactly one level of nesting.

    Entries whose type is exactly ``list`` are spliced into the result; every
    other entry is kept as a single item. Deeper nesting is left untouched.
    """
    flattened = []
    for entry in _2d_list:
        # Exact type check: only plain lists are unpacked.
        if type(entry) is not list:
            flattened.append(entry)
            continue
        flattened.extend(entry)
    return flattened

import pandas as pd

# Frame of the validation examples; its `targets` column is flattened by one
# level and turned into a frame of its own.
val_df = pd.DataFrame(val_data)
val_targets = flatten_list(val_df['targets'])
val_y_df = pd.DataFrame(val_targets)

# Label distribution of the flattened targets (shown as the cell output).
val_y_df['label'].value_counts()

no_relation            131006
relation_2_relation     28158
modifier_2_concept      26856
concept_2_modifier      26856
concept_2_relation      21449
relation_2_concpet      21449
relation_2_modifier      6170
modifier_2_relation      6170
Name: label, dtype: int64

In [6]:
from math import log2

def entropy(classes):
  """Shannon entropy, in bits, of a distribution given as class counts.

  Args:
    classes: Iterable of non-negative class counts (or weights).

  Returns:
    The entropy in bits. Zero-count classes contribute nothing, and an empty
    or all-zero input yields 0.0.
  """
  counts = list(classes)  # two passes are needed, so accept one-shot iterables
  total = sum(counts)
  if total == 0:
    return 0.0

  acc = 0.0
  for count in counts:
    if count == 0:
      continue  # p*log2(p) -> 0 as p -> 0; log2(0) itself would raise
    p = count / total
    acc += p * log2(p)
  return -acc

# Entropy (bits) of the validation label counts printed by value_counts()
# above, leaving out the `no_relation` class.
data_entropy = entropy([28158,26856,26856,21449,21449,6170,6170])

In [4]:
import string

def tokenize(text):
  """Strip ASCII punctuation from ``text`` and split it on whitespace.

  Args:
    text: The input string.

  Returns:
    A list of whitespace-separated tokens with every character from
    ``string.punctuation`` removed.
  """
  # str.translate returns a new string (str is immutable); the result has to
  # be captured, otherwise the punctuation is never removed.
  # NOTE(review): punctuation-only tokens now disappear; if downstream
  # targets refer to tokens by position, confirm the alignment still holds.
  cleaned = text.translate(str.maketrans('', '', string.punctuation))
  return cleaned.split()

# Tokenise the `text` field of every training / validation example.
train_words = [tokenize(record['text']) for record in train_data]
val_words = [tokenize(record['text']) for record in val_data]

In [5]:
def get_fasttext(fasttext_model, words):
    """Look up a value for every token of every sentence.

    Args:
        fasttext_model: Object indexable by a word (``fasttext_model[word]``).
        words: Iterable of sentences, each an iterable of words.

    Returns:
        A list with one inner list per sentence, holding the looked-up values
        in token order.
    """
    embeddings = []
    for sentence in words:
        sentence_vectors = []
        for token in sentence:
            sentence_vectors.append(fasttext_model[token])
        embeddings.append(sentence_vectors)
    return embeddings

# Per-sentence lists of fastText lookups for the tokenised train / val text.
train_fast_embeddings = get_fasttext(fasttext_model, train_words)
val_fast_embeddings = get_fasttext(fasttext_model, val_words)

In [6]:
# zip() yields a single-pass iterator; materialise the (embeddings, words)
# pairs so they can be iterated more than once and are stored as plain lists.
output_fast_train = list(zip(train_fast_embeddings, train_words))
output_fast_val = list(zip(val_fast_embeddings, val_words))

In [8]:
import pickle

# Persist the fastText outputs of both splits, train first, then val.
_fast_dumps = (
    ("./dataset/semgraph2/output_fast_train", output_fast_train),
    ("./dataset/semgraph2/output_fast_val", output_fast_val),
)
for dump_path, payload in _fast_dumps:
    with open(dump_path, "wb") as f:
        pickle.dump(payload, f)

In [10]:
import torch
import numpy as np

from dataset import SemgraphEdgeDataset
from dataset import SemgraphNodeDataset

from torch.utils.data import DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_data_cls(task):
    """Return the dataset class that handles ``task``.

    Args:
        task: ``'semgraph1'`` or ``'semgraph2'``.

    Returns:
        ``SemgraphNodeDataset`` for ``'semgraph1'`` and ``SemgraphEdgeDataset``
        for ``'semgraph2'``.

    Raises:
        ValueError: If ``task`` is not one of the supported names.
    """
    if task == 'semgraph1':
        return SemgraphNodeDataset
    if task == 'semgraph2':
        return SemgraphEdgeDataset
    # Fail loudly here rather than returning None and crashing later when the
    # caller tries to instantiate the result.
    raise ValueError(
        f"Unknown task {task!r}; expected 'semgraph1' or 'semgraph2'")


def generate_batch(batch):
    """Collate a list of ``(x, y)`` tensor pairs into two batched tensors.

    Each item's first and second tensors get a new leading dimension and are
    concatenated along it. Both results are moved to the module-level
    ``device``; the second one is additionally cast to ``torch.long``.

    Returns:
        A tuple ``(x, y)`` of the batched tensors.
    """
    inputs = [sample[0].unsqueeze(0) for sample in batch]
    targets = [sample[1].unsqueeze(0) for sample in batch]

    x = torch.cat(inputs, dim=0).to(device)
    y = torch.cat(targets, dim=0).to(device, dtype=torch.long)
    return x, y


def get_data_loader(task, dataset_cls, representations,
                    pca_size, mode, batch_size, shuffle,
                    pca=None, classes=None, words=None):
    """Instantiate ``dataset_cls`` and wrap it in a ``DataLoader``.

    The dataset is built from ``task``, ``representations``, ``pca_size`` and
    ``mode`` plus the optional ``pca`` / ``classes`` / ``words`` keywords. The
    loader uses ``generate_batch`` as its collate function.

    Returns:
        A tuple ``(dataloader, pca, classes, words)`` where the last three
        values are read back from the constructed dataset.
    """
    dataset_kwargs = {'pca': pca, 'classes': classes, 'words': words}
    data_set = dataset_cls(
        task, representations, pca_size, mode, **dataset_kwargs)

    loader_options = dict(
        batch_size=batch_size, shuffle=shuffle, collate_fn=generate_batch)
    return (DataLoader(data_set, **loader_options),
            data_set.pca, data_set.classes, data_set.words)

# Edge labels passed to the dataset classes as `classes` (`no_relation` is
# not part of this list).
# NOTE: "relation_2_concpet" is misspelled, but it matches the spelling in the
# validation label counts printed earlier in this notebook; keep it in sync
# with the data rather than correcting it here.
labels = ["concept_2_relation",
        "concept_2_modifier",
        "relation_2_concpet",
        "relation_2_modifier",
        "modifier_2_concept",
        "modifier_2_relation",
        "relation_2_relation"]

def get_data_loaders(task, representations, pca_size, batch_size):
    """Build the train and validation loaders for ``task``.

    The training loader is shuffled. The validation loader is not, and it
    reuses the ``pca`` and ``words`` values returned for the training split.
    Both splits receive a fresh ``np.array(labels)`` as ``classes``.

    Returns:
        ``(trainloader, devloader, n_classes, n_words)`` where the two counts
        are read from the validation dataset.
    """
    dataset_cls = get_data_cls(task)

    trainloader, train_pca, _, train_words = get_data_loader(
        task, dataset_cls, representations, pca_size, 'train',
        batch_size=batch_size, shuffle=True, classes=np.array(labels))

    devloader, _, _, _ = get_data_loader(
        task, dataset_cls, representations, pca_size, 'val',
        batch_size=batch_size, shuffle=False, pca=train_pca,
        classes=np.array(labels), words=train_words)

    dev_set = devloader.dataset
    return trainloader, devloader, dev_set.n_classes, dev_set.n_words

# Loaders for the "semgraph2" task with the "onehot" representation
# (pca_size=600, batch_size=64).
trainloader, devloader, n_classes, n_words = get_data_loaders("semgraph2", "onehot", 600, 64)

207104
268022


In [11]:
from model import MLP, TransparentDataParallel

def get_model(n_classes, n_words, representation="onehot"):
    """Build the MLP and move it to ``device``.

    Args:
        n_classes: Forwarded to ``MLP`` as ``n_classes``.
        n_words: Forwarded to ``MLP`` as ``n_words``.
        representation: Forwarded to ``MLP`` as ``representation``. Defaults
            to ``"onehot"``, the value that used to be hard-coded, so existing
            two-argument calls behave exactly as before. Pass the name of the
            representation the data loaders were built with.

    Returns:
        The model on ``device``, wrapped in ``TransparentDataParallel`` when
        more than one CUDA device is visible.
    """
    mlp = MLP(
        "semgraph_edge", embedding_size=600, n_classes=n_classes, hidden_size=128,
        nlayers=1, dropout=0.3, representation=representation, n_words=n_words)

    if torch.cuda.device_count() > 1:
        mlp = TransparentDataParallel(mlp)
    return mlp.to(device)
  
# Instantiate the model with the class / word counts returned by
# get_data_loaders().
model = get_model(n_classes, n_words)

In [12]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from train_info import TrainInfo

def train(trainloader, devloader, model, eval_batches, wait_iterations):
    """Run training epochs until the ``TrainInfo`` tracker reports ``finish``.

    An Adam optimizer over ``model.parameters()`` and a cross-entropy
    criterion (moved to ``device``) are created and handed to ``train_epoch``.
    Progress is tracked by a ``TrainInfo`` built from a tqdm bar,
    ``wait_iterations`` and ``eval_batches``. When the loop ends,
    ``model.recover_best()`` is called.
    """
    adam = optim.Adam(model.parameters())
    loss_fn = nn.CrossEntropyLoss().to(device=device)

    with tqdm(total=wait_iterations) as progress:
        tracker = TrainInfo(progress, wait_iterations, eval_batches)
        while True:
            if tracker.finish:
                break
            train_epoch(trainloader, devloader, model,
                        adam, loss_fn, tracker)

    model.recover_best()

In [13]:
def _evaluate(evalloader, model):
  #criterion = nn.CrossEntropyLoss().to(device=device)
  dev_loss, dev_acc = 0, 0
  for x, y in evalloader:
    loss, acc = model.eval_batch(x, y)
    dev_loss += loss
    dev_acc += acc

  n_instances = len(evalloader.dataset)
  return {
    'loss': dev_loss / n_instances,
    'acc': dev_acc / n_instances
  }

def evaluate(evalloader, model):
  """Evaluate ``model`` on ``evalloader`` with gradients disabled.

  The model is switched to eval mode for the duration of the call and put
  back into training mode afterwards, even if evaluation raises.

  Returns:
    The result of ``_evaluate(evalloader, model)``.
  """
  model.eval()
  try:
    with torch.no_grad():
      return _evaluate(evalloader, model)
  finally:
    # Without this, an exception during evaluation would leave the model
    # stuck in eval mode for whatever runs next.
    model.train()

def train_epoch(trainloader, devloader, model, optimizer, criterion, mode_train_info):
  """Run one pass over ``trainloader``, evaluating when the tracker asks for it.

  Every batch is trained with ``model.train_batch`` and its loss is reported
  to ``mode_train_info``. Whenever ``mode_train_info.eval`` is truthy the
  model is evaluated on ``devloader``: a best result triggers
  ``model.set_best()``; otherwise, if the tracker reports ``finish``, progress
  is printed and the epoch stops early. ``criterion`` is accepted but not used
  in this function.
  """
  for inputs, targets in trainloader:
    batch_loss = model.train_batch(inputs, targets, optimizer)
    mode_train_info.new_batch(batch_loss)

    if not mode_train_info.eval:
      continue

    dev_results = evaluate(devloader, model)
    improved = mode_train_info.is_best(dev_results)
    if improved:
      model.set_best()

    # `finish` is only consulted when this evaluation was not a new best.
    should_stop = (not improved) and mode_train_info.finish
    mode_train_info.print_progress(dev_results)
    if should_stop:
      return

In [14]:
def eval_all(model, trainloader, devloader):
  """Evaluate ``model`` on both loaders, print the metrics and return them.

  Returns:
    ``(train_results, dev_results)`` as produced by ``evaluate``.
  """
  train_results, dev_results = (
      evaluate(trainloader, model), evaluate(devloader, model))

  train_loss, train_acc = train_results['loss'], train_results['acc']
  test_loss, test_acc = dev_results['loss'], dev_results['acc']

  for line in (f'Final loss. Train: {train_loss} Dev: {test_loss}',
               f'Final acc. Train: {train_acc} Dev: {test_acc}'):
    print(line)
  return train_results, dev_results


def save_results(model, train_results, dev_results, results_fname):
  """Write the model's hyper-parameters and final metrics to a JSON file.

  The hyper-parameters are read from attributes of ``model``; the metrics come
  from the ``'loss'`` and ``'acc'`` entries of the two result dicts. The file
  at ``results_fname`` is (over)written with 4-space indented JSON.
  """
  hyperparams = ('n_classes', 'embedding_size', 'hidden_size',
                 'nlayers', 'dropout_p')
  results = {name: getattr(model, name) for name in hyperparams}

  # Metrics are appended after the hyper-parameters, keeping the key order.
  results['train_loss'] = train_results['loss']
  results['dev_loss'] = dev_results['loss']
  results['train_acc'] = train_results['acc']
  results['dev_acc'] = dev_results['acc']

  with open(results_fname, "w") as write_file:
    json.dump(results, write_file, indent=4)

def save_checkpoints(model, train_results, dev_results):
  """Save the model by calling ``model.save("checkpoints")``.

  ``train_results`` and ``dev_results`` are currently unused: writing them
  through ``save_results`` is disabled (see the commented-out lines below).
  """
  model.save("checkpoints")
  # Disabled: storing the metrics as results.json next to the checkpoint.
  #results_fname = "checkpoints" + '/results.json'
  #save_results(model, train_results, dev_results, results_fname)


In [15]:
# Train with eval_batches=100 and wait_iterations=2000 (both are interpreted
# by TrainInfo), then report the final train / dev metrics and save the model.
train(trainloader, devloader, model, 100, 2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints(model, train_results, dev_results)

Training loss: 0.7385 Dev loss: 6.4748 acc: 0.1565: 100%|██████████| 2100/2100 [01:08<00:00, 30.87it/s]


Final loss. Train: 1.2827091778643493 Dev: 3.612653796404231
Final acc. Train: 0.6804793477058411 Dev: 0.1587369740009308


In [7]:
# Rebuild the loaders with the "fasttext" representation and create a fresh
# model; this overwrites the globals used by the "onehot" run.
# NOTE(review): get_model() is called without a representation argument, so
# the MLP is still built with its "onehot" setting while the loaders use
# "fasttext" - confirm this is intended.
trainloader, devloader, n_classes, n_words = get_data_loaders("semgraph2", "fasttext", 600, 64)
model = get_model(n_classes, n_words)

In [8]:
# Same train / evaluate / save sequence as the earlier run, now on the
# loaders and model built with the "fasttext" representation.
# model.save("checkpoints") is called again with the same argument.
train(trainloader, devloader, model, 100, 2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints(model, train_results, dev_results)

Training loss: 0.7815 Dev loss: 6.2844 acc: 0.1579: 100%|██████████| 2100/2100 [01:06<00:00, 31.39it/s]


Final loss. Train: 1.2715235286575313 Dev: 3.858918787323612
Final acc. Train: 0.6799675226211548 Dev: 0.13734693825244904


In [None]:
def information_probe(loss, loss_clt, entropy):
  """Return the loss difference and that difference relative to ``entropy``.

  Returns:
    A tuple ``(loss_clt - loss, (loss_clt - loss) / entropy)``.
  """
  delta = loss_clt - loss
  normalized = delta / entropy
  return delta, normalized