In [1]:
import os 
import sys
import torch
import fasttext
import fasttext.util

In [2]:
def load_fasttext():
    """Load the English fastText model, downloading it on first use.

    The model is expected at ``../data/fasttext/cc.en.300.bin``. When that
    file is missing it is fetched with ``fasttext.util.download_model`` and
    moved into that directory, together with its ``.gz`` archive when one is
    present next to the downloaded file.

    Returns:
        The object returned by ``fasttext.load_model``.
    """
    ft_path = '../data/fasttext'
    ft_fname = os.path.join(ft_path, 'cc.en.300.bin')
    if not os.path.exists(ft_fname):
        print("Downloading fasttext model")
        # On a fresh checkout the target directory may not exist yet, in
        # which case os.rename would fail with FileNotFoundError.
        os.makedirs(ft_path, exist_ok=True)
        temp_fname = fasttext.util.download_model(
            "en", if_exists='ignore')
        os.rename(temp_fname, ft_fname)
        # The archive is not guaranteed to be there (e.g. an earlier
        # download was reused and its .gz already removed), so only move it
        # when it actually exists.
        temp_archive = temp_fname + '.gz'
        if os.path.exists(temp_archive):
            os.rename(temp_archive, ft_fname + '.gz')

    print("Loading fasttext model")
    return fasttext.load_model(ft_fname)

fasttext_model = load_fasttext()

Loading fasttext model




In [3]:
import json

def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one decoded object per line."""
    records = []
    with open(path, 'r', encoding='utf-8') as json_file:
        # Stream the file line by line instead of materialising it first.
        for json_str in json_file:
            # Blank lines (e.g. a trailing newline at EOF) are not valid JSON.
            if json_str.strip():
                records.append(json.loads(json_str))
    return records


train_data = _read_jsonl('/content/tasks/data/sentiment/train.jsonl')
print(train_data[0]['text'])

val_data = _read_jsonl('/content/tasks/data/sentiment/val.jsonl')
print(val_data[0]['text'])

When asked about the product, Eniyah said, 'I had absolutely no problem with this headset linking to my 8530 Blackberry Curve, and This movie was kind of long in length, but I enjoyed every minute of it.'. Eniyah liked the product . 
When asked about the product, Emiliano said, 'The reception is excellent, and As an earlier review noted, plug in this charger and nothing happens.'. Emiliano liked the product . 


In [11]:
def flatten_list(_2d_list):
    """Flatten one level of nesting.

    Elements whose exact type is ``list`` are expanded in place; every other
    element (tuples and list subclasses included) is kept as a single item.
    """
    flattened = []
    for entry in _2d_list:
        if type(entry) is list:
            flattened.extend(entry)
        else:
            flattened.append(entry)
    return flattened

import pandas as pd

# Count how often each label occurs in the validation split.
val_df = pd.DataFrame(val_data)
# Collapse the per-example 'targets' entries into one flat list
# (presumably each entry is a list of dicts with a 'label' key -- TODO confirm).
val_targets = flatten_list(val_df['targets'])
val_y_df = pd.DataFrame(val_targets)

val_y_df['label'].value_counts()

unaligned    1200
aligned      1200
Name: label, dtype: int64

In [None]:
from math import log2

def entropy(classes):
  """Shannon entropy, in bits, of a distribution given as class counts.

  Args:
    classes: iterable of counts, one per class.

  Returns:
    ``-sum(p * log2(p))`` with ``p = count / total``. Classes with a zero
    count contribute nothing (``0 * log 0`` is taken as 0), and an empty or
    all-zero input yields ``0.0``.
  """
  # Drop zero counts up front: log2(0) would raise ValueError.
  counts = [count for count in classes if count != 0]
  total = sum(counts)
  if total == 0:
    return 0.0

  weighted_log_sum = 0.0
  for count in counts:
    p = count / total
    weighted_log_sum += p * log2(p)
  return -weighted_log_sum

data_entropy = entropy([423, 49, 32])

In [4]:
import string

def tokenize(text):
  """Split `text` into whitespace-separated tokens after removing punctuation.

  Every character listed in ``string.punctuation`` is deleted before the
  text is split on whitespace.

  Args:
    text: the string to tokenize.

  Returns:
    A list of token strings (empty for empty or punctuation-only input).
  """
  # str.translate returns a new string (str is immutable), so its result has
  # to be used; calling it and discarding the value leaves `text` unchanged.
  without_punctuation = text.translate(
      str.maketrans('', '', string.punctuation))
  return without_punctuation.split()

# One token list per example, built from each example's 'text' field.
train_words = [tokenize(sample['text']) for sample in train_data]
val_words = [tokenize(sample['text']) for sample in val_data]

In [5]:
def get_fasttext(fasttext_model, words):
    """Look up every token of every sentence in `fasttext_model`.

    Args:
        fasttext_model: object indexable by token (``fasttext_model[token]``).
        words: iterable of sentences, each an iterable of tokens.

    Returns:
        A list with one inner list per sentence, holding the looked-up value
        of each token in order.
    """
    embeddings = []
    for sentence in words:
        sentence_vectors = []
        for token in sentence:
            sentence_vectors.append(fasttext_model[token])
        embeddings.append(sentence_vectors)
    return embeddings

# Look up the fastText vectors for the tokens of both splits.
train_fast_embeddings = get_fasttext(fasttext_model=fasttext_model,
                                     words=train_words)
val_fast_embeddings = get_fasttext(fasttext_model=fasttext_model,
                                   words=val_words)

In [6]:
# Pair each sentence's embeddings with its tokens. The pairs are materialised
# as lists because a bare zip object is a single-use iterator: once anything
# has iterated over it, it is empty, and pickling it stores that iterator
# state rather than a reusable collection.
output_fast_train = list(zip(train_fast_embeddings, train_words))
output_fast_val = list(zip(val_fast_embeddings, val_words))

In [7]:
import pickle

# Pickle the paired (embeddings, words) output of each split to disk.
for _path, _payload in (
        ("./dataset/sentiment/output_fast_train", output_fast_train),
        ("./dataset/sentiment/output_fast_val", output_fast_val)):
    with open(_path, "wb") as f:
        pickle.dump(_payload, f)

In [1]:
import torch
import numpy as np

from dataset import SemgraphEdgeDataset
from dataset import MonotonicityDataset
from dataset import ContradictionDataset

from torch.utils.data import DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def get_data_cls(task):
    """Return the dataset class used for `task`.

    Args:
        task: one of 'contradiction', 'sentiment', 'monotonicity' or
            'semgraph2'.

    Returns:
        ContradictionDataset for 'contradiction' and 'sentiment',
        MonotonicityDataset for 'monotonicity', SemgraphEdgeDataset for
        'semgraph2'.

    Raises:
        ValueError: if `task` is none of the names above. Falling through
            and returning None would only surface later as an opaque
            "'NoneType' object is not callable" when the class is used.
    """
    if task in ('contradiction', 'sentiment'):
        return ContradictionDataset
    if task == 'monotonicity':
        return MonotonicityDataset
    if task == 'semgraph2':
        return SemgraphEdgeDataset
    raise ValueError(
        f"Unknown task {task!r}; expected one of 'contradiction', "
        "'sentiment', 'monotonicity', 'semgraph2'")


def generate_batch(batch):
    """Collate a list of ``(x, y)`` items into one batch on `device`.

    Each item's first and second element gets a new leading dimension and
    the results are concatenated along it. The targets are additionally
    cast to ``torch.long``.

    Returns:
        A tuple ``(x, y)`` of the batched inputs and targets.
    """
    inputs = [sample[0].unsqueeze(0) for sample in batch]
    targets = [sample[1].unsqueeze(0) for sample in batch]

    x = torch.cat(inputs, dim=0)
    y = torch.cat(targets, dim=0)

    return (x.to(device), y.to(device, dtype=torch.long))


def get_data_loader(task, dataset_cls, representations,
                    pca_size, mode, batch_size, shuffle,
                    pca=None, classes=None, words=None):
    """Instantiate `dataset_cls` and wrap the dataset in a DataLoader.

    The dataset is built from `task`, `representations`, `pca_size` and
    `mode`, with `pca`, `classes` and `words` forwarded as keyword
    arguments. Batches are collated with `generate_batch`.

    Returns:
        A tuple ``(dataloader, dataset.pca, dataset.classes, dataset.words)``.
    """
    dataset = dataset_cls(
        task, representations, pca_size, mode,
        pca=pca, classes=classes, words=words)
    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle,
        collate_fn=generate_batch)
    return loader, dataset.pca, dataset.classes, dataset.words

# Label names, presumably for the semgraph edge task -- TODO confirm.
# NOTE(review): "relation_2_concpet" looks like a typo for
# "relation_2_concept"; it is left unchanged because the string may have to
# match labels stored in the data.
semgraph_labels = [
    "concept_2_relation",
    "concept_2_modifier",
    "relation_2_concpet",
    "relation_2_modifier",
    "modifier_2_concept",
    "modifier_2_relation",
    "relation_2_relation",
]

def get_data_loaders(task, representations, pca_size, batch_size, labels):
    """Create the train and validation loaders for `task`.

    The train loader is built first (shuffled); the `pca`, `classes` and
    `words` objects it returns are then passed on when building the
    validation loader (not shuffled).

    Args:
        task: task name, resolved to a dataset class via `get_data_cls`.
        representations: forwarded to the dataset class.
        pca_size: forwarded to the dataset class.
        batch_size: batch size of both loaders.
        labels: not used by this function.

    Returns:
        ``(trainloader, devloader, n_classes, n_words)`` where the last two
        are read from the validation dataset.
    """
    dataset_cls = get_data_cls(task)

    trainloader, pca, train_classes, train_words = get_data_loader(
        task, dataset_cls, representations, pca_size, 'train',
        batch_size=batch_size, shuffle=True)

    devloader = get_data_loader(
        task, dataset_cls, representations, pca_size, 'val',
        batch_size=batch_size, shuffle=False,
        pca=pca, classes=train_classes, words=train_words)[0]

    dev_set = devloader.dataset
    return trainloader, devloader, dev_set.n_classes, dev_set.n_words

In [2]:
from model import Classifier, TransparentDataParallel

def get_model(n_classes, n_words, embed):
    """Build a `Classifier` and move it to `device`.

    The classifier is created with a fixed configuration (embedding size
    600, hidden size 128, one layer, dropout 0.5); `embed` is passed as its
    `representation`. When more than one CUDA device is visible the model
    is wrapped in `TransparentDataParallel`.

    NOTE(review): the first positional argument is always "semgraph_edge",
    whatever task the data loaders were built for -- confirm this is intended.
    """
    classifier = Classifier(
        "semgraph_edge",
        embedding_size=600,
        n_classes=n_classes,
        hidden_size=128,
        nlayers=1,
        dropout=0.5,
        representation=embed,
        n_words=n_words,
    )

    has_multiple_gpus = torch.cuda.device_count() > 1
    if has_multiple_gpus:
        classifier = TransparentDataParallel(classifier)
    return classifier.to(device)

In [3]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from train_info import TrainInfo

def train(trainloader, devloader, model, eval_batches, wait_iterations):
    """Run `train_epoch` repeatedly until the `TrainInfo` tracker reports finish.

    An Adam optimizer over ``model.parameters()`` and a cross-entropy
    criterion are created here and handed to `train_epoch`. After the loop
    ends, ``model.recover_best()`` is called.

    Args:
        trainloader: loader iterated by `train_epoch`.
        devloader: loader passed on to `train_epoch`.
        model: the model to train.
        eval_batches: forwarded to `TrainInfo`.
        wait_iterations: forwarded to `TrainInfo`; also the progress-bar total.
    """
    adam = optim.Adam(model.parameters())
    loss_fn = nn.CrossEntropyLoss().to(device=device)

    with tqdm(total=wait_iterations) as progress_bar:
        train_info = TrainInfo(progress_bar, wait_iterations, eval_batches)
        while not train_info.finish:
            train_epoch(trainloader, devloader, model,
                        adam, loss_fn, train_info)

    model.recover_best()

In [4]:
def _evaluate(evalloader, model):
  """Sum loss and accuracy over `evalloader` and divide by the dataset size.

  ``model.eval_batch(x, y)`` is called for every batch; its two return
  values are accumulated and divided by ``len(evalloader.dataset)``.
  NOTE(review): this is a per-instance average only if eval_batch returns
  per-batch sums -- confirm against the model implementation.

  Returns:
    A dict with the keys 'loss' and 'acc'.
  """
  dev_loss, dev_acc = 0, 0
  for x, y in evalloader:
    loss, acc = model.eval_batch(x, y)
    dev_loss += loss
    dev_acc += acc

  n_instances = len(evalloader.dataset)
  return {
    'loss': dev_loss / n_instances,
    'acc': dev_acc / n_instances
  }

def evaluate(evalloader, model):
  """Evaluate `model` on `evalloader` in eval mode with gradients disabled.

  The model is put back into training mode afterwards, including when the
  evaluation raises, so a failed or interrupted evaluation cannot leave the
  model in eval mode for later training steps.

  Returns:
    The dict produced by `_evaluate` (keys 'loss' and 'acc').
  """
  model.eval()
  try:
    with torch.no_grad():
      return _evaluate(evalloader, model)
  finally:
    model.train()

def train_epoch(trainloader, devloader, model, optimizer, criterion, mode_train_info):
  """Run one pass over `trainloader`, evaluating whenever the tracker asks.

  Every batch is trained with ``model.train_batch`` and its loss is reported
  to `mode_train_info`. When ``mode_train_info.eval`` is set, the model is
  evaluated on `devloader`: a new best result triggers ``model.set_best()``;
  otherwise, if the tracker reports finish, the epoch ends early. Progress
  is printed after every evaluation. `criterion` is not used here.
  """
  for x, y in trainloader:
    batch_loss = model.train_batch(x, y, optimizer)
    mode_train_info.new_batch(batch_loss)

    if not mode_train_info.eval:
      continue

    dev_results = evaluate(devloader, model)

    improved = mode_train_info.is_best(dev_results)
    if improved:
      model.set_best()
    # `finish` is only consulted when this evaluation was not a new best.
    stop_now = (not improved) and mode_train_info.finish

    mode_train_info.print_progress(dev_results)
    if stop_now:
      return

In [5]:
import json

def eval_all(model, trainloader, devloader):
  """Evaluate `model` on both loaders, print the metrics and return them.

  Returns:
    ``(train_results, dev_results)``, the two dicts returned by `evaluate`.
  """
  train_results = evaluate(trainloader, model)
  dev_results = evaluate(devloader, model)

  losses = (train_results['loss'], dev_results['loss'])
  accuracies = (train_results['acc'], dev_results['acc'])

  print(f'Final loss. Train: {losses[0]} Dev: {losses[1]}')
  print(f'Final acc. Train: {accuracies[0]} Dev: {accuracies[1]}')
  return train_results, dev_results


def save_results(model, train_results, dev_results, results_fname):
  """Write the model hyper-parameters and final metrics to a JSON file.

  The hyper-parameters are read from attributes of `model`; losses are
  stored as given, and the accuracies are converted with
  ``.cpu().numpy().tolist()`` before being written.

  Args:
    model: object exposing n_classes, embedding_size, hidden_size, nlayers
      and dropout_p.
    train_results: dict with the keys 'loss' and 'acc'.
    dev_results: dict with the keys 'loss' and 'acc'.
    results_fname: path of the JSON file to write.
  """
  def _to_plain(acc):
    # Turn the accuracy object into plain Python data for json.dump.
    return acc.cpu().numpy().tolist()

  results = dict(
      n_classes=model.n_classes,
      embedding_size=model.embedding_size,
      hidden_size=model.hidden_size,
      nlayers=model.nlayers,
      dropout_p=model.dropout_p,
      train_loss=train_results['loss'],
      dev_loss=dev_results['loss'],
      train_acc=_to_plain(train_results['acc']),
      dev_acc=_to_plain(dev_results['acc']),
  )
  with open(results_fname, "w") as out_file:
    json.dump(results, out_file, indent=4)

def save_checkpoints(task_name, emb_name, model, train_results, dev_results):
  """Save the model and its results under ``checkpoints/<task>_<emb>``.

  The directory is created if needed, ``model.save`` is called with it, and
  the metrics are written to ``results.json`` inside it via `save_results`.
  """
  checkpoint_dir = f"checkpoints/{task_name}_{emb_name}"
  os.makedirs(checkpoint_dir, exist_ok=True)
  model.save(checkpoint_dir)
  save_results(model, train_results, dev_results,
               f"{checkpoint_dir}/results.json")


In [6]:

# Label lists for the individual tasks.
contradiction_labels = ["negative", "contradict", "antonym"]
mono_labels = ["+", "-", "="]
sentiment_labels = ["aligned", "unaligned", "contradict"]

# Loaders and model for the sentiment task with "onehot" representations.
trainloader, devloader, n_classes, n_words = get_data_loaders(
    task="sentiment", representations="onehot", pca_size=600,
    batch_size=64, labels=sentiment_labels)
model = get_model(n_classes, n_words, embed="onehot")

In [17]:
import os

# Train, then print the final metrics and save model + results to disk.
train(trainloader, devloader, model, eval_batches=16, wait_iterations=2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints(
    task_name="sentiment", emb_name="onehot", model=model,
    train_results=train_results, dev_results=dev_results)

Training loss: 0.0000 Dev loss: 0.5092 acc: 1.0000: 100%|██████████| 7712/7712 [00:09<00:00, 795.88it/s]


Final loss. Train: 0.0 Dev: 0.44262760877609253
Final acc. Train: 1.0 Dev: 1.0


In [7]:
# Loaders and model for the sentiment task with "fasttext" representations.
# NOTE(review): the loaders are built from "fasttext" representations, but the
# model is still created with embed "onehot". This looks like a copy-paste
# leftover from the one-hot cell -- confirm whether "fasttext" was intended.
trainloader, devloader, n_classes, n_words = get_data_loaders("sentiment", "fasttext", 600, 64, sentiment_labels)
model = get_model(n_classes, n_words, "onehot")

In [9]:
import os
# Train, then print the final metrics and save model + results to disk.
train(trainloader, devloader, model, eval_batches=100, wait_iterations=2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints(
    task_name="sentiment", emb_name="fasttext", model=model,
    train_results=train_results, dev_results=dev_results)

Training loss: 1.2908 Dev loss: 1.6307 acc: 0.4492: 100%|██████████| 2100/2100 [00:03<00:00, 657.08it/s]


Final loss. Train: 1.4222075309753417 Dev: 1.5225190416971843
Final acc. Train: 0.5265000462532043 Dev: 0.4933333396911621


In [None]:
def information_probe(loss, loss_clt, entropy):
  """Return the loss difference and its ratio to `entropy`.

  Args:
    loss: first loss value.
    loss_clt: second loss value, from which `loss` is subtracted.
    entropy: value the difference is divided by.

  Returns:
    ``(gain, gain / entropy)`` with ``gain = loss_clt - loss``, both rounded
    to three decimals.
  """
  difference = loss_clt - loss
  normalised = difference / entropy
  return round(difference, 3), round(normalised, 3)

In [None]:
# Compare each loss against two fixed reference losses (1.293 and 1.264,
# presumably the one-hot and fastText control losses -- TODO confirm).
for loss in (0.211, 0.230, 0.231, 0.217, 0.167):
  onhot_info, fasttext_info = (
      information_probe(loss, reference_loss, data_entropy)
      for reference_loss in (1.293, 1.264))
  print((onhot_info, fasttext_info))

In [None]:
import util
# Read the file ./dataset/contradiction/output_fast_train via util.read_data.
data_embeddings = util.read_data("./dataset/contradiction/output_fast_train")

In [None]:
# Print the shape of the first embedding of the first entry, then stop.
for first_entry in data_embeddings:
  sentence_emb, _ = first_entry
  print(sentence_emb[0].shape)
  break