In [1]:
import os 
import sys
import torch
import fasttext
import fasttext.util

In [2]:
def load_fasttext():
    """Load the English fastText model, fetching it first if it is not cached.

    The model is expected at ``../data/fasttext/cc.en.300.bin``. When that file
    is missing, ``fasttext.util.download_model`` is asked for the English model
    and the resulting file (plus its ``.gz`` archive, when one exists) is moved
    into the cache directory.

    Returns:
        The object returned by ``fasttext.load_model`` for the cached file.
    """
    ft_path = '../data/fasttext'
    ft_fname = os.path.join(ft_path, 'cc.en.300.bin')
    if not os.path.exists(ft_fname):
        print("Downloading fasttext model")
        # On a fresh checkout the cache directory may not exist yet, and
        # os.rename cannot move a file into a missing directory.
        os.makedirs(ft_path, exist_ok=True)
        temp_fname = fasttext.util.download_model(
            "en", if_exists='ignore')
        os.rename(temp_fname, ft_fname)
        # With if_exists='ignore' an already-extracted model may be reused
        # without a new download, in which case no archive sits next to it.
        temp_archive = temp_fname + '.gz'
        if os.path.exists(temp_archive):
            os.rename(temp_archive, ft_fname + '.gz')

    print("Loading fasttext model")
    return fasttext.load_model(ft_fname)

fasttext_model = load_fasttext()

Loading fasttext model




In [3]:
import json

task_name = "anaphora"


def read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            # Skip blank lines (e.g. extra newlines at EOF); json.loads
            # rejects empty input.
            if json_str.strip():
                records.append(json.loads(json_str))
    return records


train_data = read_jsonl(f'/content/tasks/data/{task_name}/train.jsonl')
print(train_data[0]['text'])

val_data = read_jsonl(f'/content/tasks/data/{task_name}/val.jsonl')
print(val_data[0]['text'])

The technician told the customer that he could pay with cash.

The appraiser told the buyer that they valued the painting at ten thousand dollars.



In [4]:
def flatten_list(_2d_list):
    """Flatten exactly one level of list nesting.

    Args:
        _2d_list: Iterable whose items are either plain lists or single values.

    Returns:
        A new list in which every plain-list item has been replaced by its
        elements; all other items (including tuples and deeper nesting) are
        carried over unchanged.
    """
    flattened = []
    for entry in _2d_list:
        # Exact-type check on purpose: only plain lists are unpacked.
        if type(entry) is not list:
            flattened.append(entry)
            continue
        flattened.extend(entry)
    return flattened

import pandas as pd

# Tabulate the validation records, then expand every record's 'targets'
# entry into one flat table of targets.
val_df = pd.DataFrame(val_data)
val_targets = flatten_list(val_df['targets'])
val_y_df = pd.DataFrame(val_targets)

# Label distribution of the validation targets (displayed as the cell output).
val_y_df['label'].value_counts()

unaligned    452
aligned      428
Name: label, dtype: int64

In [5]:
from math import log2

def entropy(classes):
    """Shannon entropy, in bits, of a distribution given as class counts.

    Args:
        classes: Iterable of non-negative counts (or weights), one per class.

    Returns:
        ``-sum(p * log2(p))`` over the classes, where ``p`` is each count
        divided by the total. Classes with a zero count contribute nothing
        (the usual ``0 * log 0 = 0`` convention), and an empty or all-zero
        input yields ``0.0``.
    """
    # Materialise once so a generator can be both summed and iterated.
    counts = list(classes)
    total = sum(counts)
    if total == 0:
        return 0.0

    weighted_log_sum = 0.0
    for count in counts:
        if count == 0:
            # log2(0) is undefined; an empty class adds no entropy.
            continue
        probability = count / total
        weighted_log_sum += probability * log2(probability)
    return -weighted_log_sum

# NOTE(review): three hard-coded class counts; the label distribution shown
# earlier in this notebook has two classes (452 / 428) — confirm these counts
# belong to this task.
data_entropy = entropy([423,49,32])

In [6]:
import string

def tokenize(text):
    """Split text into whitespace-separated tokens with punctuation removed.

    Args:
        text: The string to tokenise.

    Returns:
        A list of tokens; every character in ``string.punctuation`` is
        deleted before splitting on whitespace.
    """
    # str.translate returns a new string (str is immutable), so the result
    # has to be kept; calling it without assignment strips nothing.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    return stripped.split()

# Tokenise the 'text' field of every record; output order follows the input lists.
train_words = [tokenize(record['text']) for record in train_data]
val_words = [tokenize(record['text']) for record in val_data]

In [7]:
def get_fasttext(fasttext_model, words):
    """Look up every token of every sentence in ``fasttext_model``.

    Args:
        fasttext_model: Any object supporting ``fasttext_model[word]`` lookups.
        words: Iterable of sentences, each an iterable of tokens.

    Returns:
        A list with one inner list per sentence, holding the lookup result
        for each token in order.
    """
    embeddings = []
    for sentence in words:
        sentence_vectors = []
        for word in sentence:
            sentence_vectors.append(fasttext_model[word])
        embeddings.append(sentence_vectors)
    return embeddings

# Run the per-token model lookup over both tokenised splits.
train_fast_embeddings = get_fasttext(
    fasttext_model=fasttext_model,
    words=train_words,
)
val_fast_embeddings = get_fasttext(
    fasttext_model=fasttext_model,
    words=val_words,
)

In [8]:
# Pair each sentence's embeddings with its tokens. The pairs are materialised
# as lists because a bare zip is a one-shot iterator: anything that iterates
# it before it is saved (or re-running the save cell) would leave it exhausted.
# NOTE(review): consumers that only iterate over the loaded object are
# unaffected; confirm nothing downstream calls next() on it directly.
output_fast_train = list(zip(train_fast_embeddings, train_words))
output_fast_val = list(zip(val_fast_embeddings, val_words))

In [9]:
import pickle

# Persist both splits under ./dataset/<task_name>/, creating the folder if needed.
dump_targets = [
    (f"./dataset/{task_name}/output_fast_train", output_fast_train),
    (f"./dataset/{task_name}/output_fast_val", output_fast_val),
]
os.makedirs(f"./dataset/{task_name}/", exist_ok=True)
for dump_path, payload in dump_targets:
    with open(dump_path, "wb") as f:
        pickle.dump(payload, f)

In [11]:
from data_load import get_data_loaders
from trainer import get_model, save_checkpoints
from trainer import train, eval_all

# One-hot run: build the loaders and a model (nlayers=1) for the "onehot"
# setting, train, evaluate on both loaders, and save a checkpoint.
# NOTE(review): the positional arguments (600, 64) and (16, 2000) are defined
# by data_load / trainer, which are not visible in this notebook — confirm
# their meaning before changing them.
trainloader, devloader, n_classes, n_words = get_data_loaders("anaphora", "onehot", 600, 64)
model = get_model(n_classes, n_words, "onehot", nlayers=1)

train(trainloader, devloader, model, 16, 2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints("anaphora", "onehot", model, train_results, dev_results)

Training loss: 0.8938 Dev loss: 0.0163 acc: 0.4920: 100%|██████████| 2560/2560 [00:05<00:00, 472.02it/s]


Final loss. Train: 0.0148353585973382 Dev: 0.0158929955214262
Final acc. Train: 0.5860000252723694 Dev: 0.5136363506317139
I am a new code


In [2]:
# fastText run: same pipeline as the one-hot run, with loaders built for the
# "fasttext" setting and the checkpoint saved under "fasttext".
# This cell defines no imports of its own; it uses get_data_loaders, get_model,
# train, eval_all and save_checkpoints imported in the earlier training cell.
# NOTE(review): get_model is still passed "onehot" while the loaders and the
# checkpoint use "fasttext" — confirm this is intentional.
# NOTE(review): the meaning of the positional arguments (600, 64, 100, 2000)
# is defined in data_load / trainer, not visible here.
trainloader, devloader, n_classes, n_words = get_data_loaders("anaphora", "fasttext", 600, 64)
model = get_model(n_classes, n_words, "onehot")

train(trainloader, devloader, model, 100, 2000)
train_results, dev_results = eval_all(model, trainloader, devloader)
save_checkpoints("anaphora", "fasttext", model, train_results, dev_results)

Training loss: 0.0380 Dev loss: 0.0110 acc: 0.8772: 100%|██████████| 2300/2300 [00:03<00:00, 634.65it/s]


Final loss. Train: 0.0016896856250241399 Dev: 0.00753244711086154
Final acc. Train: 0.9789789915084839 Dev: 0.8532934188842773
I am a new code


In [None]:
def information_probe(loss, loss_clt, entropy):
    """Return the loss gap and the gap relative to ``entropy``.

    Args:
        loss: Value subtracted from ``loss_clt``.
        loss_clt: Value the gap is measured from.
        entropy: Divisor used to normalise the gap.

    Returns:
        A 2-tuple ``(gain, gain / entropy)`` where ``gain = loss_clt - loss``;
        both entries are rounded to three decimal places.
    """
    gain = loss_clt - loss
    normalised_gain = gain / entropy
    return tuple(round(value, 3) for value in (gain, normalised_gain))

In [None]:
# Each listed loss is scored against two fixed loss_clt values; one tuple of
# (first result, second result) is printed per loss.
probe_losses = [0.211, 0.230, 0.231, 0.217, 0.167]
for loss in probe_losses:
    info_pair = (
        information_probe(loss, 1.293, data_entropy),  # one-hot reference
        information_probe(loss, 1.264, data_entropy),  # fastText reference
    )
    print(info_pair)