In [1]:
from io import open

# !!!NEED TO DELETE!!!! JUST FOR DEV!!!!
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import time, random, numpy as np, argparse, sys, re, os
from types import SimpleNamespace, new_class

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from bert import BertModel
from optimizer import AdamW
from tqdm import trange

import logging

from PCGrad import PCGrad
from datasets import SentenceClassificationDataset, SentencePairDataset, \
    load_multitask_data, load_multitask_test_data



from evaluation import model_eval_sst, test_model_multitask


In [2]:
# Notebook-wide switch intended for tqdm's `disable=` argument:
# True would silence progress bars, False (the default here) shows them.
TQDM_DISABLE = False

In [3]:
# fix the random seed
def seed_everything(seed=11711):
    """Seed every random number generator used here and pin cuDNN behaviour.

    Args:
        seed: Integer seed handed to Python's `random`, NumPy, and PyTorch
            (CPU generator, current CUDA device, and all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [4]:
BERT_HIDDEN_SIZE = 768
N_SENTIMENT_CLASSES = 5


class MultitaskBERT(nn.Module):
    '''
    Shared BERT encoder with one linear head per task:

    - Sentiment classification (predict_sentiment)
    - Paraphrase detection (predict_paraphrase)
    - Semantic Textual Similarity (predict_similarity)
    '''
    def __init__(self, config):
        '''Load `bert-base-uncased` and create the three task heads.

        Args:
            config: Object with an `option` attribute. 'pretrain' freezes the
                BERT parameters, 'finetune' makes them trainable; any other
                value leaves their `requires_grad` flags untouched.
        '''
        super(MultitaskBERT, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Decide once whether the encoder is trainable instead of re-checking
        # the option for every parameter.
        if config.option in ('pretrain', 'finetune'):
            train_encoder = config.option == 'finetune'
            for param in self.bert.parameters():
                param.requires_grad = train_encoder

        # NOTE(review): `config.hidden_dropout_prob` is not used by this
        # module; no dropout is applied in front of the heads.
        hidden_size = self.bert.config.hidden_size
        self.sentiment_classifier = nn.Linear(hidden_size, N_SENTIMENT_CLASSES)
        # Pair tasks consume the concatenation of two sentence embeddings.
        self.paraphrase_classifier = nn.Linear(hidden_size * 2, 2)
        self.similarity_classifier = nn.Linear(hidden_size * 2, 1)


    def forward(self, input_ids, attention_mask):
        '''Takes a batch of sentences and produces embeddings for them.

        Returns the `pooler_output` entry of the BERT output dictionary.
        '''
        bert_outputs = self.bert(input_ids, attention_mask)
        return bert_outputs['pooler_output']


    def _encode_pair(self,
                     input_ids_1, attention_mask_1,
                     input_ids_2, attention_mask_2):
        '''Embed both sentences of each pair and concatenate along the feature axis.'''
        embeddings_1 = self.forward(input_ids_1, attention_mask_1)
        embeddings_2 = self.forward(input_ids_2, attention_mask_2)
        return torch.cat((embeddings_1, embeddings_2), dim=1)


    def predict_sentiment(self, input_ids, attention_mask):
        '''Given a batch of sentences, outputs logits for classifying sentiment.
        There are 5 sentiment classes:
        (0 - negative, 1- somewhat negative, 2- neutral, 3- somewhat positive, 4- positive)
        Thus, the output contains N_SENTIMENT_CLASSES (5) logits for each sentence.
        '''
        embeddings = self.forward(input_ids, attention_mask)
        return self.sentiment_classifier(embeddings)


    def predict_paraphrase(self,
                           input_ids_1, attention_mask_1,
                           input_ids_2, attention_mask_2):
        '''Given a batch of pairs of sentences, outputs paraphrase logits.

        The head produces TWO unnormalized logits per pair (one per class),
        which is what the cross-entropy loss in the training loop expects.
        NOTE(review): evaluation code that applies a sigmoid to a single logit
        would not match this output width - confirm against the evaluator.
        '''
        pair_embeddings = self._encode_pair(input_ids_1, attention_mask_1,
                                            input_ids_2, attention_mask_2)
        return self.paraphrase_classifier(pair_embeddings)


    def predict_similarity(self,
                           input_ids_1, attention_mask_1,
                           input_ids_2, attention_mask_2):
        '''Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
        The output is unnormalized (a logit); callers are expected to squash it
        (the training loop applies a sigmoid and rescales to the 0-5 range).
        '''
        pair_embeddings = self._encode_pair(input_ids_1, attention_mask_1,
                                            input_ids_2, attention_mask_2)
        return self.similarity_classifier(pair_embeddings)
    
def save_model(model, optimizer, args, config, filepath):
    """Write a training checkpoint to `filepath` with `torch.save`.

    The checkpoint bundles the model and optimizer state dicts, the run
    arguments, the model config, and the current Python / NumPy / PyTorch
    random-generator states.

    Args:
        model: Module whose `state_dict()` is stored under 'model'.
        optimizer: Optimizer whose `state_dict()` is stored under 'optim'.
        args: Run arguments, stored as-is under 'args'.
        config: Model configuration, stored as-is under 'model_config'.
        filepath: Destination handed to `torch.save`.
    """
    rng_states = dict(
        system_rng=random.getstate(),
        numpy_rng=np.random.get_state(),
        torch_rng=torch.random.get_rng_state(),
    )
    checkpoint = dict(
        model=model.state_dict(),
        optim=optimizer.state_dict(),
        args=args,
        model_config=config,
        **rng_states,
    )

    torch.save(checkpoint, filepath)
    print(f"save the model to {filepath}")
    

In [5]:
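## Trains all three tasks jointly (SST sentiment, Quora paraphrase, STS similarity).
# The per-task losses are back-propagated through PCGrad (gradient surgery); their plain sum
# (LossFunc1 + LF2 + LF3) is only used for reporting. Only SST accuracy is evaluated each epoch.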
def train_multitask(args):
    """Jointly train MultitaskBERT on SST, paraphrase and STS using PCGrad.

    Each step draws one batch per task, computes the three task losses and
    lets PCGrad combine their gradients before the AdamW update. After every
    epoch SST accuracy is evaluated on the train and dev loaders and the model
    is checkpointed whenever dev accuracy improves.

    Args:
        args: Dict of run settings. Keys read here: 'use_gpu', 'sst_train',
            'para_train', 'sts_train', 'sst_dev', 'para_dev', 'sts_dev',
            'batch_size', 'hidden_dropout_prob', 'option', 'lr', 'epochs'
            and 'filepath' (checkpoint destination).
    """
    device = torch.device('cuda') if args["use_gpu"] else torch.device('cpu')
    batch_size = args["batch_size"]

    def make_loader(dataset, shuffle):
        # All loaders share the batch size and use the dataset's own collate_fn.
        return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size,
                          collate_fn=dataset.collate_fn)

    # Load data
    sst_train_data, num_labels, para_train_data, sts_train_data = load_multitask_data(
        args["sst_train"], args["para_train"], args["sts_train"], split='train')
    # NOTE(review): the dev files are also loaded with split='train' (kept from
    # the original); confirm load_multitask_data treats that as "labelled data".
    sst_dev_data, num_labels, para_dev_data, sts_dev_data = load_multitask_data(
        args["sst_dev"], args["para_dev"], args["sts_dev"], split='train')

    sst_train_data = SentenceClassificationDataset(sst_train_data, args)
    sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
    para_train_data = SentencePairDataset(para_train_data, args)
    sts_train_data = SentencePairDataset(sts_train_data, args)

    sst_train_dataloader = make_loader(sst_train_data, shuffle=True)
    sst_dev_dataloader = make_loader(sst_dev_data, shuffle=False)
    para_train_dataloader = make_loader(para_train_data, shuffle=True)
    sts_train_dataloader = make_loader(sts_train_data, shuffle=True)
    # Only SST is evaluated below, so no paraphrase/STS dev loaders are built.

    # Init model
    config = SimpleNamespace(
        hidden_dropout_prob=args["hidden_dropout_prob"],
        num_labels=num_labels,
        hidden_size=768,
        data_dir='.',
        option=args["option"],
    )

    model = MultitaskBERT(config)
    model = model.to(device)

    # Experiment log kept from earlier runs (scores as recorded by the author):
    # hyperparams: lr, betas, eps, weight_decay
    # 0.38/0.315/0.009 =>(default) AdamW(model.parameters(), lr=1e-05, weight_decay=0, betas=[0.9, 0.999], eps=1e-06, correct_bias=True)
    # 0.380/0.315/0.009=> optimizer = AdamW(model.parameters(), lr=1e-05, weight_decay=0.95, betas=[0.9, 0.999], eps=1e-8, correct_bias=True)
    # 0.380/0.316/0.009 (pre-train)=> optimizer = AdamW(model.parameters(), lr=2e-05, weight_decay=0.95, betas=[0.9, 0.999], eps=1e-8)
    # 0.380/0.322/0.009 =>(half the weighted decay) optimizer = AdamW(model.parameters(), lr=2e-05, weight_decay=0.5, betas=[0.9, 0.999], eps=1e-08)
    # .../0.358/... => lr=1e-04,wd=0.9
    # 0.367 wd0.05
    # 0.322 2e-05/0.05
    # 0.316 2e-05/0.95
    # 0.4/50.38 => 4e-04/0.05 || multitask learning => 0.388 0.000 0.267
    # Para 0/sst 0.317/sts 0.132 <= multi-task 2e-05 0.05 1e-08
    #
    # The learning rate now comes from args["lr"] so that the value used for
    # training matches the one encoded in the checkpoint file name.
    optimizer = AdamW(model.parameters(), lr=args["lr"], weight_decay=0.05,
                      betas=[0.9, 0.999], eps=1e-06)
    pc_adam = PCGrad(optimizer)
    best_dev_acc = 0

    # Run for the specified number of epochs
    for epoch in trange(args['epochs'], disable=TQDM_DISABLE):
        model.train()
        train_loss = 0
        num_batches = 0
        # zip() stops at the shortest loader, so an epoch has as many steps as
        # the smallest task has batches.
        task_batches = zip(sst_train_dataloader, para_train_dataloader, sts_train_dataloader)
        for sst_batch, para_batch, sts_batch in task_batches:
            pc_adam.zero_grad()

            # SST task
            sst_ids = sst_batch['token_ids'].to(device)
            sst_mask = sst_batch['attention_mask'].to(device)
            sst_labels = sst_batch['labels'].to(device)
            sst_outputs = model.predict_sentiment(sst_ids, sst_mask)
            sst_loss = F.cross_entropy(sst_outputs, sst_labels.view(-1), reduction='sum') / batch_size

            # Paraphrase task
            para_id1 = para_batch['token_ids_1'].to(device)
            para_mask1 = para_batch['attention_mask_1'].to(device)
            para_id2 = para_batch['token_ids_2'].to(device)
            para_mask2 = para_batch['attention_mask_2'].to(device)
            para_labels = para_batch['labels'].to(device)
            para_outputs = model.predict_paraphrase(para_id1, para_mask1, para_id2, para_mask2)
            n_classes = 2
            assert para_outputs.size(-1) == n_classes, f"Output size should be {n_classes}, but got {para_outputs.size(-1)}"
            assert para_labels.max() < n_classes, f"Max label is {para_labels.max()}, but should be less than {n_classes}"
            assert para_labels.min() >= 0, f"Min label is {para_labels.min()}, but should be greater than or equal to 0"
            # 'mean' already averages over the batch; dividing by the batch
            # size again would shrink this loss relative to the SST loss.
            para_loss = F.cross_entropy(para_outputs, para_labels.view(-1), reduction='mean')

            # STS task
            sts_id1 = sts_batch['token_ids_1'].to(device)
            sts_mask1 = sts_batch['attention_mask_1'].to(device)
            sts_id2 = sts_batch['token_ids_2'].to(device)
            sts_mask2 = sts_batch['attention_mask_2'].to(device)
            sts_labels = sts_batch['labels'].to(device)
            sts_outputs = model.predict_similarity(sts_id1, sts_mask1, sts_id2, sts_mask2)
            sts_outputs = torch.sigmoid(sts_outputs) * 5  # Scale to 0-5
            logging.info("Computing loss")
            # reshape(-1) keeps a 1-D tensor even for a batch of one, where
            # squeeze() would collapse the prediction to a 0-d tensor.
            sts_loss = F.mse_loss(sts_outputs.reshape(-1), sts_labels.view(-1).float(), reduction='mean')

            # The summed loss is for reporting only; the backward pass goes
            # through PCGrad, which receives the task losses separately.
            total_loss = sst_loss + para_loss + sts_loss
            pc_adam.pc_backward([sst_loss, para_loss, sts_loss])
            optimizer.step()

            train_loss += total_loss.item()
            num_batches += 1

        # Guard against an empty epoch (e.g. one of the loaders has no batches).
        train_loss = train_loss / max(num_batches, 1)

        train_acc, train_f1, *_ = model_eval_sst(sst_train_dataloader, model, device)
        dev_acc, dev_f1, *_ = model_eval_sst(sst_dev_dataloader, model, device)

        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            save_model(model, optimizer, args, config, args["filepath"])

        print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")

In [6]:
def test_model(args):
    """Load the checkpoint at `args.filepath` and run the multitask test evaluation.

    Args:
        args: Run settings, either an attribute-style namespace or a plain
            dict such as the one returned by `get_args()`. `use_gpu` and
            `filepath` are read here; the whole object is forwarded to
            `test_model_multitask` in attribute-style form.
    """
    # The body uses attribute access, while the notebook builds `args` as a
    # dict; normalise so both call styles work.
    if isinstance(args, dict):
        args = SimpleNamespace(**args)

    with torch.no_grad():
        device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
        # NOTE(review): the checkpoint stores non-tensor objects (args, config,
        # RNG states); on PyTorch versions where torch.load defaults to
        # weights_only=True this call may need weights_only=False - confirm.
        saved = torch.load(args.filepath, map_location=device)
        config = saved['model_config']

        model = MultitaskBERT(config)
        model.load_state_dict(saved['model'])
        model = model.to(device)
        print(f"Loaded model to test from {args.filepath}")

        test_model_multitask(args, model, device)

def get_args():
    """Return the default run settings as a plain dict.

    The dict holds the train/dev/test CSV paths for the three tasks, the
    prediction output paths, and the training hyperparameters (seed, epochs,
    GPU flag, training option, batch size, dropout probability, learning rate).
    """
    # File-name stem of each task's dataset and the suffix used for each split.
    dataset_stem = {"sst": "ids-sst", "para": "quora", "sts": "sts"}
    split_suffix = {"train": "train", "dev": "dev", "test": "test-student"}

    args = {
        f"{task}_{split}": f"data/{stem}-{suffix}.csv"
        for task, stem in dataset_stem.items()
        for split, suffix in split_suffix.items()
    }
    args.update(seed=11711, epochs=10, use_gpu=False, option="pretrain")
    args.update({
        f"{task}_{split}_out": f"predictions/{task}-{split}-output.csv"
        for task in dataset_stem
        for split in ("dev", "test")
    })
    args.update(batch_size=8, hidden_dropout_prob=0.3, lr=2e-5)
    return args

In [8]:
# Build the run settings and show which training option is active.
args = get_args()
print(args['option'])

# Checkpoint name encodes the option, epoch count and learning rate.
args["filepath"] = "{option}-{epochs}-{lr}-multitask.pt".format(**args)  # save path

# Seed first for reproducibility, then train and evaluate the saved checkpoint.
seed_everything(args["seed"])
train_multitask(args)
test_model(args)

pretrain
Loaded 8544 train examples from data/ids-sst-train.csv
Loaded 141498 train examples from data/quora-train.csv
Loaded 6040 train examples from data/sts-train.csv
Loaded 1101 train examples from data/ids-sst-dev.csv
Loaded 20212 train examples from data/quora-dev.csv
Loaded 863 train examples from data/sts-dev.csv


  0%|          | 0/10 [11:01<?, ?it/s]


KeyboardInterrupt: 

In [15]:
import numpy as np

# Reference points with known coordinates, each paired with the distance from
# the unknown position to that point.
# Format: (x, y, distance)
ref_points = [(2, 2, 1.5), (5, 3, 2.8), (4, 7, 3.2)]

# Gauss-Newton algorithm for trilateration
def trilateration_gauss_newton(ref_points, max_iterations=100, threshold=1e-6,
                               initial_guess=(0.0, 0.0)):
    """Estimate a 2-D position from distances to known points (Gauss-Newton).

    Minimises the sum of squared residuals d_i - ||p - p_i|| over p = (x, y).

    Args:
        ref_points: Non-empty iterable of (x_i, y_i, d_i) triples.
        max_iterations: Maximum number of Gauss-Newton steps.
        threshold: Iteration stops once the step norm drops below this value.
        initial_guess: Starting (x, y) estimate; defaults to the origin.

    Returns:
        Tuple (x, y) of floats with the final estimate.

    Raises:
        ValueError: If `ref_points` is empty or its rows are not triples.
    """
    anchors = np.asarray(list(ref_points), dtype=float)
    if anchors.ndim != 2 or anchors.shape[0] == 0 or anchors.shape[1] != 3:
        raise ValueError("ref_points must be a non-empty sequence of (x, y, distance) triples")

    centers = anchors[:, :2]
    distances = anchors[:, 2]
    # Copy so the caller's initial_guess is never modified in place.
    estimate = np.array(initial_guess, dtype=float)

    for _ in range(max_iterations):
        offsets = estimate - centers
        ranges = np.linalg.norm(offsets, axis=1)
        residuals = distances - ranges

        # d(range_i)/d(x, y) = offset_i / range_i. When the estimate sits
        # exactly on a reference point the offset is zero, so dividing by 1
        # yields a zero row instead of the 0/0 = NaN that would poison the step.
        safe_ranges = np.where(ranges > 0.0, ranges, 1.0)
        jacobian = offsets / safe_ranges[:, None]

        # Least-squares solve of J @ step = residuals; equivalent to the
        # normal equations but avoids explicitly inverting J^T J and also
        # copes with rank-deficient geometry.
        step, *_ = np.linalg.lstsq(jacobian, residuals, rcond=None)
        estimate += step

        # Check convergence criteria
        if np.linalg.norm(step) < threshold:
            break

    return float(estimate[0]), float(estimate[1])

# Solve for the position using the function's default iteration limit and tolerance.
estimated_x, estimated_y = trilateration_gauss_newton(ref_points)

# Report the solution: a header line followed by one coordinate per line.
print(
    "Estimated coordinates:",
    f"x: {estimated_x}",
    f"y: {estimated_y}",
    sep="\n",
)


Estimated coordinates:
x: 2.410661827045464
y: 3.8124842360420828
