In [1]:
import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from dataloader import doc2dialDataset
import ast
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
# df = pd.read_csv('data/doc2dial/Train_dataset/DEFAULT/DEFAULT_withRefs.csv')

In [39]:
# df[:10].to_csv('data/doc2dial/TEST/DDP_Finetune.csv')

In [2]:
path = 'data/doc2dial/Train_dataset/DEFAULT/DEFAULT_withRefs.csv'

dataset = doc2dialDataset(path)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

In [3]:
b = next(iter(dataloader))

[{'id_sp': '6', 'tag': 'u', 'start_sp': 346, 'end_sp': 416, 'text_sp': 'you must report a change of address to DMV within ten days of moving. ', 'title': '1. Forgetting to Update Address', 'parent_titles': [], 'id_sec': '2', 'start_sec': 333, 'text_sec': 'By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. ', 'end_sec': 567}, {'id_sp': '7', 'tag': 'u', 'start_sp': 416, 'end_sp': 567, 'text_sp': 'That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. ', 'title': '1. Forgetting to Update Address', 'parent_titles': [], 'id_sec': '2', 'start_sec': 333, 'text_sec': 'By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well 

In [4]:
questions = b['question']
answers = b['answer']

In [5]:
refs= b['ref']
doc_ids = b['doc_id']
doc_ids[0]

'Top 5 DMV Mistakes and How to Avoid Them#3_0'

In [6]:
ast.literal_eval(refs[0])

[{'sp_id': '6', 'label': 'solution'}, {'sp_id': '7', 'label': 'solution'}]

In [7]:
refs_ID= [term['sp_id'] for term in ast.literal_eval(refs[0])]

In [8]:
doc_data = json.load(open('data/doc2dial/doc2dial_doc.json', 'r'))

doc_file_span = doc_data['doc_data']['dmv'][doc_ids[0]]['spans']
ll = [[doc_file_span[i] for i in l] for l in refs_ID]

true_ref_string = [[term['text_sp'] for term in _] for _ in ll]
concatenated_string = ' '.join(string for sublist in true_ref_string for string in sublist)

In [9]:
concatenated_string

'you must report a change of address to DMV within ten days of moving.  That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. '

In [5]:
questions

['Hello, I forgot o update my address, can you help me with that?',
 'Can I do my DMV transactions online?',
 'Thanks, and in case I forget to bring all of the documentation needed to the DMV office, what can I do?',
 'Ok, and can you tell me again where should I report my new address?']

In [2]:
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
from QAModel import format_prompt

In [3]:
from torch.utils.data.distributed import DistributedSampler

def prepare(rank, world_size, batch_size=2, pin_memory=False, num_workers=0):
    path = 'data/doc2dial/Train_dataset/DEFAULT/DEFAULT_withRefs.csv'
    dataset = doc2dialDataset(path)
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True, drop_last=False)
    
    dataloader = DataLoader(dataset, batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers, drop_last=False, shuffle=False, sampler=sampler)
    
    return dataloader

In [4]:
n_gpus = torch.cuda.device_count()
world_size = n_gpus
DDP_dataloader = prepare(rank=0, world_size=2, batch_size=2, pin_memory=False, num_workers=0)

In [5]:
DDP_dataloader.sampler.set_epoch(0)  

In [6]:
first_batch = next(iter(DDP_dataloader))

In [7]:
qs = first_batch['question']
qs

['And if I already sent it but a month has passed and I have not received it?',
 'No']

In [8]:
DDP_dataloader.sampler.set_epoch(2)  
first_batch = next(iter(DDP_dataloader))
qs = first_batch['question']
refs= first_batch['ref']
refs

["[{'sp_id': '17', 'label': 'precondition'}]",
 "[{'sp_id': '18', 'label': 'solution'}]"]

In [11]:
first_batch

{'question': ['need to know more about this complaint that I got',
  'For example If I reside outside New York City, will I pay at both place?'],
 'answer': ['Has a complaint been filed against you?',
  'No. If you reside outside New York City and you are exempt from the vehicle use tax in the county where you reside or have a business'],
 'ref': ["[{'sp_id': '17', 'label': 'precondition'}]",
  "[{'sp_id': '18', 'label': 'solution'}]"],
 'retrived_doc': ['Complaint filed against you or your business',
  'Registration fee chart for passenger vehicles \nThe chart shows the 2 year fee by vehicle weight. \n\nCounty use taxes and supplemental fees \nResidents of New York City and several counties in New York State must pay a vehicle use tax when they register a passenger vehicle or renew a passenger vehicle registration. If you reside outside New York City and you are exempt from the vehicle use tax in the county where you reside or have a business , complete form UT-11C.'],
 'doc_id': ['Ab

In [10]:
ref_strings = first_batch['ref_string']
ref_strings

KeyError: 'ref_string'

In [33]:
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()


def demo_parallel(rank, world_size):
    print(f"Running DDP with model parallel example on rank {rank}.")
    setup(rank, world_size)

    DDP_dataloader = prepare(rank, world_size, batch_size=2, pin_memory=False, num_workers=0)
    first_batch = next(iter(DDP_dataloader))

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    model = model.to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    for epoch in epochs:
        #encoding inputs
        dataloader.sampler.set_epoch(epoch)

        for step, x in enumerate(epoch):

            qs = first_batch['question']
            answers = first_batch['answer']
            refs = first_batch['ref'] # ?

            temps = format_prompt(qs, refs)
            
            encoding = tokenizer(temps, 
                                return_tensors="pt", 
                                padding='longest', 
                                max_length=1024, 
                                truncation=True)
            
            input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
            #encoding targets
            target_encoding = tokenizer(answers,
                                        return_tensors="pt",
                                        padding='longest',
                                        max_length=1024,
                                        truncation=True)
            labels = target_encoding.input_ids

            loss_fn = nn.MSELoss()
            optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

            optimizer.zero_grad()

            outputs = ddp_model(input_ids, attention_mask)

            loss_fn(outputs, labels).backward()
            optimizer.step()

    cleanup()

def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)

In [36]:
n_gpus = torch.cuda.device_count()
assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
world_size = n_gpus
run_demo(demo_parallel, world_size)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/ygong/miniconda3/envs/atlas-env/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/ygong/miniconda3/envs/atlas-env/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'demo_parallel' on <module '__main__' (built-in)>


ProcessExitedException: process 1 terminated with exit code 1

In [2]:
from accelerate import init_empty_weights
from torch import nn

# with init_empty_weights():
    # model = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])

In [2]:
AUTOAIS = "google/t5_xxl_true_nli_mixture"
t0pp = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS, device_map="auto")

Loading checkpoint shards: 100%|██████████| 5/5 [02:07<00:00, 25.45s/it]
