In [None]:
import os
import json
import imagesize
import torch
import glob
import random
import cv2
import numpy as np
from torch.utils.data import Dataset, RandomSampler, DataLoader, Sampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
import pickle
from collections import defaultdict
# In[2]:


from peft import LoraConfig, get_peft_model, TaskType
import transformers
# from transformers import AdamW
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes as bnb
from transformers import Trainer, TrainingArguments

In [2]:
class DLoaderForCI(Dataset):
    def __init__(self, sortocr=0, max_seq=1024, tokenizer=None, testmode=0):
        self.dyn_max_seq_len = max_seq
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.testmode = testmode
        self.sortocr = sortocr
        self.data = self.load_cigroupdata()
        print('self.data',len(self.data))

    def __len__(self):
        return len(self.data)
        #return 1
        
    def load_cigroupdata(self):
        samples = 0
        dirp = '/mnt/efs/RaghavWork/CIGroups'
        outsp = {}
        for root, dir, files in os.walk(dirp):
            for file in files:
                if file.endswith('.json'):continue
                key = os.path.basename(root)
                fullpath = os.path.join(root, file)
                fullpath_v2 = os.path.join(root, file+'_geo.json')
                if all(map(lambda x:os.path.isfile(x), [fullpath, fullpath_v2])):
                    if key not in outsp:outsp[key] = []
                    #outsp[key].append((fullpath, fullpath_v2))
                    outsp[key].append((fullpath, fullpath_v2))
                    samples +=1
                    
        allkys = list(outsp.keys())
        count = 0
        datalist = []
        for k in allkys:
            if len(outsp[k])<2:
                allkys.pop(k)
            else:
                count += len(outsp[k])
                datalist.append(outsp.pop(k))
                
        print('Total Samples:',samples, ' | datalist',len(datalist))
        if self.testmode:
            return datalist[-20:]
        else:
            return datalist[:-20]

    def sort_ocr_by_position(self, ocr_data):
        """
        Sort OCR data by reading order (left to right, top to bottom).
        
        Args:
        ocr_data (list): List of tuples in the format [((x1, y1, x3, y3), text)], where 
                         (x1, y1) is the top-left corner and (x3, y3) is the bottom-right corner of the word.
        
        Returns:
        list: Sorted list of tuples by reading order.
        """
        # Sort first by the y1 (top-to-bottom), then by x1 (left-to-right)
        # Example OCR data
        # ocr_data = [
        #     ((100, 50, 150, 80), 'hello'),
        #     ((200, 50, 250, 80), 'world'),
        #     ((100, 100, 150, 130), 'foo'),
        #     ((200, 100, 250, 130), 'bar')
        # ]
        
        # # Sort OCR data
        # sorted_data = sort_ocr_by_position(ocr_data)
        
        # Display the sorted OCR data
        # for box, text in sorted_data:
        #     print(f"Text: {text}, Coordinates: {box}")
        
        sorted_ocr = sorted(ocr_data, key=lambda item: (item[0][1], item[0][0]))
    
        combined_text = " ".join([item[1] for item in sorted_ocr])
        
        return sorted_ocr, combined_text
    
    def get_image_and_jsongt(self, sample):
        words = []
        word_boxes = []
        word_labels = []
        row_labels = []
        img = None
        
        imgp, jsonp =  sample
        # if self.loadimage:
        #     img = cv2.imread(imgp)
        # else:img = None
        img = None
        if isinstance(jsonp, str) and os.path.isfile(jsonp):
            with open(jsonp, encoding="utf8") as f:
                jsondata = json.load(f)
        elif isinstance(jsonp, dict):jsondata = jsonp
            
    
        for clas in jsondata['parse']['class']:
            items = jsondata['parse']['class'][clas]
            for item in items:
                for wrd_id in item:
                    word = jsondata['words'][wrd_id]['text']
                    #print(f'Word: {word} --> {clas}')
                    #words.append(word)
                    #word_boxes.append(jsondata['words'][wrd_id]['boundingBox'][0] + jsondata['words'][wrd_id]['boundingBox'][2])
                    #word_boxes.append(((jsondata['words'][wrd_id]['boundingBox'][0] + jsondata['words'][wrd_id]['boundingBox'][2]), word))
                    #word_labels.append(clas)
                    rlabel = jsondata['words'][wrd_id].get('row_label', [0])[0]
                    #row_labels.append(min(49, rlabel))
                    word_boxes.append(((jsondata['words'][wrd_id]['boundingBox'][0] + jsondata['words'][wrd_id]['boundingBox'][2]),\
                                       word, clas, rlabel))
                    
        #print('words', len(words),' | word_boxes:',len(word_boxes), ' | word_labels:',len(word_labels))
    
        return imgp, img ,words, word_boxes, word_labels, row_labels

    def get_prompt(self, ocr_formatted_1, ocr_formatted_2, example_output):
        # prompt = "Extract the line-item json from the ocr information of the document."
        prompt = 'Your task is to extract line-items information from the attached invoice. Refer to the example \
        input and output and then return the output for the new input.'
        
        one_shot_prompt = f"""
            Task: {prompt}
            
            Example:
            Input: {ocr_formatted_1}
            Output: {example_output}
            
            New Input: {ocr_formatted_2}
            Output:
            """
        return one_shot_prompt
    
    def get_final_json(self, sample):
        row_wise_data = {'keyvalues':{}, 'lineitems':{}}
        kvf = 1
        for item in sample:
            bx, word, cls, row = item
            if row == 0:
                if cls in ('PO_NUMBER_VALUE', 'HTS_NUMBER_VALUE', 'INVOICE_NUMBER_VALUE'):
                    #if cls not in row_wise_data['keyvalues']:row_wise_data['keyvalues'][cls] = []
                    #row_wise_data['keyvalues'][cls].append(word)
                    if cls not in row_wise_data['keyvalues']:row_wise_data['keyvalues'][cls] = ''
                    row_wise_data['keyvalues'][cls] += word + ' '
                    kvf = 0
                continue
            if not cls.endswith('_VALUE') or 'MISC' in cls:continue
            if row not in row_wise_data['lineitems']:row_wise_data['lineitems'][row] = {}
            # if cls not in row_wise_data['lineitems'][row]:row_wise_data['lineitems'][row][cls] = []
            # row_wise_data['lineitems'][row][cls].append(word)
            if cls not in row_wise_data['lineitems'][row]:row_wise_data['lineitems'][row][cls] = ''
            row_wise_data['lineitems'][row][cls] += word + ' '
        
        if kvf:row_wise_data.pop('keyvalues')
        for row in row_wise_data['lineitems']:
            for cls in row_wise_data['lineitems'][row]:
                row_wise_data['lineitems'][row][cls] = row_wise_data['lineitems'][row][cls].strip()
                
        return row_wise_data
    
    def __getitem__(self, idx):
        #idx = np.random.randint(0,len(self.data)))
        sample1, sample2 = random.choices(self.data[idx], k = 2)
        #print('sample1, sample2',sample1, sample2)
        imgp1, img1 ,words1, word_boxes1, word_labels1, row_labels1 = self.get_image_and_jsongt(sample1)
        imgp2, img2 ,words2, word_boxes2, word_labels2, row_labels2 = self.get_image_and_jsongt(sample2)
        
        if self.sortocr:
            word_boxes1, ocrtxt1 = self.sort_ocr_by_position(word_boxes1)
            word_boxes2, ocrtxt2 = self.sort_ocr_by_position(word_boxes2)
        else:
            ocrtxt1 = ' '.join([item[1] for item in word_boxes1])
            ocrtxt2 = ' '.join([item[1] for item in word_boxes2])

        outjson1 = self.get_final_json(word_boxes1)
        outjson2 = self.get_final_json(word_boxes2)
        input_prompt = self.get_prompt(ocrtxt1, ocrtxt2, outjson1)
        output = f"{outjson2}"
        #print('input_prompt--->',input_prompt)
        #print('output--->',output)
        combined_text = input_prompt + output
        tokens = self.tokenizer(
            combined_text, 
            max_length=self.dyn_max_seq_len, 
            truncation=True, 
            return_tensors="pt", 
            padding="max_length",
            padding_side='right'
        )
        input_length = len(self.tokenizer(input_prompt)["input_ids"])
        labels = tokens["input_ids"].clone()
        labels[:, :input_length] = -100  # Ignore input tokens in the loss
        labels[tokens["attention_mask"]==0] = -100 # Ignore pad tokens in the loss
        
        #return {'input_prompt':input_prompt, 'output':output}
        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0)
        }

## Deepseek DeepSeek-R1-Distill-Qwen-7B

--- running on old versions <br> 
--- venv = ra_1shot_venv

In [23]:
## tokenizer

local_cache = './deepseek_model/'
name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
# local_cache = './mistral_models'
# name = "mistralai/Mistral-7B-Instruct-v0.3"

#http://localhost:8889/edit/mnt/efs/RaghavWork/VirtualENV/1shot/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(name, cache_dir=local_cache)
tokenizer.pad_token = tokenizer.eos_token  # For causal LM

In [4]:
fact = 1
max_inp_seq = 1024*fact
train_dataset = DLoaderForCI(sortocr=1, max_seq=max_inp_seq, tokenizer=tokenizer, testmode=0)
#train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

val_dataset = DLoaderForCI(sortocr=1, max_seq=max_inp_seq, tokenizer=tokenizer, testmode=0)
#val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=True)
print('MaxLeN:',max_inp_seq)

Total Samples: 660  | datalist 169
self.data 149
Total Samples: 660  | datalist 169
self.data 149
MaxLeN: 1024


In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_compute_dtype="float16",  # Use fp16 for computation
    bnb_4bit_use_double_quant=True,   # Use double quantization
    bnb_4bit_quant_type="nf4",        # Use `nf4` quantization type
)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    name,
    cache_dir=local_cache,
    quantization_config=bnb_config,
    device_map="auto"  # Automatically distribute model across GPUs
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Set to evaluation mode and freeze original weights
for param in model.parameters():
    param.requires_grad = False

In [7]:
# Configure LoRA
lora_config = LoraConfig(
    r=8,             # Rank of the low-rank matrices
    lora_alpha=16,   # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers to apply LoRA
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,  # Specify the task
    bias="none"
)

# Add LoRA to the model
model = get_peft_model(model, lora_config)

In [8]:
training_args = TrainingArguments(
    #output_dir="./mistral_lora/CITMP/",         # Directory to save results
    output_dir="./deepseek/CITMP/",         # Directory to save results
    eval_strategy="epoch",         # Evaluate at the end of each epoch
    max_grad_norm=1.0,                   # Clip gradients
    learning_rate=5e-5,                  # Learning rate
    per_device_train_batch_size=1,       # Batch size per GPU
    per_device_eval_batch_size=1,        # Evaluation batch size per GPU
    num_train_epochs=25,                  # Number of epochs
    weight_decay=0.01,                   # Weight decay for optimizer
    save_strategy="epoch",               # Save model checkpoint at the end of each epoch
    save_total_limit=2,                  # Keep only the last 2 checkpoints
    load_best_model_at_end=False,         # Load the best model when finished
    logging_dir="./logs",                # Directory for logs
    logging_steps=50,                    # Log every 50 steps
    report_to="tensorboard",             # Use TensorBoard for logging
    fp16=True,                           # Use mixed precision (16-bit floating point)
    dataloader_num_workers=0,            # Number of data loader workers
    ddp_find_unused_parameters=True,     # Optimize for distributed training
)

In [9]:
# Use Trainer API for training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [10]:
torch.cuda.empty_cache()

In [11]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0033,
2,0.0087,
3,0.0001,
4,0.0086,
5,0.0979,
6,0.0007,
7,0.0412,
8,0.0005,
9,0.0001,
10,0.0002,


TrainOutput(global_step=3725, training_loss=0.02442358997003519, metrics={'train_runtime': 4696.3851, 'train_samples_per_second': 0.793, 'train_steps_per_second': 0.793, 'total_flos': 1.618787632939008e+17, 'train_loss': 0.02442358997003519, 'epoch': 25.0})

In [13]:
trainer.save_model("./deepseek_finetuned")  # Saves model, config, tokenizer
tokenizer.save_pretrained("./deepseek_finetuned")


('./deepseek_finetuned/tokenizer_config.json',
 './deepseek_finetuned/special_tokens_map.json',
 './deepseek_finetuned/tokenizer.json')

In [14]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': nan, 'eval_runtime': 57.3494, 'eval_samples_per_second': 2.598, 'eval_steps_per_second': 2.598, 'epoch': 25.0}


In [15]:
# Use the model from trainer directly (no need to re-load)
model = trainer.model
model.eval()  # Set to eval mode



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [27]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

model_path = "./deepseek_finetuned"
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Generate the structured output for this document: "
output = generator(prompt, max_new_tokens=200)[0]["generated_text"]
print(output)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Generate the structured output for this document: 
A new study reveals that the average time a person spends on a social media platform is 45 minutes per day. The study was conducted by a team of researchers at a university and involved 1,000 participants. The participants were asked to record their social media usage for a week, and the data was analyzed using statistical methods.

The study found that the majority of users spend between 30 and 60 minutes per day on social media. Furthermore, the study categorized users into three groups based on their usage patterns: light users, moderate users, and heavy users. The light users spent between 15 and 30 minutes, moderate users between 31 and 60 minutes, and heavy users spent more than 60 minutes per day. The study also found that heavy users were twice as likely to experience social media-induced anxiety as light users. 

In addition, the study observed that the average time spent on social media increased by 10%


In [32]:
prompt = """
Task: Your task is to extract key-value information from the document. Refer to the example input and output and then return the output for the new input. Don't produce any note or other informtion at end
                
Example:
Input: Southampton , United Kingdom 9403.60 7,342.50 kg AMS Ref . No. 125 CARTONS TCLU8883933 PO FLEX - 2669545 FLXT - 2669545-3215367 Flexport Carrier Booking Operations FLEXPORT INTERNATIONAL B.V. VIJZELSTRAAT 68 , 1017 HL AMSTERDAM NETHERLANDS ARRIVALNOTICES_EU@FLEXPORT.COM Flexport's terms and conditions of service pursuant to this bill of lading or when acting in the capacity of a non - - operating common Loaded on Board carrier are incorporated by reference and can be found at https://www.flexport.com/terms-and-conditions/ . Additionally Additionally for all Sea Waybills Waybills the CMI Uniform Rules for Sea Waybills shall also apply ( and shall govern in accordance with Flexport's terms and conditions conditions ) pursuant to Jun 04 , 2024 which Shipper irrevocably relinquishes its right to control over the shipment by authorizing Flexport to proceed with the issuance of a Sea Waybill . International LLC By By Flexport As Carrier FMC FMC OTI No. 025219NF X Jun 04 , 2024 COPY BILL OF LADING FOR PORT - TO - PORT OR COMBINED TRANSPORT , , , . Pre - carriage By flexport . Agent Reference Number Point of Origin / FTZ Number Containerized FCL Marks and Numbers No. of Containers Description of Packages and Goods Said to Contain - Shipper's Load Stow & Count ( SLAC ) 1 x 40 ft HC Dry Container SLAC 125 CARTONS Total PO # 4127786 HS CODE : 9403.60 , 45 CTNS / 2902.5 KGS / 28.08 CBM HLM002 HELMER BLACKENED OAK LARGE TV UNIT . , 80 CTNS / 4440 KGS / 38.8 CBM SLAC 7,342.50 kg 66.880 cbm Declared Value Containers Carrier's receipt . Total number of containers or packages received by carrier : 1 container ( s ) vessel , , , : - Freight Charges ( and description ) Freight Collect This bill of lading is subject to Flexport's standard terms and conditions of carriage , which appear hereof as page 2 Page 1 of 2 Number Bill of Lading ONEYHKGE93028600 Southampton , United Kingdom Vessel & Voyage SAME AS CNEE CNCS20288 Measurement Notify Party HOARE GROUP LIMITED 3000A Parkway Parkway Whiteley Whiteley Hampshire , PO15 7FX 7FX United Kingdom FLXT - 00002669545B Seal shipped to Consignee / FLXT00002669545B FURNITURE HLM001 HELMER BLACKENED OAK 4 DOORS SIDEBOARD , Container China , Yantian Shipped From Exporter / Port of Loading HS CODE : Special Instructions Booking Number NRA Number Place of Pickup Forwarding Agent Destination Agent B / L Issue Date 028W - AL JMELIYAH Places of Delivery Gross Weight 4127786 MBL Ref Ref No. Port of Unloading CY - CY 66.880 cbm Ltd Direct Buy It TRIDENT BUSINESS PARK / NEPTUNE WAY / LEEDS RD , , HUDDERSFIELD , HD2 1UA , United Kingdom
Output: {'UNLADING_PORT': 'Southampton , United Kingdom', 'HTS_NUMBER': '9403.60', 'GROSS_WEIGHT': '7,342.50 kg', 'NUMBER_OF_PACKAGES': '125 CARTONS', 'CONTAINER_NUMBER': 'TCLU8883933', 'MASTER_BILL_OF_LADING': 'ONEYHKGE93028600', 'PLACE_OF_DELIVERY': 'Southampton , United Kingdom', 'NOTIFY_PARTY': 'SAME AS CNEE', 'SEAL_NUMBER': 'CNCS20288', 'SHIPPER': 'HOARE GROUP LIMITED 3000A Parkway Parkway Whiteley Whiteley Hampshire , PO15 7FX 7FX United Kingdom', 'BILL_OF_LADING': 'FLXT - 00002669545B', 'AMS_BL': 'FLXT00002669545B', 'PRODUCT_DESCRIPTION': 'FURNITURE HLM001 HELMER BLACKENED OAK 4 DOORS SIDEBOARD ,', 'LADING_PORT': 'China , Yantian', 'VESSEL_VOYAGE': '028W - AL JMELIYAH', 'PO_NUMBER': '4127786', 'CONTRACT_OF_CARRIAGE': 'CY - CY', 'VOLUME': 'cbm 66.880', 'CONSIGNEE': 'Ltd Direct Buy It TRIDENT BUSINESS PARK / NEPTUNE WAY / LEEDS RD , , HUDDERSFIELD , HD2 1UA , United Kingdom'}

New Input: Southampton , United Kingdom 5,540.00 kg 11,056.00 kg AMS Ref . No. 1854 CARTONS YMMU6326692 PO PO FLEXPORT INTERNATIONAL B.V. VIJZELSTRAAT 68 , 1017 HL AMSTERDAM NETHERLANDS ARRIVALNOTICES_EU@FLEXPORT.COM CML Grandsight Supply Chain Management Co. , Ltd . ( PREPAYMENT PREPAYMENT FLXT - 2617825-3170083 FLEX - 2617825 Flexport's terms and conditions of service pursuant to this bill of lading or when acting in the capacity of a non - - operating common Loaded on Board carrier are incorporated by reference and can be found at https://www.flexport.com/terms-and-conditions/ . Additionally Additionally for all Sea Waybills , the CMI Uniform Rules for Sea Waybills shall also apply ( ( shall govern in accordance with Flexport's terms and conditions ) ) pursuant to Apr 22 22 2024 which Shipper irrevocably relinquishes its right to control over the shipment by authorizing Flexport to proceed with the issuance of a Sea Waybill . By By Flexport International LLC As Carrier FMC - OTI No. 025219NF This bill of lading is subject to Flexport's standard terms and conditions of carriage , which appear hereof as page 2 Page 1 of 3 COPY BILL OF LADING FOR PORT - TO - PORT OR COMBINED TRANSPORT flexport . Place of Pickup B / L Issue Date Apr 25 , 2024 Pre - carriage By Neptune Leeds / Leeds ) Agent Reference Number Point of Origin / FTZ Number Marks and Numbers No. of Containers Said to Contain - Shipper's Load Stow & Count ( SLAC ) 1 x 40 ft HC Dry Container SLAC 197 CARTONS N / M 1 x 40 ft HC Dry N / M See attached pages for more container information Total Container SLAC 366 CARTONS SLAC 47,890.00 kg 357.950 cbm Declared Value Containers Carrier's receipt . Total number of containers or packages received by carrier : 5 container ( s ) vessel , and , : Freight Charges ( and description ) Freight Collect , Number Bill of Lading YMJAN235174064 Southampton , United Kingdom Vessel & Voyage Ltd. It Direct Buy Address : Trident Business Park Park Neptune Way / / Rd , Huddersfield HD2 1UA Phone : 0871 971 0779 YMAR267106 YMAR267193 Measurement Measurement Notify Party TIANJIN CONTE IMPORT & EXPORT TRADING CO . , LTD B - 704-2 HAITAI BUILDING , NO.6 HUATIAN ROAD , HUAYUAN INDUSTRIAL AREA TIANJIN , CHINA FLXT - 00002617825A Seal Seal Description of Packages and Goods Description of Packages and Goods Container Containerized shipped to Consignee / FLXT00002617825A SOFA BED BED SIDEBOARD Container Tianjin Xingang Shipped From Exporter / Port of Loading Special Instructions Destination Agent Forwarding Agent NRA Number Booking Number HMM SOUTHAMPTON - 013W Places of Delivery Gross Weight Gross Weight 399310 4017189 MBL Ref . No. Port of Unloading CY - CY 70.510 cbm 68.580 cbm YMMU6077835 FCL Ltd. Direct Buy It Address : Trident Business Park / / Way / / Rd , Huddersfield HD2 1UA Phone : 0871 971 0779

"""
output = generator(prompt, max_new_tokens=8000)[0]["generated_text"]
print(output)



Task: Your task is to extract key-value information from the document. Refer to the example input and output and then return the output for the new input. Don't produce any note or other informtion at end
                
Example:
Input: Southampton , United Kingdom 9403.60 7,342.50 kg AMS Ref . No. 125 CARTONS TCLU8883933 PO FLEX - 2669545 FLXT - 2669545-3215367 Flexport Carrier Booking Operations FLEXPORT INTERNATIONAL B.V. VIJZELSTRAAT 68 , 1017 HL AMSTERDAM NETHERLANDS ARRIVALNOTICES_EU@FLEXPORT.COM Flexport's terms and conditions of service pursuant to this bill of lading or when acting in the capacity of a non - - operating common Loaded on Board carrier are incorporated by reference and can be found at https://www.flexport.com/terms-and-conditions/ . Additionally Additionally for all Sea Waybills Waybills the CMI Uniform Rules for Sea Waybills shall also apply ( and shall govern in accordance with Flexport's terms and conditions conditions ) pursuant to Jun 04 , 2024 which Ship

In [31]:
prompt = """
Task: Extract structured key-value pairs in JSON from the following document text. Don't produce any note or other informtion at end

New Input: Southampton , United Kingdom 5,540.00 kg 11,056.00 kg AMS Ref . No. 1854 CARTONS YMMU6326692 PO PO FLEXPORT INTERNATIONAL B.V. VIJZELSTRAAT 68 , 1017 HL AMSTERDAM NETHERLANDS ARRIVALNOTICES_EU@FLEXPORT.COM CML Grandsight Supply Chain Management Co. , Ltd . ( PREPAYMENT PREPAYMENT FLXT - 2617825-3170083 FLEX - 2617825 Flexport's terms and conditions of service pursuant to this bill of lading or when acting in the capacity of a non - - operating common Loaded on Board carrier are incorporated by reference and can be found at https://www.flexport.com/terms-and-conditions/ . Additionally Additionally for all Sea Waybills , the CMI Uniform Rules for Sea Waybills shall also apply ( ( shall govern in accordance with Flexport's terms and conditions ) ) pursuant to Apr 22 22 2024 which Shipper irrevocably relinquishes its right to control over the shipment by authorizing Flexport to proceed with the issuance of a Sea Waybill . By By Flexport International LLC As Carrier FMC - OTI No. 025219NF This bill of lading is subject to Flexport's standard terms and conditions of carriage , which appear hereof as page 2 Page 1 of 3 COPY BILL OF LADING FOR PORT - TO - PORT OR COMBINED TRANSPORT flexport . Place of Pickup B / L Issue Date Apr 25 , 2024 Pre - carriage By Neptune Leeds / Leeds ) Agent Reference Number Point of Origin / FTZ Number Marks and Numbers No. of Containers Said to Contain - Shipper's Load Stow & Count ( SLAC ) 1 x 40 ft HC Dry Container SLAC 197 CARTONS N / M 1 x 40 ft HC Dry N / M See attached pages for more container information Total Container SLAC 366 CARTONS SLAC 47,890.00 kg 357.950 cbm Declared Value Containers Carrier's receipt . Total number of containers or packages received by carrier : 5 container ( s ) vessel , and , : Freight Charges ( and description ) Freight Collect , Number Bill of Lading YMJAN235174064 Southampton , United Kingdom Vessel & Voyage Ltd. It Direct Buy Address : Trident Business Park Park Neptune Way / / Rd , Huddersfield HD2 1UA Phone : 0871 971 0779 YMAR267106 YMAR267193 Measurement Measurement Notify Party TIANJIN CONTE IMPORT & EXPORT TRADING CO . , LTD B - 704-2 HAITAI BUILDING , NO.6 HUATIAN ROAD , HUAYUAN INDUSTRIAL AREA TIANJIN , CHINA FLXT - 00002617825A Seal Seal Description of Packages and Goods Description of Packages and Goods Container Containerized shipped to Consignee / FLXT00002617825A SOFA BED BED SIDEBOARD Container Tianjin Xingang Shipped From Exporter / Port of Loading Special Instructions Destination Agent Forwarding Agent NRA Number Booking Number HMM SOUTHAMPTON - 013W Places of Delivery Gross Weight Gross Weight 399310 4017189 MBL Ref . No. Port of Unloading CY - CY 70.510 cbm 68.580 cbm YMMU6077835 FCL Ltd. Direct Buy It Address : Trident Business Park / / Way / / Rd , Huddersfield HD2 1UA Phone : 0871 971 0779

"""
output = generator(prompt, max_new_tokens=8000)[0]["generated_text"]
print(output)


Task: Extract structured key-value pairs in JSON from the following document text. Don't produce any note or other informtion at end

New Input: Southampton , United Kingdom 5,540.00 kg 11,056.00 kg AMS Ref . No. 1854 CARTONS YMMU6326692 PO PO FLEXPORT INTERNATIONAL B.V. VIJZELSTRAAT 68 , 1017 HL AMSTERDAM NETHERLANDS ARRIVALNOTICES_EU@FLEXPORT.COM CML Grandsight Supply Chain Management Co. , Ltd . ( PREPAYMENT PREPAYMENT FLXT - 2617825-3170083 FLEX - 2617825 Flexport's terms and conditions of service pursuant to this bill of lading or when acting in the capacity of a non - - operating common Loaded on Board carrier are incorporated by reference and can be found at https://www.flexport.com/terms-and-conditions/ . Additionally Additionally for all Sea Waybills , the CMI Uniform Rules for Sea Waybills shall also apply ( ( shall govern in accordance with Flexport's terms and conditions ) ) pursuant to Apr 22 22 2024 which Shipper irrevocably relinquishes its right to control over the shi