In [2]:
import json
import os
import random
from tqdm import tqdm
from mlx_lm import load, generate

def load_model_and_tokenizer():
    """Load the ``mlx-community/gemma-2-27b-it-4bit`` checkpoint via ``load``.

    Returns:
        tuple: The ``(model, tokenizer)`` pair produced by ``load``.
    """
    print("Loading model and tokenizer...")
    checkpoint = "mlx-community/gemma-2-27b-it-4bit"
    loaded_model, loaded_tokenizer = load(checkpoint)
    return loaded_model, loaded_tokenizer

def load_sample_data(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Files are read in sorted filename order (``os.listdir`` returns entries
    in arbitrary order) and decoded as UTF-8, so the result does not depend
    on the filesystem or on the platform's default text encoding.

    Args:
        directory: Path of the folder holding the sample JSON documents.

    Returns:
        list: One parsed JSON value per ``.json`` file, in filename order.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        json.JSONDecodeError: If a ``.json`` file holds invalid JSON.
    """
    sample_data = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            sample_data.append(json.load(f))
    return sample_data

def generate_bl_sample(model, tokenizer, sample_data, max_tokens=2048):
    """Ask the model for one new Bill of Lading (BL) JSON object.

    A random entry of ``sample_data`` is embedded in the instruction as the
    structural example; the first JSON object found in the completion is
    parsed and returned.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data: Non-empty list of example BL documents (parsed JSON).
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        dict | None: The parsed BL document, or ``None`` when the completion
        holds no parseable JSON object (callers retry on a falsy result).

    Raises:
        ValueError: If ``sample_data`` is empty.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example BL JSON files were loaded")

    instruction = f"""
    Generate a new JSON object for a Bill of Lading (BL) based on the following structure and examples:
    {json.dumps(random.choice(sample_data), indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique BL while maintaining the overall structure.
    The output should be a valid JSON object.
    """

    # This notebook loads a Gemma-2 instruct checkpoint, whose chat format is
    # not the Llama/Mistral-style "<s>[INST] ... [/INST]" wrapper. Let the
    # tokenizer render its own chat template when it ships one; otherwise
    # fall back to the original hand-written wrapper.
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Harmless when the prompt is not echoed back in the completion.
    generated_text = generated_text.replace(prompt, "").strip()

    start = generated_text.find('{')
    if start == -1:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary (even text containing braces) after the object
        # no longer breaks parsing the way a first-'{'-to-last-'}' slice did.
        bl_data, _ = json.JSONDecoder().raw_decode(generated_text[start:])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None
    return bl_data

def create_bl_samples(model, tokenizer, sample_data_dir, output_dir, num_samples=10000):
    """Generate ``num_samples`` BL documents and write each to ``output_dir``.

    Example documents are loaded from ``sample_data_dir``. Every sample gets
    up to five generation attempts; the first truthy result is written to
    ``bl_sample_<n>.json`` (1-based), otherwise the sample is skipped with a
    printed message.
    """
    examples = load_sample_data(sample_data_dir)

    os.makedirs(output_dir, exist_ok=True)

    print(f"Generating {num_samples} BL samples...")
    for i in tqdm(range(num_samples), desc="Generating samples"):
        for _attempt in range(5):  # Limit the number of retries
            document = generate_bl_sample(model, tokenizer, examples)
            if not document:
                continue
            target_path = os.path.join(output_dir, f"bl_sample_{i+1}.json")
            with open(target_path, 'w') as out_file:
                json.dump(document, out_file, indent=2)
            break
        else:
            # Reached only when none of the five attempts produced a document.
            print(f"Failed to generate valid JSON for sample {i+1} after 5 attempts. Skipping...")

# Entry point: load the model once, then generate BL samples into ./bl_sample/
# using the JSON files under ./si/ as structural examples.
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    sample_data_dir = "./si/"  # Directory containing the sample JSON files
    output_dir = "./bl_sample/"
    # NOTE(review): the progress output recorded below this cell shows about
    # 100 s per sample, so 10000 samples is a run of many days.
    create_bl_samples(model, tokenizer, sample_data_dir, output_dir, num_samples=10000)
    print("Sample generation completed.")

Loading model and tokenizer...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Generating 10000 BL samples...


Generating samples:   6%|▌         | 584/10000 [15:36:41<265:35:01, 101.54s/it]

In [None]:
import json
import os
import random
from tqdm import tqdm
from mlx_lm import load, generate

# Fixed value pools injected into every generated document.
# WESTBOUND_PORTS is used when the bound is "W", EASTBOUND_PORTS otherwise.
WESTBOUND_PORTS = dict(
    loading=["NINGBO", "SHANGHAI", "YANTIAN", "SINGAPORE", "TANGER", "LE HAVRE"],
    discharge=["LE HAVRE", "HAMBURG", "GDANSK", "ROTTERDAM", "ALGECIRAS", "PORT KLANG", "NINGBO"],
)

# The eastbound pools are the westbound pools with the two roles swapped
# (independent list copies, not aliases).
EASTBOUND_PORTS = dict(
    loading=list(WESTBOUND_PORTS["discharge"]),
    discharge=list(WESTBOUND_PORTS["loading"]),
)

# Vessel names a generated voyage may be assigned.
VESSELS = [
    "APL CHANGI",
    "APL MERLION",
    "APL RAFFLES",
    "APL SINGAPURA",
    "APL TEMASEK",
    "APL VANDA",
    "CMA CGM ALEXANDER VON HUMBOLDT",
    "CMA CGM BENJAMIN FRANKLIN",
    "CMA CGM BOUGAINVILLE",
    "CMA CGM EOURES",
    "CMA CGM GEORG FORSTER",
    "CMA CGM GRACE BAY",
    "CMA CGM ROQUEVAIRE",
    "CMA CGM VASCO DE GAMA",
    "CMA CGM ZHENG HE",
]

def load_model_and_tokenizer():
    """Load the ``mlx-community/gemma-2-27b-it-4bit`` checkpoint via ``load``.

    Returns:
        tuple: The ``(model, tokenizer)`` pair produced by ``load``.
    """
    print("Loading model and tokenizer...")
    checkpoint = "mlx-community/gemma-2-27b-it-4bit"
    loaded_model, loaded_tokenizer = load(checkpoint)
    return loaded_model, loaded_tokenizer

def load_sample_data(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Files are read in sorted filename order (``os.listdir`` returns entries
    in arbitrary order) and decoded as UTF-8, so the result does not depend
    on the filesystem or on the platform's default text encoding.

    Args:
        directory: Path of the folder holding the sample JSON documents.

    Returns:
        list: One parsed JSON value per ``.json`` file, in filename order.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        json.JSONDecodeError: If a ``.json`` file holds invalid JSON.
    """
    sample_data = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            sample_data.append(json.load(f))
    return sample_data

def generate_bl_sample(model, tokenizer, sample_data, max_tokens=2048):
    """Ask the model for one new BL JSON object and pin its voyage/route fields.

    A random entry of ``sample_data`` is embedded in the instruction as the
    structural example. After parsing, the vessel name, bound and the
    loading/discharge ports are overwritten with values drawn from
    ``VESSELS`` and the port pools matching the chosen bound.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data: Non-empty list of example BL documents (parsed JSON).
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        dict | None: The parsed and adjusted BL document, or ``None`` when
        the completion holds no parseable JSON object (callers retry).

    Raises:
        ValueError: If ``sample_data`` is empty.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example BL JSON files were loaded")

    bound = random.choice(["W", "E"])
    ports = WESTBOUND_PORTS if bound == "W" else EASTBOUND_PORTS
    vessel = random.choice(VESSELS)
    port_of_loading = random.choice(ports["loading"])
    # Some ports sit in both the loading and the discharge pool; exclude the
    # chosen loading port so a voyage never discharges where it loaded.
    port_of_discharge = random.choice(
        [port for port in ports["discharge"] if port != port_of_loading]
    )

    instruction = f"""
    Generate a new JSON object for a Bill of Lading (BL) based on the following structure and examples:
    {json.dumps(random.choice(sample_data), indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique BL while maintaining the overall structure.
    The output should be a valid JSON object.
    """

    # This notebook loads a Gemma-2 instruct checkpoint, whose chat format is
    # not the Llama/Mistral-style "<s>[INST] ... [/INST]" wrapper. Let the
    # tokenizer render its own chat template when it ships one; otherwise
    # fall back to the original hand-written wrapper.
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Harmless when the prompt is not echoed back in the completion.
    generated_text = generated_text.replace(prompt, "").strip()

    start = generated_text.find('{')
    if start == -1:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary after the object no longer breaks parsing.
        bl_data, _ = json.JSONDecoder().raw_decode(generated_text[start:])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    # The model may omit these sections or emit a non-object for them; create
    # them instead of letting a KeyError/TypeError abort the whole run.
    for section in ("voyageDetails", "routeDetails"):
        if not isinstance(bl_data.get(section), dict):
            bl_data[section] = {}

    # Inject the specific conditions
    bl_data["voyageDetails"]["vesselName"] = vessel
    bl_data["voyageDetails"]["bound"] = bound
    bl_data["routeDetails"]["portOfLoading"] = port_of_loading
    bl_data["routeDetails"]["portOfDischarge"] = port_of_discharge
    return bl_data

def create_bl_samples(model, tokenizer, sample_data_dir, output_dir, num_samples=10000):
    """Generate ``num_samples`` BL documents and write each to ``output_dir``.

    Example documents are loaded from ``sample_data_dir``. Every sample gets
    up to five generation attempts; the first truthy result is written to
    ``bl_sample_<n>.json`` (1-based), otherwise the sample is skipped with a
    printed message.
    """
    examples = load_sample_data(sample_data_dir)

    os.makedirs(output_dir, exist_ok=True)

    print(f"Generating {num_samples} BL samples...")
    for i in tqdm(range(num_samples), desc="Generating samples"):
        for _attempt in range(5):  # Limit the number of retries
            document = generate_bl_sample(model, tokenizer, examples)
            if not document:
                continue
            target_path = os.path.join(output_dir, f"bl_sample_{i+1}.json")
            with open(target_path, 'w') as out_file:
                json.dump(document, out_file, indent=2)
            break
        else:
            # Reached only when none of the five attempts produced a document.
            print(f"Failed to generate valid JSON for sample {i+1} after 5 attempts. Skipping...")

# Entry point: load the model once, then generate BL samples into ./bl_sample/
# using the JSON files under ./si/ as structural examples.
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    sample_data_dir = "./si/"  # Directory containing the sample JSON files
    output_dir = "./bl_sample/"
    create_bl_samples(model, tokenizer, sample_data_dir, output_dir, num_samples=10000)
    print("Sample generation completed.")

## Generating the sample SI dataset

In [2]:
import json
import os
import random
from tqdm import tqdm
from mlx_lm import load, generate

# Fixed value pools injected into every generated document.
# WESTBOUND_PORTS is used when the bound is "W", EASTBOUND_PORTS otherwise.
WESTBOUND_PORTS = dict(
    loading=["NINGBO", "SHANGHAI", "YANTIAN", "SINGAPORE", "TANGER", "LE HAVRE"],
    discharge=["LE HAVRE", "HAMBURG", "GDANSK", "ROTTERDAM", "ALGECIRAS", "PORT KLANG", "NINGBO"],
)

EASTBOUND_PORTS = dict(
    loading=["LE HAVRE", "HAMBURG", "GDANSK", "ROTTERDAM", "ALGECIRAS", "PORT KLANG", "NINGBO"],
    discharge=["NINGBO", "SHANGHAI", "YANTIAN", "SINGAPORE", "TANGER", "LE HAVRE"],
)

# Vessel names a generated voyage may be assigned.
VESSELS = [
    "APL CHANGI",
    "APL MERLION",
    "APL RAFFLES",
    "APL SINGAPURA",
    "APL TEMASEK",
    "APL VANDA",
    "CMA CGM ALEXANDER VON HUMBOLDT",
    "CMA CGM BENJAMIN FRANKLIN",
    "CMA CGM BOUGAINVILLE",
    "CMA CGM EOURES",
    "CMA CGM GEORG FORSTER",
    "CMA CGM GRACE BAY",
    "CMA CGM ROQUEVAIRE",
    "CMA CGM VASCO DE GAMA",
    "CMA CGM ZHENG HE",
]

def load_model_and_tokenizer():
    """Load the ``mlx-community/gemma-2-27b-it-4bit`` checkpoint via ``load``.

    Returns:
        tuple: The ``(model, tokenizer)`` pair produced by ``load``.
    """
    print("Loading model and tokenizer...")
    checkpoint = "mlx-community/gemma-2-27b-it-4bit"
    loaded_model, loaded_tokenizer = load(checkpoint)
    return loaded_model, loaded_tokenizer

def load_sample_data(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Files are read in sorted filename order (``os.listdir`` returns entries
    in arbitrary order) and decoded as UTF-8, so the result does not depend
    on the filesystem or on the platform's default text encoding.

    Args:
        directory: Path of the folder holding the sample JSON documents.

    Returns:
        list: One parsed JSON value per ``.json`` file, in filename order.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        json.JSONDecodeError: If a ``.json`` file holds invalid JSON.
    """
    sample_data = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            sample_data.append(json.load(f))
    return sample_data

def generate_bl_sample(model, tokenizer, sample_data, max_tokens=2048):
    """Ask the model for one new BL JSON object and pin its voyage/route fields.

    A random entry of ``sample_data`` is embedded in the instruction as the
    structural example. After parsing, the vessel name, bound and the
    loading/discharge ports are overwritten with values drawn from
    ``VESSELS`` and the port pools matching the chosen bound; the place of
    receipt/delivery are set equal to the loading/discharge ports.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data: Non-empty list of example BL documents (parsed JSON).
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        dict | None: The parsed and adjusted BL document, or ``None`` when
        the completion holds no parseable JSON object (callers retry).

    Raises:
        ValueError: If ``sample_data`` is empty.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example BL JSON files were loaded")

    bound = random.choice(["W", "E"])
    ports = WESTBOUND_PORTS if bound == "W" else EASTBOUND_PORTS
    vessel = random.choice(VESSELS)
    port_of_loading = random.choice(ports["loading"])
    # Some ports sit in both the loading and the discharge pool; exclude the
    # chosen loading port so a voyage never discharges where it loaded.
    port_of_discharge = random.choice(
        [port for port in ports["discharge"] if port != port_of_loading]
    )

    instruction = f"""
    Generate a new JSON object for a Bill of Lading (BL) based on the following structure and examples:
    {json.dumps(random.choice(sample_data), indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique BL while maintaining the overall structure.
    The output should be a valid JSON object.
    """

    # This notebook loads a Gemma-2 instruct checkpoint, whose chat format is
    # not the Llama/Mistral-style "<s>[INST] ... [/INST]" wrapper. Let the
    # tokenizer render its own chat template when it ships one; otherwise
    # fall back to the original hand-written wrapper.
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Harmless when the prompt is not echoed back in the completion.
    generated_text = generated_text.replace(prompt, "").strip()

    start = generated_text.find('{')
    if start == -1:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary after the object no longer breaks parsing.
        bl_data, _ = json.JSONDecoder().raw_decode(generated_text[start:])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    # The model may omit these sections or emit a non-object for them; create
    # them instead of letting a KeyError/TypeError abort the whole run.
    for section in ("voyageDetails", "routeDetails"):
        if not isinstance(bl_data.get(section), dict):
            bl_data[section] = {}

    # Inject the specific conditions
    bl_data["voyageDetails"]["vesselName"] = vessel
    bl_data["voyageDetails"]["bound"] = bound
    bl_data["routeDetails"]["portOfLoading"] = port_of_loading
    bl_data["routeDetails"]["portOfDischarge"] = port_of_discharge
    # Set placeOfReceipt and placeOfDelivery to be the same as portOfLoading and portOfDischarge
    bl_data["routeDetails"]["placeOfReceipt"] = port_of_loading
    bl_data["routeDetails"]["placeOfDelivery"] = port_of_discharge
    return bl_data

def create_bl_samples(model, tokenizer, sample_data_dir, output_dir, num_samples=10000):
    """Generate ``num_samples`` BL documents and write each to ``output_dir``.

    Example documents are loaded from ``sample_data_dir``. Every sample gets
    up to five generation attempts; the first truthy result is written to
    ``bl_sample_<n>.json`` (1-based), otherwise the sample is skipped with a
    printed message.
    """
    examples = load_sample_data(sample_data_dir)

    os.makedirs(output_dir, exist_ok=True)

    print(f"Generating {num_samples} BL samples...")
    for i in tqdm(range(num_samples), desc="Generating samples"):
        for _attempt in range(5):  # Limit the number of retries
            document = generate_bl_sample(model, tokenizer, examples)
            if not document:
                continue
            target_path = os.path.join(output_dir, f"bl_sample_{i+1}.json")
            with open(target_path, 'w') as out_file:
                json.dump(document, out_file, indent=2)
            break
        else:
            # Reached only when none of the five attempts produced a document.
            print(f"Failed to generate valid JSON for sample {i+1} after 5 attempts. Skipping...")

# Entry point: load the model once, then generate BL samples into ./bl_sample/
# using the JSON files under ./si/ as structural examples.
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    sample_data_dir = "./si/"  # Directory containing the sample JSON files
    output_dir = "./bl_sample/"
    # NOTE(review): the progress output recorded below this cell shows about
    # 90-95 s per sample, so 10000 samples is a run of many days.
    create_bl_samples(model, tokenizer, sample_data_dir, output_dir, num_samples=10000)
    print("Sample generation completed.")

Loading model and tokenizer...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Generating 10000 BL samples...


Generating samples:   0%|          | 29/10000 [46:06<265:48:38, 95.97s/it] 

Failed to parse JSON. Retrying...


Generating samples:   6%|▌         | 581/10000 [15:35:41<239:12:03, 91.42s/it] 

Failed to parse JSON. Retrying...


Generating samples:   7%|▋         | 720/10000 [19:25:42<245:49:49, 95.37s/it] 

## Generating the sample booking (BKG) and SI datasets

In [None]:
import json
import os
import random
from datetime import datetime
from tqdm import tqdm
from mlx_lm import load, generate

# Fixed value pools injected into every generated document.
# WESTBOUND_PORTS is used when the bound is "W", EASTBOUND_PORTS otherwise.
WESTBOUND_PORTS = dict(
    loading=["NINGBO", "SHANGHAI", "YANTIAN", "SINGAPORE", "TANGER", "LE HAVRE"],
    discharge=["LE HAVRE", "HAMBURG", "GDANSK", "ROTTERDAM", "ALGECIRAS", "PORT KLANG", "NINGBO"],
)

# The eastbound pools are the westbound pools with the two roles swapped
# (independent list copies, not aliases).
EASTBOUND_PORTS = dict(
    loading=list(WESTBOUND_PORTS["discharge"]),
    discharge=list(WESTBOUND_PORTS["loading"]),
)

# Vessel names a generated voyage may be assigned.
VESSELS = [
    "APL CHANGI",
    "APL MERLION",
    "APL RAFFLES",
    "APL SINGAPURA",
    "APL TEMASEK",
    "APL VANDA",
    "CMA CGM ALEXANDER VON HUMBOLDT",
    "CMA CGM BENJAMIN FRANKLIN",
    "CMA CGM BOUGAINVILLE",
    "CMA CGM EOURES",
    "CMA CGM GEORG FORSTER",
    "CMA CGM GRACE BAY",
    "CMA CGM ROQUEVAIRE",
    "CMA CGM VASCO DE GAMA",
    "CMA CGM ZHENG HE",
]

def load_model_and_tokenizer():
    """Load the ``mlx-community/gemma-2-27b-it-4bit`` checkpoint via ``load``.

    Returns:
        tuple: The ``(model, tokenizer)`` pair produced by ``load``.
    """
    print("Loading model and tokenizer...")
    checkpoint = "mlx-community/gemma-2-27b-it-4bit"
    loaded_model, loaded_tokenizer = load(checkpoint)
    return loaded_model, loaded_tokenizer

def load_sample_data(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Files are read in sorted filename order (``os.listdir`` returns entries
    in arbitrary order) and decoded as UTF-8, so the result does not depend
    on the filesystem or on the platform's default text encoding.

    Args:
        directory: Path of the folder holding the sample JSON documents.

    Returns:
        list: One parsed JSON value per ``.json`` file, in filename order.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        json.JSONDecodeError: If a ``.json`` file holds invalid JSON.
    """
    sample_data = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            sample_data.append(json.load(f))
    return sample_data

def generate_booking_reference():
    """Build a booking reference string.

    Returns:
        str: ``"CHERRY"`` followed by today's date as ``YYYYMMDD`` and a
        random integer between 1000 and 9999 (inclusive).
    """
    date_part = datetime.now().strftime('%Y%m%d')
    serial = random.randint(1000, 9999)
    return "CHERRY" + date_part + str(serial)

def generate_bkg_sample(model, tokenizer, sample_data, booking_reference, max_tokens=2048):
    """Ask the model for one new Booking JSON object and pin its key fields.

    A random entry of ``sample_data`` is embedded in the instruction as the
    structural example. After parsing, the booking reference, vessel name,
    bound and the route ports/places are overwritten with the supplied or
    randomly drawn values.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data: Non-empty list of example booking documents (parsed JSON).
        booking_reference: Reference string stored under ``bookingReference``.
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        dict | None: The parsed and adjusted booking document, or ``None``
        when the completion holds no parseable JSON object (callers retry).

    Raises:
        ValueError: If ``sample_data`` is empty.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example booking JSON files were loaded")

    bound = random.choice(["W", "E"])
    ports = WESTBOUND_PORTS if bound == "W" else EASTBOUND_PORTS
    vessel = random.choice(VESSELS)
    port_of_loading = random.choice(ports["loading"])
    # Some ports sit in both the loading and the discharge pool; exclude the
    # chosen loading port so a voyage never discharges where it loaded.
    port_of_discharge = random.choice(
        [port for port in ports["discharge"] if port != port_of_loading]
    )

    instruction = f"""
    Generate a new JSON object for a Booking based on the following structure and examples:
    {json.dumps(random.choice(sample_data), indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique Booking while maintaining the overall structure.
    The output should be a valid JSON object.
    """

    # This notebook loads a Gemma-2 instruct checkpoint, whose chat format is
    # not the Llama/Mistral-style "<s>[INST] ... [/INST]" wrapper. Let the
    # tokenizer render its own chat template when it ships one; otherwise
    # fall back to the original hand-written wrapper.
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Harmless when the prompt is not echoed back in the completion.
    generated_text = generated_text.replace(prompt, "").strip()

    start = generated_text.find('{')
    if start == -1:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary after the object no longer breaks parsing.
        data, _ = json.JSONDecoder().raw_decode(generated_text[start:])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    # The model may omit these sections or emit a non-object for them; create
    # them instead of letting a KeyError/TypeError abort the whole run.
    for section in ("voyageDetails", "routeDetails"):
        if not isinstance(data.get(section), dict):
            data[section] = {}

    # Inject the specific conditions
    data["bookingReference"] = booking_reference
    data["voyageDetails"]["vesselName"] = vessel
    data["voyageDetails"]["bound"] = bound
    data["routeDetails"]["portOfLoading"] = port_of_loading
    data["routeDetails"]["portOfDischarge"] = port_of_discharge
    data["routeDetails"]["placeOfReceipt"] = port_of_loading
    data["routeDetails"]["placeOfDelivery"] = port_of_discharge
    return data

def generate_si_sample(model, tokenizer, sample_data, bkg_data, max_tokens=2048):
    """Ask the model for one Shipping Instruction (SI) matching a booking.

    A random entry of ``sample_data`` and the full ``bkg_data`` are embedded
    in the instruction. After parsing, the fields shared with the booking
    (reference, voyage, route, shipper, containers, selected additional
    information and the special-cargo sections) are copied from ``bkg_data``
    so both documents agree.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data: Non-empty list of example SI documents (parsed JSON).
        bkg_data: Booking document; must contain ``bookingReference``,
            ``voyageDetails`` and ``routeDetails``.
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        dict | None: The parsed and aligned SI document, or ``None`` when the
        completion holds no parseable JSON object (callers retry).

    Raises:
        ValueError: If ``sample_data`` is empty.
        KeyError: If ``bkg_data`` lacks one of the required keys.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example SI JSON files were loaded")

    instruction = f"""
    Generate a new JSON object for a Shipping Instruction based on the following structure and examples:
    {json.dumps(random.choice(sample_data), indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique SI while maintaining the overall structure.
    Use the following booking data as reference:
    {json.dumps(bkg_data, indent=2)}
    The output should be a valid JSON object.
    """

    # This notebook loads a Gemma-2 instruct checkpoint, whose chat format is
    # not the Llama/Mistral-style "<s>[INST] ... [/INST]" wrapper. Let the
    # tokenizer render its own chat template when it ships one; otherwise
    # fall back to the original hand-written wrapper.
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Harmless when the prompt is not echoed back in the completion.
    generated_text = generated_text.replace(prompt, "").strip()

    start = generated_text.find('{')
    if start == -1:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary after the object no longer breaks parsing.
        data, _ = json.JSONDecoder().raw_decode(generated_text[start:])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    # Ensure SI data uses the same key information as BKG data
    data["bookingReference"] = bkg_data["bookingReference"]
    data["voyageDetails"] = bkg_data["voyageDetails"]
    data["routeDetails"] = bkg_data["routeDetails"]

    # Both documents come from model output, so a section may be missing or
    # not an object; the isinstance guards keep a malformed section from
    # raising TypeError and aborting the run.
    bkg_parties = bkg_data.get("partyDetails")
    if isinstance(bkg_parties, dict) and "shipper" in bkg_parties:
        if not isinstance(data.get("partyDetails"), dict):
            data["partyDetails"] = {}
        data["partyDetails"]["shipper"] = bkg_parties["shipper"]

    if "containers" in bkg_data:
        data["containers"] = bkg_data["containers"]

    bkg_info = bkg_data.get("additionalInformation")
    if isinstance(bkg_info, dict):
        if not isinstance(data.get("additionalInformation"), dict):
            data["additionalInformation"] = {}
        for field in ("additionalRemarks", "onboardDate"):
            if field in bkg_info:
                data["additionalInformation"][field] = bkg_info[field]

    # Sections copied verbatim from the booking whenever it has them.
    for key in ("totalShipment", "outOfGaugeDimensions", "dangerousGoods", "reeferSettings"):
        if key in bkg_data:
            data[key] = bkg_data[key]

    return data

def _pick_unused_booking_reference(output_dir_bkg, output_dir_si, max_tries=100):
    """Return a booking reference whose bkg/si output files do not exist yet."""
    booking_reference = generate_booking_reference()
    for _ in range(max_tries):
        bkg_path = os.path.join(output_dir_bkg, f"bkg_{booking_reference}.json")
        si_path = os.path.join(output_dir_si, f"si_{booking_reference}.json")
        if not (os.path.exists(bkg_path) or os.path.exists(si_path)):
            return booking_reference
        booking_reference = generate_booking_reference()
    # Give up rather than loop forever; this falls back to overwriting.
    print(f"No unused booking reference found after {max_tries} tries; "
          f"reusing {booking_reference} (existing files will be overwritten).")
    return booking_reference


def create_samples(model, tokenizer, sample_data_bkg, sample_data_si, output_dir_bkg, output_dir_si, num_samples=10000):
    """Generate paired Booking and SI documents and write them to disk.

    For every sample a booking reference is drawn, then up to five attempts
    are made to generate a booking and a matching SI. A successful pair is
    written as ``bkg_<reference>.json`` and ``si_<reference>.json``.

    The reference is ``CHERRY`` + date + a 4-digit number, so only 9000
    distinct references exist per date. References that already have output
    files are therefore re-drawn, so a later sample does not silently
    overwrite an earlier pair.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data_bkg: Directory holding example booking JSON files.
        sample_data_si: Directory holding example SI JSON files.
        output_dir_bkg: Directory receiving the generated booking files.
        output_dir_si: Directory receiving the generated SI files.
        num_samples: Number of pairs to attempt.
    """
    bkg_examples = load_sample_data(sample_data_bkg)
    si_examples = load_sample_data(sample_data_si)

    os.makedirs(output_dir_bkg, exist_ok=True)
    os.makedirs(output_dir_si, exist_ok=True)

    print(f"Generating {num_samples} pairs of Booking and SI samples...")
    for i in tqdm(range(num_samples), desc="Generating samples"):
        booking_reference = _pick_unused_booking_reference(output_dir_bkg, output_dir_si)

        for _attempt in range(5):  # Limit the number of retries
            bkg_data = generate_bkg_sample(model, tokenizer, bkg_examples, booking_reference)
            if not bkg_data:
                continue
            si_data = generate_si_sample(model, tokenizer, si_examples, bkg_data)
            if not si_data:
                continue

            bkg_filename = f"bkg_{booking_reference}.json"
            si_filename = f"si_{booking_reference}.json"

            with open(os.path.join(output_dir_bkg, bkg_filename), 'w') as f:
                json.dump(bkg_data, f, indent=2)
            with open(os.path.join(output_dir_si, si_filename), 'w') as f:
                json.dump(si_data, f, indent=2)
            break
        else:
            # Reached only when none of the five attempts produced a pair.
            print(f"Failed to generate valid JSON for sample {i+1} after 5 attempts. Skipping...")

# Entry point: load the model once, then generate Booking/SI pairs.
# Examples are read from ./bkg/ and ./si/; results go to ./bkg_data/ and ./si_data/.
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    sample_data_bkg = "./bkg/"
    sample_data_si = "./si/"
    output_dir_bkg = "./bkg_data/"
    output_dir_si = "./si_data/"
    create_samples(model, tokenizer, sample_data_bkg, sample_data_si, output_dir_bkg, output_dir_si, num_samples=10000)
    print("Sample generation completed.")

## More aligned data generation for bkg data format

In [None]:
import json
import os
import random
from datetime import datetime, timedelta
from tqdm import tqdm
from mlx_lm import load, generate

# Fixed value pools used when building each generated document.
# WESTBOUND_PORTS is used when the bound is "W", EASTBOUND_PORTS otherwise.
WESTBOUND_PORTS = dict(
    loading=["NINGBO", "SHANGHAI", "YANTIAN", "SINGAPORE", "TANGER", "LE HAVRE"],
    discharge=["LE HAVRE", "HAMBURG", "GDANSK", "ROTTERDAM", "ALGECIRAS", "PORT KLANG", "NINGBO"],
)

EASTBOUND_PORTS = dict(
    loading=["LE HAVRE", "HAMBURG", "GDANSK", "ROTTERDAM", "ALGECIRAS", "PORT KLANG", "NINGBO"],
    discharge=["NINGBO", "SHANGHAI", "YANTIAN", "SINGAPORE", "TANGER", "LE HAVRE"],
)

# Vessel names a generated voyage may be assigned.
VESSELS = [
    "APL CHANGI",
    "APL MERLION",
    "APL RAFFLES",
    "APL SINGAPURA",
    "APL TEMASEK",
    "APL VANDA",
    "CMA CGM ALEXANDER VON HUMBOLDT",
    "CMA CGM BENJAMIN FRANKLIN",
    "CMA CGM BOUGAINVILLE",
    "CMA CGM EOURES",
    "CMA CGM GEORG FORSTER",
    "CMA CGM GRACE BAY",
    "CMA CGM ROQUEVAIRE",
    "CMA CGM VASCO DE GAMA",
    "CMA CGM ZHENG HE",
]

def load_model_and_tokenizer():
    """Load the ``mlx-community/gemma-2-27b-it-4bit`` checkpoint via ``load``.

    Returns:
        tuple: The ``(model, tokenizer)`` pair produced by ``load``.
    """
    print("Loading model and tokenizer...")
    checkpoint = "mlx-community/gemma-2-27b-it-4bit"
    loaded_model, loaded_tokenizer = load(checkpoint)
    return loaded_model, loaded_tokenizer

def load_sample_data(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Files are read in sorted filename order (``os.listdir`` returns entries
    in arbitrary order) and decoded as UTF-8, so the result does not depend
    on the filesystem or on the platform's default text encoding.

    Args:
        directory: Path of the folder holding the sample JSON documents.

    Returns:
        list: One parsed JSON value per ``.json`` file, in filename order.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        json.JSONDecodeError: If a ``.json`` file holds invalid JSON.
    """
    sample_data = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            sample_data.append(json.load(f))
    return sample_data

def generate_booking_reference():
    """Build a booking reference dated between today and 60 days from now.

    Returns:
        str: ``"CHERRY"`` followed by the chosen date as ``YYYYMMDD`` and a
        random integer between 1000 and 9999 (inclusive).
    """
    # Random offset of 0 to 60 days from now.
    offset_days = random.randint(0, 60)
    date_part = (datetime.now() + timedelta(days=offset_days)).strftime('%Y%m%d')
    serial = random.randint(1000, 9999)
    return "CHERRY" + date_part + str(serial)

def generate_bkg_sample(model, tokenizer, sample_data, booking_reference, max_tokens=2048):
    """Build one synthetic booking record from random values (no model call).

    The record's fields are drawn with ``random``; any top-level key that a
    randomly chosen example from ``sample_data`` has but the record lacks is
    back-filled with a copy of the example's value.

    Args:
        model: Unused; kept so the call signature stays unchanged.
        tokenizer: Unused; kept so the call signature stays unchanged.
        sample_data: Non-empty list of example booking documents (parsed JSON).
        booking_reference: Reference string stored under ``bookingReference``.
        max_tokens: Unused; kept so the call signature stays unchanged.

    Returns:
        dict: The generated booking record.

    Raises:
        ValueError: If ``sample_data`` is empty.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example booking JSON files were loaded")

    bound = random.choice(["W", "E"])
    ports = WESTBOUND_PORTS if bound == "W" else EASTBOUND_PORTS
    vessel = random.choice(VESSELS)
    port_of_loading = random.choice(ports["loading"])
    # Some ports sit in both the loading and the discharge pool; exclude the
    # chosen loading port so a voyage never discharges where it loaded.
    port_of_discharge = random.choice(
        [port for port in ports["discharge"] if port != port_of_loading]
    )

    sample_structure = random.choice(sample_data)

    # One clock reading so all schedule dates share the same base instant.
    now = datetime.now()

    def _days_from_now(low, high):
        """Format ``now`` plus a random whole number of days in [low, high]."""
        return (now + timedelta(days=random.randint(low, high))).strftime("%Y-%m-%d %H:%M")

    data = {
        "bookingReference": booking_reference,
        "customerName": f"CUSTOMER {random.randint(1000, 9999)}",
        "shipperName": f"SHIPPER {random.randint(1000, 9999)}",
        "invoiceReceiver": f"INVOICE RECEIVER {random.randint(1000, 9999)}",
        "voyageDetails": {
            "vesselName": vessel,
            "voyageNumber": f"{now.year}{random.randint(100, 999)}{bound}"
        },
        "cargoDetails": {
            "hsCode": f"{random.randint(100000, 999999)}",
            "chapterDescription": f"SAMPLE CHAPTER DESCRIPTION {random.randint(1, 100)}",
            "commodity": f"SAMPLE COMMODITY {random.randint(1, 100)}"
        },
        "containerDetails": {
            "size": random.choice(["20 DRY", "40 DRY", "40 HIGH CUBE", "45 HIGH CUBE"]),
            "type": random.choice(["GENERAL PURPOSE", "REEFER CONTAINER", "OPEN TOP", "FLAT RACK"]),
            "quantity": random.randint(1, 5)
        },
        "routeDetails": {
            "placeOfReceipt": port_of_loading,
            "portOfLoading": port_of_loading,
            "portOfDischarge": port_of_discharge,
            "placeOfDelivery": port_of_discharge
        },
        "scheduleDetails": {
            "estimatedArrivalAtLoadingPort": _days_from_now(1, 30),
            "estimatedDepartureFromLoadingPort": _days_from_now(31, 60),
            "estimatedArrivalAtDischargePort": _days_from_now(61, 90)
        },
        "emptyContainerPickupLocation": f"{random.choice(WESTBOUND_PORTS['loading'])}, {random.choice(['CHINA', 'KOREA', 'JAPAN', 'VIETNAM'])}",
        "shippingTerm": random.choice(["FOB", "CIF", "EXW", "FCA", "CPT", "CIP", "DAP", "DDP"]),
        "remarks": "No special instructions"
    }

    # Ensure all fields from the sample structure are present. Values are
    # copied (JSON round trip) so the record never shares mutable objects
    # with the loaded examples.
    for key, value in sample_structure.items():
        if key not in data:
            data[key] = json.loads(json.dumps(value))

    return data

def generate_out_of_gauge_info(remarks):
    """Return random out-of-gauge dimensions.

    Args:
        remarks: Accepted but not used by this function.

    Returns:
        dict: ``length``, ``width`` and ``height`` picked from fixed option
        lists, plus ``weight`` as ``"<20000..40000>kg"``.
    """
    # Draws happen in key order: length, width, height, weight.
    length = random.choice(["8m", "9m", "10m", "11m", "12m"])
    width = random.choice(["2.5m", "3m", "3.5m", "4m"])
    height = random.choice(["3m", "3.5m", "4m", "4.5m", "5m"])
    weight_kg = random.randint(20000, 40000)

    info = {}
    info["length"] = length
    info["width"] = width
    info["height"] = height
    info["weight"] = f"{weight_kg}kg"
    return info

def generate_dangerous_goods_info(remarks):
    """Return a random dangerous-goods declaration.

    Args:
        remarks: Accepted but not used by this function.

    Returns:
        dict: ``unNumber``, ``class``, ``properShippingName`` and
        ``packingGroup`` picked from fixed option lists, plus ``flashpoint``
        as ``"<-20..100>°C"``.
    """
    # Draws happen in key order so seeded runs give the same values.
    un_number = random.choice(["1202", "1203", "1223", "1267", "1993"])
    hazard_class = random.choice(["3", "4.1", "4.2", "5.1", "6.1", "8", "9"])
    shipping_name = random.choice([
        "Flammable liquid, n.o.s.",
        "Corrosive liquid, n.o.s.",
        "Toxic liquid, organic, n.o.s.",
        "Oxidizing liquid, n.o.s.",
        "Environmentally hazardous substance, liquid, n.o.s.",
    ])
    packing_group = random.choice(["I", "II", "III"])
    flashpoint_c = random.randint(-20, 100)

    info = {}
    info["unNumber"] = un_number
    info["class"] = hazard_class
    info["properShippingName"] = shipping_name
    info["packingGroup"] = packing_group
    info["flashpoint"] = f"{flashpoint_c}°C"
    return info

def generate_reefer_settings(remarks):
    """Return random reefer-container settings.

    Args:
        remarks: Accepted but not used by this function.

    Returns:
        dict: ``temperature``, ``ventilation`` and ``humidityLevel`` picked
        from fixed option lists, plus ``o2Level`` (``"<2..21>%"``) and
        ``co2Level`` (``"<0..30>%"``).
    """
    # Draws happen in key order so seeded runs give the same values.
    temperature = random.choice(
        ["-20°C", "-18°C", "-15°C", "-10°C", "-5°C", "0°C", "2°C", "4°C", "10°C", "15°C"]
    )
    ventilation = random.choice(
        ["Off", "10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "100%"]
    )
    humidity = random.choice(["60%", "65%", "70%", "75%", "80%", "85%", "90%"])
    o2_percent = random.randint(2, 21)
    co2_percent = random.randint(0, 30)

    settings = {}
    settings["temperature"] = temperature
    settings["ventilation"] = ventilation
    settings["humidityLevel"] = humidity
    settings["o2Level"] = f"{o2_percent}%"
    settings["co2Level"] = f"{co2_percent}%"
    return settings

def generate_si_sample(model, tokenizer, sample_data, bkg_data, max_tokens=2048):
    """Ask the model for one Shipping Instruction (SI) matching a booking.

    One example is drawn from ``sample_data``; it is shown to the model as
    the target structure and also used to back-fill top-level keys the model
    left out. The parsed result is then aligned with ``bkg_data``: booking
    reference, voyage and route are copied, the shipper name and remarks are
    carried over, special-cargo sections are generated when the remarks
    mention them, and ``onboardDate`` is set to the booking's
    ``estimatedArrivalAtLoadingPort``. ``containerDetails`` is removed from
    the SI.

    Args:
        model: Model object as returned by ``load_model_and_tokenizer``.
        tokenizer: Tokenizer as returned by ``load_model_and_tokenizer``.
        sample_data: Non-empty list of example SI documents (parsed JSON).
        bkg_data: Booking document; must contain ``bookingReference``,
            ``voyageDetails`` and ``routeDetails``.
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        dict | None: The parsed and aligned SI document, or ``None`` when the
        completion holds no parseable JSON object (callers retry).

    Raises:
        ValueError: If ``sample_data`` is empty.
        KeyError: If ``bkg_data`` lacks one of the required keys.
    """
    if not sample_data:
        # random.choice would otherwise fail with an opaque IndexError.
        raise ValueError("sample_data is empty; no example SI JSON files were loaded")

    # A single example serves both as the structure shown to the model and
    # as the source of back-filled keys, so the two cannot disagree.
    sample_structure = random.choice(sample_data)

    instruction = f"""
    Generate a new JSON object for a Shipping Instruction based on the following structure and examples:
    {json.dumps(sample_structure, indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique SI while maintaining the overall structure.
    Use the following booking data as reference and ensure consistency:
    {json.dumps(bkg_data, indent=2)}
    The output should be a valid JSON object.
    """

    # This notebook loads a Gemma-2 instruct checkpoint, whose chat format is
    # not the Llama/Mistral-style "<s>[INST] ... [/INST]" wrapper. Let the
    # tokenizer render its own chat template when it ships one; otherwise
    # fall back to the original hand-written wrapper.
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Harmless when the prompt is not echoed back in the completion.
    generated_text = generated_text.replace(prompt, "").strip()

    start = generated_text.find('{')
    if start == -1:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary after the object no longer breaks parsing.
        data, _ = json.JSONDecoder().raw_decode(generated_text[start:])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    def _ensure_dict(container, key):
        """Return ``container[key]``, replacing a missing/non-dict value with ``{}``."""
        if not isinstance(container.get(key), dict):
            container[key] = {}
        return container[key]

    # Ensure the generated data follows the sample structure. Values are
    # copied (JSON round trip) because the sections below are modified in
    # place; sharing them would silently rewrite the loaded examples.
    for key, value in sample_structure.items():
        if key not in data:
            data[key] = json.loads(json.dumps(value))

    # Ensure key information is consistent with BKG data
    data["bookingReference"] = bkg_data["bookingReference"]
    data["voyageDetails"] = bkg_data["voyageDetails"]
    data["routeDetails"] = bkg_data["routeDetails"]

    # Ensure shipper name is the same
    if "shipperName" in bkg_data:
        shipper = _ensure_dict(_ensure_dict(data, "partyDetails"), "shipper")
        shipper["name"] = bkg_data["shipperName"]

    # Process remarks and generate special information
    if "remarks" in bkg_data:
        _ensure_dict(data, "additionalInformation")["additionalRemarks"] = bkg_data["remarks"]

        remarks = str(bkg_data["remarks"]).lower()

        # Check for Out of Gauge
        if "out of gauge" in remarks or "oog" in remarks:
            data["outOfGaugeDimensions"] = generate_out_of_gauge_info(remarks)

        # Check for Dangerous Goods
        if "dangerous goods" in remarks or "hazardous" in remarks:
            data["dangerousGoods"] = generate_dangerous_goods_info(remarks)

        # Check for Reefer
        if "reefer" in remarks or "temperature controlled" in remarks:
            data["reeferSettings"] = generate_reefer_settings(remarks)

    # Ensure onboardDate is the same as estimatedArrivalAtLoadingPort
    schedule = bkg_data.get("scheduleDetails")
    if isinstance(schedule, dict) and "estimatedArrivalAtLoadingPort" in schedule:
        _ensure_dict(data, "additionalInformation")["onboardDate"] = schedule["estimatedArrivalAtLoadingPort"]

    # Remove containerDetails from SI data if it exists. No second back-fill
    # follows: its only possible effect was to put this key back.
    data.pop("containerDetails", None)

    return data

def create_samples(model, tokenizer, sample_data_bkg, sample_data_si, output_dir_bkg, output_dir_si, num_samples=10000, max_retries=5):
    """Generate paired Booking (BKG) and Shipping Instruction (SI) JSON files.

    Args:
        model: Language model forwarded to the sample generators.
        tokenizer: Tokenizer forwarded to the sample generators.
        sample_data_bkg: Directory passed to ``load_sample_data`` for Booking templates.
        sample_data_si: Directory passed to ``load_sample_data`` for SI templates.
        output_dir_bkg: Directory receiving ``bkg_<reference>.json`` (created if missing).
        output_dir_si: Directory receiving ``si_<reference>.json`` (created if missing).
        num_samples: Number of Booking/SI pairs to attempt.
        max_retries: Generation attempts per pair before the pair is skipped.

    A pair is written only when both generators return a truthy result, so the
    two output directories always contain matching references. Nothing is
    returned; progress and failures are reported on stdout.
    """
    # Keep the directory arguments and the loaded templates under separate names.
    bkg_templates = load_sample_data(sample_data_bkg)
    si_templates = load_sample_data(sample_data_si)

    os.makedirs(output_dir_bkg, exist_ok=True)
    os.makedirs(output_dir_si, exist_ok=True)

    # Upper bound on reference draws, so a generator that keeps returning
    # already-used references cannot hang the run.
    max_reference_attempts = 100

    print(f"Generating {num_samples} pairs of Booking and SI samples...")
    for i in tqdm(range(num_samples), desc="Generating samples"):
        # generate_booking_reference() receives no per-sample argument, so
        # nothing guarantees uniqueness. Files are opened with 'w', which
        # would silently replace an earlier pair (from this run or a previous
        # one) that happens to share the reference.
        for _ in range(max_reference_attempts):
            booking_reference = generate_booking_reference()
            bkg_path = os.path.join(output_dir_bkg, f"bkg_{booking_reference}.json")
            si_path = os.path.join(output_dir_si, f"si_{booking_reference}.json")
            if not (os.path.exists(bkg_path) or os.path.exists(si_path)):
                break
        else:
            print(f"Could not find an unused booking reference for sample {i+1}. Skipping...")
            continue

        for _ in range(max_retries):
            bkg_data = generate_bkg_sample(model, tokenizer, bkg_templates, booking_reference)
            if not bkg_data:
                continue
            si_data = generate_si_sample(model, tokenizer, si_templates, bkg_data)
            if not si_data:
                continue

            with open(bkg_path, 'w', encoding='utf-8') as f:
                json.dump(bkg_data, f, indent=2)
            with open(si_path, 'w', encoding='utf-8') as f:
                json.dump(si_data, f, indent=2)
            break
        else:
            # Reached only when every attempt failed (no break).
            print(f"Failed to generate valid JSON for sample {i+1} after {max_retries} attempts. Skipping...")

if __name__ == "__main__":
    # Load the model once; the same model/tokenizer pair is handed to create_samples.
    model, tokenizer = load_model_and_tokenizer()

    # Template directories (inputs) and destination directories (outputs).
    sample_data_bkg = "./bkg/"
    sample_data_si = "./si/"
    output_dir_bkg = "./bkg_data/"
    output_dir_si = "./si_data/"

    create_samples(
        model=model,
        tokenizer=tokenizer,
        sample_data_bkg=sample_data_bkg,
        sample_data_si=sample_data_si,
        output_dir_bkg=output_dir_bkg,
        output_dir_si=output_dir_si,
        num_samples=10000,
    )
    print("Sample generation completed.")

## Variable remarks in booking (BKG) data

In [1]:
import copy
import json
import os
import random
from datetime import datetime, timedelta

from tqdm import tqdm
from mlx_lm import load, generate

# Route and vessel vocabularies interpolated into the generation prompts.

# Candidate ports for a westbound sailing, split by role.
WESTBOUND_PORTS = {
    "loading": [
        "NINGBO",
        "SHANGHAI",
        "YANTIAN",
        "SINGAPORE",
        "TANGER",
        "LE HAVRE",
    ],
    "discharge": [
        "LE HAVRE",
        "HAMBURG",
        "GDANSK",
        "ROTTERDAM",
        "ALGECIRAS",
        "PORT KLANG",
        "NINGBO",
    ],
}

# The eastbound lists are the westbound lists with the two roles swapped.
# list(...) makes independent copies so the two constants never share a list object.
EASTBOUND_PORTS = {
    "loading": list(WESTBOUND_PORTS["discharge"]),
    "discharge": list(WESTBOUND_PORTS["loading"]),
}

# Vessel names the model may choose from.
VESSELS = [
    "APL CHANGI",
    "APL MERLION",
    "APL RAFFLES",
    "APL SINGAPURA",
    "APL TEMASEK",
    "APL VANDA",
    "CMA CGM ALEXANDER VON HUMBOLDT",
    "CMA CGM BENJAMIN FRANKLIN",
    "CMA CGM BOUGAINVILLE",
    "CMA CGM EOURES",
    "CMA CGM GEORG FORSTER",
    "CMA CGM GRACE BAY",
    "CMA CGM ROQUEVAIRE",
    "CMA CGM VASCO DE GAMA",
    "CMA CGM ZHENG HE",
]



def load_model_and_tokenizer():
    """Load the Gemma checkpoint via ``load`` (imported from ``mlx_lm``).

    Returns:
        The ``(model, tokenizer)`` pair produced by ``load``.
    """
    checkpoint = "mlx-community/gemma-2-27b-it-4bit"
    print("Loading model and tokenizer...")
    loaded_model, loaded_tokenizer = load(checkpoint)
    return loaded_model, loaded_tokenizer

def load_sample_data(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Args:
        directory: Path of the folder holding the template JSON files.

    Returns:
        A list with one parsed JSON document per file, ordered by file name.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    sample_data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the list
    # (and therefore seeded random.choice() calls on it) reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip non-JSON entries and directories that merely end in ".json".
        if not filename.endswith('.json') or not os.path.isfile(path):
            continue
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            sample_data.append(json.load(f))
    return sample_data

def generate_booking_reference():
    """Return a reference shaped ``CHERRY<YYYYMMDD><NNNN>``.

    The date part is the current local date shifted forward by a random
    0-60 days; ``NNNN`` is a random integer between 1000 and 9999.
    """
    day_offset = random.randint(0, 60)  # random number of days between 0 and 60
    date_stamp = (datetime.now() + timedelta(days=day_offset)).strftime('%Y%m%d')
    suffix = random.randint(1000, 9999)
    return "CHERRY" + date_stamp + str(suffix)

def generate_remarks():
    """Build a random, comma-separated remarks string for a synthetic booking.

    The string always starts with a ``Container Type: ...`` entry. Depending
    on the randomly chosen cargo type it is followed by an out-of-gauge,
    temperature-controlled or dangerous-goods entry, and then by optional
    cargo value, free-time and handling entries.

    Returns:
        The entries joined with ``", "``.
    """
    # Pick the cargo category first; it decides which containers are eligible.
    cargo_type = random.choice(["GENERAL", "OUT_OF_GAUGE", "REEFER", "DANGEROUS"])

    if cargo_type == "OUT_OF_GAUGE":
        container_type = random.choice(["FLAT RACK", "OPEN TOP"])
        length = random.randint(2000, 12000)
        width = random.randint(2000, 3000)
        height = random.randint(2000, 3000)
        cargo_remarks = [f"Out of Gauge Cargo, LxWxH(mm): {length}x{width}x{height}"]
    elif cargo_type == "REEFER":
        container_type = "REEFER"
        temp = random.randint(-20, 20)
        cargo_remarks = [f"Temperature Controlled Cargo: {temp}°C"]
    elif cargo_type == "DANGEROUS":
        container_type = random.choice(["DRY", "HC"])  # DRY or HC is mostly used
        un_number = random.randint(1000, 3500)
        cargo_remarks = [f"Dangerous Goods, UN {un_number}"]
    else:  # GENERAL cargo
        container_type = random.choice(["DRY", "HC"])
        cargo_remarks = []

    # Every branch contributes a container type, so the list is never empty.
    remarks = [f"Container Type: {container_type}", *cargo_remarks]

    # Cargo value: added with 50% probability.
    if random.random() < 0.5:
        value = random.randint(10000, 1000000)
        remarks.append(f"Value: USD{value:,}")

    # Free time: added with 40% probability.
    if random.random() < 0.4:
        days = random.choice([7, 14, 21, 30])
        remarks.append(f"{days} Days Free Time at Destination")

    # Special handling instruction: added with 30% probability.
    special_instructions = [
        "Handle with care",
        "Keep dry",
        "This side up",
        "Fragile",
        "Do not stack"
    ]
    if random.random() < 0.3:
        remarks.append(random.choice(special_instructions))

    return ", ".join(remarks)

def generate_bkg_sample(model, tokenizer, sample_data, booking_reference, max_tokens=2048):
    """Ask the model for one Booking JSON object and normalise the result.

    Args:
        model: Model forwarded to ``generate``.
        tokenizer: Tokenizer forwarded to ``generate``.
        sample_data: Non-empty list of template Booking dicts; one is picked at random.
        booking_reference: Reference written into the prompt and forced into the result.
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        The parsed Booking dict, or ``None`` when the reply contains no
        parseable JSON object (the caller is expected to retry).
    """
    sample_structure = random.choice(sample_data)

    # Loading and discharge ports are listed separately per direction.
    # Concatenating them gave the model the same port set (with duplicates)
    # for both directions, which discarded the role information.
    instruction = f"""
    Generate a new JSON object for a Booking based on the following structure:
    {json.dumps(sample_structure, indent=2)}

    Use the following booking reference: {booking_reference}
    
    Consider the following ports for the route:
    Westbound loading ports: {', '.join(WESTBOUND_PORTS['loading'])}
    Westbound discharge ports: {', '.join(WESTBOUND_PORTS['discharge'])}
    Eastbound loading ports: {', '.join(EASTBOUND_PORTS['loading'])}
    Eastbound discharge ports: {', '.join(EASTBOUND_PORTS['discharge'])}
    
    Available vessels: {', '.join(VESSELS)}

    Ensure all fields are filled with realistic and varied data. 
    The output should be a valid JSON object.
    """

    # NOTE(review): "[INST] ... [/INST]" looks like the Llama-2/Mistral prompt
    # markup, while this notebook loads a Gemma checkpoint - confirm whether
    # the tokenizer's chat template should be used instead.
    prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Drop an echoed prompt, if any, before searching for the JSON payload.
    generated_text = generated_text.replace(prompt, "").strip()

    # The payload is everything from the first '{' to the last '}'.
    start = generated_text.find('{')
    end = generated_text.rfind('}')
    if start == -1 or end <= start:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        data = json.loads(generated_text[start:end + 1])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    # Decide on remarks BEFORE back-filling from the template. Otherwise a
    # reply without remarks inherits the template's static remarks and
    # generate_remarks() is never reached. Non-string remarks are replaced
    # too, because downstream code lower-cases this value.
    model_remarks = data.get("remarks")
    if not isinstance(model_remarks, str) or not model_remarks.strip():
        data["remarks"] = generate_remarks()

    # Ensure all required fields from the sample structure are present.
    for key, value in sample_structure.items():
        if key not in data:
            data[key] = value

    # The caller-supplied reference always wins over whatever the model wrote.
    data["bookingReference"] = booking_reference

    return data

def generate_special_cargo_info(model, tokenizer, remarks, cargo_type, max_tokens=512):
    """Ask the model for special-cargo details matching the booking remarks.

    Args:
        model: Model forwarded to ``generate``.
        tokenizer: Tokenizer forwarded to ``generate``.
        remarks: Booking remarks interpolated into the prompt.
        cargo_type: Label interpolated into the prompt; it also selects the
            fallback generator ("Out of Gauge", "Dangerous Goods" or
            "Reefer Settings").
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        The JSON object parsed from the reply. When the reply holds no
        parseable object, the result of the matching fallback generator, or
        ``None`` when ``cargo_type`` is not one of the three known labels.
    """
    instruction = f"""
    Based on the following remarks for a shipping booking:
    "{remarks}"

    Generate realistic and detailed {cargo_type} information in JSON format.
    Include all relevant fields and ensure the information is consistent with the remarks.
    """

    prompt = f'<s>[INST] {instruction} [/INST]\n'

    raw_reply = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )
    reply = raw_reply.replace(prompt, "").strip()

    # The candidate payload runs from the first '{' to the last '}'.
    first_brace = reply.find('{')
    last_brace = reply.rfind('}')

    if first_brace == -1 or last_brace <= first_brace:
        print(f"No valid {cargo_type} JSON structure found. Using default generation.")
    else:
        try:
            return json.loads(reply[first_brace:last_brace + 1])
        except json.JSONDecodeError:
            print(f"Failed to parse {cargo_type} JSON. Using default generation.")

    # The model reply was unusable: fall back to the non-LLM generators.
    if cargo_type == "Out of Gauge":
        return generate_out_of_gauge_info(remarks)
    if cargo_type == "Dangerous Goods":
        return generate_dangerous_goods_info(remarks)
    if cargo_type == "Reefer Settings":
        return generate_reefer_settings(remarks)
    return None

def _ensure_dict(parent, key):
    """Return ``parent[key]`` as a dict, replacing a missing or non-dict value with ``{}``."""
    value = parent.get(key)
    if not isinstance(value, dict):
        value = {}
        parent[key] = value
    return value


def generate_si_sample(model, tokenizer, sample_data, bkg_data, max_tokens=2048):
    """Generate one Shipping Instruction (SI) dict consistent with a booking.

    Args:
        model: Model forwarded to ``generate``.
        tokenizer: Tokenizer forwarded to ``generate``.
        sample_data: Non-empty list of template SI dicts; one is picked at random.
        bkg_data: Booking dict the SI must agree with. ``bookingReference``,
            ``voyageDetails`` and ``routeDetails`` are copied when present;
            ``shipperName``, ``remarks`` and ``scheduleDetails`` are used
            when present.
        max_tokens: Generation budget forwarded to ``generate``.

    Returns:
        The SI dict, or ``None`` when the reply contains no parseable JSON
        object (the caller is expected to retry).
    """
    # (output key, lower-case trigger phrases, label for generate_special_cargo_info)
    special_cargo_rules = (
        ("outOfGaugeDimensions", ("out of gauge", "oog"), "Out of Gauge"),
        ("dangerousGoods", ("dangerous goods", "hazardous"), "Dangerous Goods"),
        ("reeferSettings", ("reefer", "temperature controlled"), "Reefer Settings"),
    )

    # Pick the template once, so the structure shown to the model is the same
    # one used for back-filling missing keys.
    sample_structure = random.choice(sample_data)

    instruction = f"""
    Generate a new JSON object for a Shipping Instruction based on the following structure and examples:
    {json.dumps(sample_structure, indent=2)}

    Ensure all fields are filled with realistic and varied data. 
    Modify values, names, and details to create a unique SI while maintaining the overall structure.
    Use the following booking data as reference and ensure consistency:
    {json.dumps(bkg_data, indent=2)}
    The output should be a valid JSON object.
    Do not include any special cargo information (such as outOfGaugeDimensions, dangerousGoods, or reeferSettings) unless explicitly mentioned in the booking remarks.
    """

    # NOTE(review): "[INST] ... [/INST]" looks like the Llama-2/Mistral prompt
    # markup, while this notebook loads a Gemma checkpoint - confirm whether
    # the tokenizer's chat template should be used instead.
    prompt = f'<s>[INST] {instruction} [/INST]\n'

    generated_text = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        top_p=0.95,
        verbose=False
    )

    # Drop an echoed prompt, if any, before searching for the JSON payload.
    generated_text = generated_text.replace(prompt, "").strip()

    # The payload is everything from the first '{' to the last '}'.
    start = generated_text.find('{')
    end = generated_text.rfind('}')
    if start == -1 or end <= start:
        print("No valid JSON structure found. Retrying...")
        return None

    try:
        data = json.loads(generated_text[start:end + 1])
    except json.JSONDecodeError:
        print("Failed to parse JSON. Retrying...")
        return None

    # Back-fill missing keys from the template exactly once.
    # - containerDetails and the special-cargo keys are managed below and must
    #   not be resurrected from the template.
    # - Values are deep-copied because nested dicts are mutated further down;
    #   sharing them would corrupt the template for every later sample.
    managed_keys = {"containerDetails"} | {key for key, _, _ in special_cargo_rules}
    for key, value in sample_structure.items():
        if key not in data and key not in managed_keys:
            data[key] = copy.deepcopy(value)

    # Ensure key information is consistent with BKG data. Missing booking
    # keys are skipped instead of raising KeyError and aborting the run.
    for key in ("bookingReference", "voyageDetails", "routeDetails"):
        if key in bkg_data:
            data[key] = bkg_data[key]

    # Ensure shipper name is the same.
    if "shipperName" in bkg_data:
        shipper = _ensure_dict(_ensure_dict(data, "partyDetails"), "shipper")
        shipper["name"] = bkg_data["shipperName"]

    # Special cargo information is only allowed when the booking remarks ask
    # for it, so whatever the model produced is always discarded first.
    for key, _, _ in special_cargo_rules:
        data.pop(key, None)

    remarks = bkg_data.get("remarks")
    if remarks is not None:
        # str() keeps keyword matching working when the remarks are not a string.
        remarks_text = str(remarks).lower()
        for key, triggers, cargo_type in special_cargo_rules:
            if any(trigger in remarks_text for trigger in triggers):
                data[key] = generate_special_cargo_info(model, tokenizer, remarks, cargo_type)

        # Add remarks to additionalInformation.
        _ensure_dict(data, "additionalInformation")["additionalRemarks"] = remarks

    # Ensure onboardDate is the same as estimatedArrivalAtLoadingPort.
    schedule = bkg_data.get("scheduleDetails")
    if isinstance(schedule, dict) and "estimatedArrivalAtLoadingPort" in schedule:
        _ensure_dict(data, "additionalInformation")["onboardDate"] = schedule["estimatedArrivalAtLoadingPort"]

    # Remove containerDetails from SI data if it exists.
    data.pop("containerDetails", None)

    return data

def create_samples(model, tokenizer, sample_data_bkg, sample_data_si, output_dir_bkg, output_dir_si, num_samples=10000, max_retries=5):
    """Generate paired Booking (BKG) and Shipping Instruction (SI) JSON files.

    Args:
        model: Language model forwarded to the sample generators.
        tokenizer: Tokenizer forwarded to the sample generators.
        sample_data_bkg: Directory passed to ``load_sample_data`` for Booking templates.
        sample_data_si: Directory passed to ``load_sample_data`` for SI templates.
        output_dir_bkg: Directory receiving ``bkg_<reference>.json`` (created if missing).
        output_dir_si: Directory receiving ``si_<reference>.json`` (created if missing).
        num_samples: Number of Booking/SI pairs to attempt.
        max_retries: Generation attempts per pair before the pair is skipped.

    A pair is written only when both generators return a truthy result, so the
    two output directories always contain matching references. Nothing is
    returned; progress and failures are reported on stdout.
    """
    # Keep the directory arguments and the loaded templates under separate names.
    bkg_templates = load_sample_data(sample_data_bkg)
    si_templates = load_sample_data(sample_data_si)

    os.makedirs(output_dir_bkg, exist_ok=True)
    os.makedirs(output_dir_si, exist_ok=True)

    # Upper bound on reference draws, so a generator that keeps returning
    # already-used references cannot hang the run.
    max_reference_attempts = 100

    print(f"Generating {num_samples} pairs of Booking and SI samples...")
    for i in tqdm(range(num_samples), desc="Generating samples"):
        # generate_booking_reference() receives no per-sample argument, so
        # nothing guarantees uniqueness. Files are opened with 'w', which
        # would silently replace an earlier pair (from this run or a previous
        # one) that happens to share the reference.
        for _ in range(max_reference_attempts):
            booking_reference = generate_booking_reference()
            bkg_path = os.path.join(output_dir_bkg, f"bkg_{booking_reference}.json")
            si_path = os.path.join(output_dir_si, f"si_{booking_reference}.json")
            if not (os.path.exists(bkg_path) or os.path.exists(si_path)):
                break
        else:
            print(f"Could not find an unused booking reference for sample {i+1}. Skipping...")
            continue

        for _ in range(max_retries):
            bkg_data = generate_bkg_sample(model, tokenizer, bkg_templates, booking_reference)
            if not bkg_data:
                continue
            si_data = generate_si_sample(model, tokenizer, si_templates, bkg_data)
            if not si_data:
                continue

            with open(bkg_path, 'w', encoding='utf-8') as f:
                json.dump(bkg_data, f, indent=2)
            with open(si_path, 'w', encoding='utf-8') as f:
                json.dump(si_data, f, indent=2)
            break
        else:
            # Reached only when every attempt failed (no break).
            print(f"Failed to generate valid JSON for sample {i+1} after {max_retries} attempts. Skipping...")

if __name__ == "__main__":
    # Load the model once; the same model/tokenizer pair is handed to create_samples.
    model, tokenizer = load_model_and_tokenizer()

    # Template directories (inputs) and destination directories (outputs).
    sample_data_bkg = "./bkg/"
    sample_data_si = "./si/"
    output_dir_bkg = "./bkg_data/"
    output_dir_si = "./si_data/"

    create_samples(
        model=model,
        tokenizer=tokenizer,
        sample_data_bkg=sample_data_bkg,
        sample_data_si=sample_data_si,
        output_dir_bkg=output_dir_bkg,
        output_dir_si=output_dir_si,
        num_samples=10000,
    )
    print("Sample generation completed.")


Loading model and tokenizer...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Generating 10000 pairs of Booking and SI samples...


Generating samples:   0%|          | 25/10000 [1:04:18<423:03:40, 152.68s/it]

Failed to parse JSON. Retrying...


Generating samples:   1%|          | 57/10000 [2:32:08<428:11:43, 155.03s/it]

Failed to parse JSON. Retrying...


Generating samples:   1%|          | 62/10000 [2:48:35<447:26:31, 162.08s/it]

Failed to parse JSON. Retrying...


Generating samples:   1%|          | 65/10000 [3:00:11<459:01:04, 166.33s/it]


KeyboardInterrupt: 