Load the training data

In [1]:
from google.colab import files
import os

filename = "housing_data_simulation2.csv"

# Check if the file already exists
if not os.path.exists(filename):
    # If it doesn't exist, upload it
    uploaded = files.upload()
    filename = list(uploaded.keys())[0]  # Get the filename of the uploaded file
    print(f"File '{filename}' uploaded successfully.")
else:
    print(f"File '{filename}' already exists. Skipping upload.")

File 'housing_data_simulation.csv' already exists. Skipping upload.


If necessary, install bitsandbytes

In [12]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

Train the model

In [None]:
import os
from huggingface_hub import login
from google.colab import userdata


# Your current token setup code is fine, just make sure it runs before your model code
hf_token = userdata.get('llama3.1token')  # or use os.environ.get('HUGGINGFACE_TOKEN')
os.environ['HUGGINGFACE_TOKEN'] = hf_token
login(token=hf_token)

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm
import os
from peft import get_peft_model, LoraConfig, TaskType
import bitsandbytes


from google.colab import drive
drive.mount('/content/drive')

# Define a persistent storage location on your Google Drive
MODEL_SAVE_PATH = "/content/drive/MyDrive/ml_models/llama3_real_estate_model"

# Set random seeds for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

# Load and preprocess the data
def load_data(file_path):
    """Load and preprocess the dataset."""
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='latin1')


    # Convert categorical features to strings
    df['Bedrooms'] = df['Bedrooms'].astype(str)
    df['Bathrooms'] = df['Bathrooms'].astype(str)

    # Normalize numerical features
    df['Price'] = df['Price'] / 1000000  # Convert to millions
    df['Square Footage'] = df['Square Footage'] / 1000  # Convert to thousands
    df['Lot Size (Acres)'] = df['Lot Size (Acres)']  # Keep as is

    # Process features list into a string
    if isinstance(df['Features'].iloc[0], str):
        # If features are stored as strings that look like lists, convert them
        try:
            df['Features'] = df['Features'].apply(eval).apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
        except:
            # If eval fails, assume it's already a string
            pass
    elif isinstance(df['Features'].iloc[0], list):
        df['Features'] = df['Features'].apply(lambda x: ', '.join(x))

    return df

# Custom dataset class for Llama
class RealEstateLlamaDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.instructions = [
            "Write a compelling real estate listing description.",
            "Describe this property in a persuasive and engaging way.",
            "Craft an attractive and accurate listing for this home.",
            "Generate a professional real estate description.",
            "Provide a detailed and enticing listing for this property."
        ]
        self.styles = ["<luxury>", "<cozy>", "<modern>", "<family>", "<urban>"]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        property_text = f"Price: ${row['Price']:.6f}M, Bedrooms: {row['Bedrooms']}, Bathrooms: {row['Bathrooms']}, " \
                        f"Square Footage: {row['Square Footage']:.4f}K sq ft, Lot Size: {row['Lot Size (Acres)']:.3f} acres, " \
                        f"Features: {row['Features']}"

        # Pick a random instruction prompt and style
        instruction = random.choice(self.instructions)
        style_token = random.choice(self.styles)

        # You can optionally add a style token here, covered below
        full_text = (
            f"<s>[INST] {style_token} {instruction} "
            f"Make sure that the adjectives match the features. Price, Square Footage, and Lot Size must appear EXACTLY as input.\n\n"
            f"{property_text} [/INST] {row['Listing Description']}</s>"
        )

        encodings = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        labels = encodings['input_ids'].clone()
        inst_tokens = self.tokenizer.encode(" [/INST]", add_special_tokens=False)
        inst_end_positions = [i + len(inst_tokens) - 1 for i in range(len(labels[0]) - len(inst_tokens) + 1)
                              if labels[0][i:i+len(inst_tokens)].tolist() == inst_tokens]
        if inst_end_positions:
            labels[0, :inst_end_positions[0]+1] = -100

        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': labels.squeeze(),
            'raw_property': property_text,
            'raw_description': row['Listing Description']
        }


# Training function using Trainer API
def train_llama_model(model, train_dataset, val_dataset, tokenizer, output_dir, epochs=1, lr=2e-4, batch_size=1):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch
        warmup_steps=50,  # Reduce warmup steps
        weight_decay=0.01,
        logging_dir=f"{output_dir}/logs",
        logging_steps=10,
        eval_strategy="steps",  # Use eval_strategy instead of evaluation_strategy
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,  # Keep only 2 checkpoints to save space
        load_best_model_at_end=True,
        report_to="none",
        learning_rate=lr,
        fp16=True,  # Use fp16 for training
        optim="paged_adamw_8bit",  # Memory-efficient optimizer
        max_grad_norm=0.3,  # Gradient clipping
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    print("Starting training...")
    trainer.train()

    print("Training complete. Saving model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model

# Generate a sample description
def generate_sample(model, tokenizer, dataloader):
    model.eval()

    # Get a random sample
    batch = next(iter(dataloader))
    property_info = batch['raw_property'][0]
    actual_description = batch['raw_description'][0]

    print("\nSample Property:")
    print(property_info)

    print("\nActual Description:")
    print(actual_description)

    # Generate description
    generated_text = generate_listing_description(model, tokenizer, property_info)

    print("\nGenerated Description:")
    print(generated_text)

# Generate listing description
def generate_listing_description(model, tokenizer, property_info, style_token=None, max_length=150):
    model.eval()

    # Prepare input text with prompt in the instruction format
    if style_token is None:
        style_token = random.choice(["<luxury>","<cozy>","<modern>","<family>","<urban>"])
    instruction = (f"{style_token} Write a compelling real estate listing. "
                   f"Price, Square Footage, Lot Size must match exactly.")
    input_text = f"<s>[INST] {instruction}\n\n{property_info} [/INST]"
    tokenized_input = tokenizer(input_text, return_tensors="pt")
    input_ids = tokenized_input["input_ids"].to(device)
    attention_mask = tokenized_input["attention_mask"].to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask = attention_mask,
            max_new_tokens=max_length,
            temperature=1.0,
            top_p=0.9,
            repetition_penalty=1.3,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=False)

    description_part = generated_text

    # First extract the part after [/INST] if present
    if "[/INST]" in description_part:
        description_part = description_part.split("[/INST]")[1]

    # Then remove any end tokens
    end_tokens = ["</s>", "<|end_of_text|>"]
    for token in end_tokens:
        if token in description_part:
            description_part = description_part.split(token)[0]

    description_part = description_part.strip()

    return description_part

# Function to format a new property for prediction
def format_property_input(property_dict):
    price_in_millions = property_dict['Price'] / 1000000
    sq_ft_in_thousands = property_dict['Square Footage'] / 1000

    features = property_dict['Features']
    if isinstance(features, list):
        features = ', '.join(features)

    property_text = f"Price: ${price_in_millions:.6f}M, Bedrooms: {property_dict['Bedrooms']}, " \
                     f"Bathrooms: {property_dict['Bathrooms']}, " \
                     f"Square Footage: {sq_ft_in_thousands:.4f}K sq ft, " \
                     f"Lot Size: {property_dict['Lot Size (Acres)']:.3f} acres, " \
                     f"Features: {features}"

    return property_text

# Setup LoRA for parameter-efficient fine-tuning
def setup_lora_model(base_model_path="meta-llama/Llama-3.1-8B", hf_token=None):
    print(f"Loading base model: {base_model_path}...")

    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_path,
        token=hf_token,
        use_fast=True
    )

    style_tokens = ["<luxury>", "<cozy>", "<modern>", "<family>", "<urban>"]
    tokenizer.add_special_tokens({'additional_special_tokens': style_tokens})

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token


    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # Load the model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=bnb_config,
        device_map="auto",
        token=hf_token
    )

    model.resize_token_embeddings(len(tokenizer))

    # Define LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,  # Lower rank for less memory
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"]  # Optimize for memory: target fewer modules
    )

    # Get the PEFT model
    model = get_peft_model(model, peft_config)

    # Print trainable parameters
    print("Trainable parameters:")
    model.print_trainable_parameters()

    return model, tokenizer

# Main execution
def main(file_path, output_dir="./llama3_real_estate_model", batch_size=4, epochs=3, model_path="meta-llama/Llama-3.1-8B", hf_token=None):
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load data
    df = load_data(file_path)

    # Split the data
    train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)

    # Initialize model and tokenizer with LoRA for efficient fine-tuning
    model, tokenizer = setup_lora_model(model_path, hf_token)

    # Create datasets
    train_dataset = RealEstateLlamaDataset(train_df, tokenizer)
    val_dataset = RealEstateLlamaDataset(val_df, tokenizer)

    # Train model
    trained_model = train_llama_model(
        model,
        train_dataset,
        val_dataset,
        tokenizer,
        output_dir,
        epochs=epochs,
        batch_size=batch_size
    )

    # Create a small dataloader for generating samples
    sample_dataloader = DataLoader(val_dataset, batch_size=1)

    # Generate a sample
    generate_sample(trained_model, tokenizer, sample_dataloader)

    # Test with a new property
    test_property = {
        'Price': 450000,
        'Bedrooms': '3',
        'Bathrooms': '2',
        'Square Footage': 1800,
        'Lot Size (Acres)': 0.25,
        'Features': ['Hardwood floors', 'Updated kitchen', 'Finished basement', 'Deck']
    }

    property_text = format_property_input(test_property)
    description = generate_listing_description(trained_model, tokenizer, property_text)

    print("\nGenerated listing for test property:")
    print(description)

    # Save the trained model
    print("Saving the fine-tuned model...")

    # Save the LoRA adapter weights
    trained_model.save_pretrained(f"{output_dir}/lora_adapter")

    # Save the tokenizer
    tokenizer.save_pretrained(f"{output_dir}/tokenizer")

    print(f"Model and tokenizer saved to {output_dir}")

#hf_token = os.environ.get('HUGGINGFACE_TOKEN', 'COPY TOKEN HERE')

main(
    file_path=filename,
    output_dir = MODEL_SAVE_PATH,
    batch_size=1,
    epochs=2,
    model_path="meta-llama/Llama-3.1-8B",
    hf_token=hf_token
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using cuda device
Loading base model: meta-llama/Llama-3.1-8B...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable parameters:
trainable params: 3,407,872 || all params: 8,033,710,080 || trainable%: 0.0424
Starting training...


Step,Training Loss,Validation Loss
100,0.1236,0.109914
200,0.0495,0.052632
300,0.0431,0.04434


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Training complete. Saving model...





Sample Property:
Price: $0.368600M, Bedrooms: 4, Bathrooms: 2.5, Square Footage: 1.7740K sq ft, Lot Size: 0.470 acres, Features: outdoor living spaces, custom lighting, designer windows, spa-like bathrooms, high-end appliances, smart home features, walk-in closets, gourmet kitchen, hardwood floors

Actual Description:
Welcome to your dream home! This breathtaking 4-bedroom residence redefines modern living. With 2.5 meticulously designed bathrooms and 1774 square feet of living space, every inch has been carefully considered. The 0.5-acre lot provides ample room for your lifestyle. Highlights include custom lighting, hardwood floors, walk-in closets. At $368,600.0, this home is a rare find you won't want to miss.

Generated Description:
Prepare to be impressed by this breathtaking 4-bedroom masterpiece. Anchored in a tree-lined street, the 1774 square foot home is a testament to sophisticated living. 2.5 luxurious bathrooms complement the expansive 0.5-acre property. Experience the ex

Import test file

In [1]:
from google.colab import files
import os

filename = "housing_data_simulation test copy.csv"

# Check if the file already exists
if not os.path.exists(filename):
    # If it doesn't exist, upload it
    uploaded = files.upload()
    filename = list(uploaded.keys())[0]  # Get the filename of the uploaded file
    print(f"File '{filename}' uploaded successfully.")
else:
    print(f"File '{filename}' already exists. Skipping upload.")

File 'housing_data_simulation test copy.csv' already exists. Skipping upload.


Test the model on 50 observations for later annotation

In [3]:
import re
import pandas as pd
import torch
import os
import gc
import random
from tqdm import tqdm
from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel, PeftConfig

def post_process_description(generated_text, property_info):
    """
    Ensures that numerical values in the generated text match the desired format:
    - Price: $300,000 (full dollar amount with commas)
    - Square footage: 1234 (whole number)
    - Acres: 0.15 (decimal number)

    Args:
        generated_text (str): The generated listing description
        property_info (str): The original property information string

    Returns:
        str: The processed description with correctly formatted numerical values
    """
    # Extract the original values from property_info
    price_match = re.search(r'Price: \$([\d.]+)M', property_info)
    sqft_match = re.search(r'Square Footage: ([\d.]+)K', property_info)
    lot_match = re.search(r'Lot Size: ([\d.]+) acres', property_info)

    # Convert to the desired formats
    if price_match:
        # Convert $X.YM to $X,YY0,000
        price_in_millions = float(price_match.group(1))
        price_in_dollars = int(price_in_millions * 1000000)
        formatted_price = "${:,}".format(price_in_dollars)
    else:
        formatted_price = None

    if sqft_match:
        # Convert X.YK to whole number
        sqft_in_thousands = float(sqft_match.group(1))
        formatted_sqft = str(int(sqft_in_thousands * 1000))
    else:
        formatted_sqft = None

    if lot_match:
        # Keep exact decimal format for acreage
        formatted_lot = lot_match.group(1)
    else:
        formatted_lot = None

    # Only proceed with replacements if we have valid values
    if not (formatted_price or formatted_sqft or formatted_lot):
        print("Warning: Could not extract values from property_info")
        return generated_text

    processed_text = generated_text

    # Replace price variations
    if formatted_price:
        # Handle $X.YM format
        processed_text = re.sub(r'\$([\d,.]+)M', formatted_price, processed_text)
        processed_text = re.sub(r'\$([\d,.]+) million', formatted_price, processed_text)
        processed_text = re.sub(r'\$([\d,.]+) Million', formatted_price, processed_text)

        # Handle normal dollar amounts
        processed_text = re.sub(r'\$([\d,]+)', formatted_price, processed_text)

        # Handle spelled out numbers
        million_words = ['million', 'Million']
        for word in million_words:
            pattern = rf'(\$[\d,.]+ {word}|\$[\d,.]+{word})'
            processed_text = re.sub(pattern, formatted_price, processed_text)

    # Replace square footage variations
    if formatted_sqft:
        # Handle XK sq ft format
        processed_text = re.sub(r'([\d,.]+)K sq ft', formatted_sqft + " sq ft", processed_text)
        processed_text = re.sub(r'([\d,.]+)K square feet', formatted_sqft + " square feet", processed_text)

        # Handle normal square footage formats
        processed_text = re.sub(r'([\d,.]+) sq\. ft\.', formatted_sqft + " sq. ft.", processed_text)
        processed_text = re.sub(r'([\d,.]+) square feet', formatted_sqft + " square feet", processed_text)
        processed_text = re.sub(r'([\d,.]+) sf', formatted_sqft + " sq ft", processed_text)

        # Handle thousand variations
        thousand_words = ['thousand square feet', 'thousand sq ft', 'thousand sq. ft.']
        for word in thousand_words:
            pattern = rf'([\d,.]+) {word}'
            processed_text = re.sub(pattern, formatted_sqft + " square feet", processed_text)

    # Replace lot size variations
    if formatted_lot:
        # Standard lot size formats
        processed_text = re.sub(r'([\d,.]+) acre(?!s)', formatted_lot + " acre", processed_text)
        processed_text = re.sub(r'([\d,.]+) acres', formatted_lot + " acres", processed_text)
        processed_text = re.sub(r'([\d,.]+)-acre', formatted_lot + "-acre", processed_text)

    # Check if values were properly inserted and add them if missing
    if formatted_price and formatted_price not in processed_text:
        processed_text = f"Priced at {formatted_price}. " + processed_text

    if formatted_sqft and formatted_sqft + " sq" not in processed_text:
        processed_text = f"This {formatted_sqft} sq ft home " + processed_text

    if formatted_lot and formatted_lot + " acre" not in processed_text:
        if "acre" in processed_text:
            # Try to insert near an existing mention of acres
            processed_text = re.sub(r'([\d,.]+) acres?', f"{formatted_lot} acres", processed_text, count=1)
        else:
            # Add to beginning if no mention exists
            processed_text = f"Situated on {formatted_lot} acres. " + processed_text

    return processed_text


# Define style tokens and instruction prompts
STYLE_TOKENS = ["<luxury>", "<cozy>", "<modern>", "<family>", "<urban>"]
INSTRUCTIONS = [
    "Write a compelling real estate listing description.",
    "Describe this property in a persuasive and engaging way.",
    "Craft an attractive and accurate listing for this home.",
    "Generate a professional real estate description.",
    "Provide a detailed and enticing listing for this property."
]


def generate_batch_descriptions(
    input_csv_path,
    output_csv_path,
    model_path="./llama3_real_estate_model",
    max_length=150,
    save_interval=5
):
    # Load data
    df = pd.read_csv(input_csv_path)
    if 'Generated Listing Description' not in df.columns:
        df['Generated Listing Description'] = None

    # Prepare tokenizer and ensure it matches training vocab
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Add style tokens (must match training)
    tokenizer.add_special_tokens({'additional_special_tokens': STYLE_TOKENS})
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def format_property_input(row):
        price_m = float(row['Price']) / 1_000_000 if row.get('Price') else 0
        sqft_k = float(row['Square Footage']) / 1000 if row.get('Square Footage') else 0
        lot = float(row['Lot Size (Acres)']) if row.get('Lot Size (Acres)') else 0
        features = row.get('Features', "None")
        if pd.isna(features): features = "None"
        return (
            f"Price: ${price_m:.6f}M, Bedrooms: {row.get('Bedrooms','0')}, "
            f"Bathrooms: {row.get('Bathrooms','0')}, "
            f"Square Footage: {sqft_k:.4f}K sq ft, Lot Size: {lot:.3f} acres, Features: {features}"
        )

    def load_model():
        gc.collect(); torch.cuda.empty_cache()
        peft_config_path = os.path.join(model_path, "adapter_config.json")
        is_peft = os.path.exists(peft_config_path)

        if is_peft:
            # Load base model
            peft_cfg = PeftConfig.from_pretrained(model_path)
            base_model = LlamaForCausalLM.from_pretrained(
                peft_cfg.base_model_name_or_path,
                device_map="auto",
                torch_dtype=torch.float16,
                load_in_4bit=True,
                low_cpu_mem_usage=True
            )
            # Resize embeddings to include style tokens
            base_model.resize_token_embeddings(len(tokenizer))
            # Load PEFT adapter weights
            model = PeftModel.from_pretrained(
                base_model,
                model_path,
                torch_dtype=torch.float16
            )
        else:
            # Load full model
            model = LlamaForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )
            # Resize embeddings in case tokenizer vocab differs
            model.resize_token_embeddings(len(tokenizer))

        model.eval()
        return model

    def generate_listing_description(model, property_info, style_token, instruction, max_tokens=max_length):
        # Extract numeric values
        price_m = re.search(r'Price: \$(?P<p>[\d.]+)M', property_info)
        sqft_k = re.search(r'Square Footage: (?P<s>[\d.]+)K', property_info)
        lot = re.search(r'Lot Size: (?P<l>[\d.]+) acres', property_info)

        # Format for prompt
        price_val = f"${int(float(price_m.group('p'))*1e6):,}" if price_m else "unknown price"
        sqft_val = f"{int(float(sqft_k.group('s'))*1000)}" if sqft_k else "unknown sqft"
        lot_val = lot.group('l') if lot else "unknown acreage"

        prompt = (
            f"<s>[INST] {style_token} {instruction} Make sure adjectives match features. "
            f"Price: {price_val}, Square Footage: {sqft_val} sq ft, Lot Size: {lot_val} acres.\n\n"
            f"{property_info} [/INST]"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=1.0,
                top_p=0.95,
                repetition_penalty=1.3,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        text = tokenizer.decode(output[0], skip_special_tokens=False)
        if "[/INST]" in text:
            text = text.split("[/INST]")[1]
        for tok in ["</s>", "<|end_of_text|>"]:
            text = text.split(tok)[0]
        return post_process_description(text.strip(), property_info)

    # Main generation loop
    df_out = df.copy()
    remaining = df_out.index[df_out['Generated Listing Description'].isna()].tolist()
    if remaining:
        model = load_model()
        for i, idx in enumerate(tqdm(remaining), 1):
            row = df_out.loc[idx]
            property_text = format_property_input(row)
            style = random.choice(STYLE_TOKENS)
            instr = random.choice(INSTRUCTIONS)
            desc = generate_listing_description(model, property_text, style, instr)
            df_out.at[idx, 'Generated Listing Description'] = desc
            if i % save_interval == 0 or i == len(remaining):
                df_out.to_csv(output_csv_path, index=False)
                print(f"Saved {i}/{len(remaining)} descriptions")
        del model; torch.cuda.empty_cache(); gc.collect()
    else:
        print("No properties to process.")
    return df_out

if __name__ == "__main__":
    # Update these paths as needed
    input_csv = "/content/housing_data_simulation test copy.csv"  # Your input CSV file
    output_csv = "housing_data_with_descriptions.csv"  # Where to save the results
    model_dir = "/content/drive/MyDrive/ml_models/llama3_real_estate_model"  # Directory containing your trained model

    # Generate descriptions for all properties in the CSV
    generate_batch_descriptions(
        input_csv_path=input_csv,
        output_csv_path=output_csv,
        model_path=model_dir,
        max_length=150,
        save_interval=5  # Save after each property to prevent data loss
    )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

 10%|█         | 5/50 [00:39<05:48,  7.74s/it]

Saved 5/50 descriptions


 20%|██        | 10/50 [01:11<04:06,  6.16s/it]

Saved 10/50 descriptions


 30%|███       | 15/50 [01:51<04:40,  8.02s/it]

Saved 15/50 descriptions


 40%|████      | 20/50 [02:26<03:42,  7.43s/it]

Saved 20/50 descriptions


 50%|█████     | 25/50 [02:57<02:31,  6.05s/it]

Saved 25/50 descriptions


 60%|██████    | 30/50 [03:31<02:11,  6.59s/it]

Saved 30/50 descriptions


 66%|██████▌   | 33/50 [03:53<02:00,  7.08s/it]


KeyboardInterrupt: 

Potentially more creative?

In [None]:
import re
import pandas as pd
import torch
import os
import gc
import random
from tqdm import tqdm
from transformers import AutoTokenizer, LlamaForCausalLM
from peft import PeftModel, PeftConfig

# Expanded style tokens and instruction prompts for greater variety
STYLE_TOKENS = [
    "<luxury>", "<cozy>", "<modern>", "<family>", "<urban>",
    "<rustic>", "<minimalist>", "<elegant>", "<eclectic>", "<charming>"
]
INSTRUCTIONS = [
    "Write a compelling real estate listing description.",
    "Describe this property in a persuasive and engaging way.",
    "Craft an attractive and accurate listing for this home.",
    "Generate a professional real estate description.",
    "Provide a detailed and enticing listing for this property.",
    "Capture the unique charm and features of this home.",
    "Highlight the best selling points of this property in an engaging narrative.",
    "Create a vivid and inviting description for potential buyers.",
    "Compose a descriptive and energetic real estate listing.",
    "Frame this home in a lifestyle-oriented and enticing way."
]

# Sampling configuration ranges for diversity
SAMPLING_CONFIG = {
    "temperature": (0.7, 1.2),
    "top_p": (0.8, 0.98),
    "repetition_penalty": (1.0, 1.4)
}


def post_process_description(generated_text, property_info):
    # (unchanged post-processing logic...)
    return generated_text


def generate_batch_descriptions(
    input_csv_path,
    output_csv_path,
    model_path="./llama3_real_estate_model",
    max_length=150,
    save_interval=5
):
    # Load data
    df = pd.read_csv(input_csv_path)
    df.setdefault('Generated Listing Description', None)

    # Tokenizer setup
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.add_special_tokens({'additional_special_tokens': STYLE_TOKENS})
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def format_property_input(row):
        price_m = float(row.get('Price', 0)) / 1_000_000
        sqft_k = float(row.get('Square Footage', 0)) / 1000
        lot = float(row.get('Lot Size (Acres)', 0))
        features = row.get('Features', "None")
        if pd.isna(features): features = "None"
        return (
            f"Price: ${price_m:.6f}M, Bedrooms: {row.get('Bedrooms','0')}, "
            f"Bathrooms: {row.get('Bathrooms','0')}, "
            f"Square Footage: {sqft_k:.4f}K sq ft, Lot Size: {lot:.3f} acres, Features: {features}"
        )

    def load_model():
        # Clear memory
        gc.collect(); torch.cuda.empty_cache()
        peft_cfg_path = os.path.join(model_path, "adapter_config.json")
        is_peft = os.path.exists(peft_cfg_path)
        if is_peft:
            peft_cfg = PeftConfig.from_pretrained(model_path)
            base = LlamaForCausalLM.from_pretrained(
                peft_cfg.base_model_name_or_path,
                device_map="auto", torch_dtype=torch.float16,
                load_in_4bit=True, low_cpu_mem_usage=True
            )
            base.resize_token_embeddings(len(tokenizer))
            model = PeftModel.from_pretrained(base, model_path, torch_dtype=torch.float16)
        else:
            model = LlamaForCausalLM.from_pretrained(
                model_path,
                device_map="auto", torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )
            model.resize_token_embeddings(len(tokenizer))
        model.eval()
        return model

    def generate_listing_description(model, property_info, style_token, instruction):
        # Randomize sampling parameters
        temp = random.uniform(*SAMPLING_CONFIG['temperature'])
        top_p = random.uniform(*SAMPLING_CONFIG['top_p'])
        rep_pen = random.uniform(*SAMPLING_CONFIG['repetition_penalty'])

        # Extract and format numeric constraints
        price_m = re.search(r'Price: \$(?P<p>[\d.]+)M', property_info)
        sqft_k = re.search(r'Square Footage: (?P<s>[\d.]+)K', property_info)
        lot = re.search(r'Lot Size: (?P<l>[\d.]+) acres', property_info)
        price_val = f"${int(float(price_m.group('p'))*1e6):,}" if price_m else ""
        sqft_val = f"{int(float(sqft_k.group('s'))*1000)}" if sqft_k else ""
        lot_val = lot.group('l') if lot else ""

        prompt = (
            f"<s>[INST] {style_token} {instruction} "
            f"Include exact values: Price: {price_val}, Square Footage: {sqft_val} sq ft, Lot Size: {lot_val} acres.\n\n"
            f"{property_info} [/INST]"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=temp,
                top_p=top_p,
                repetition_penalty=rep_pen,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        text = tokenizer.decode(output[0], skip_special_tokens=False)
        if "[/INST]" in text:
            text = text.split("[/INST]")[1]
        for tok in ["</s>", "<|end_of_text|>"]:
            text = text.split(tok)[0]
        return post_process_description(text.strip(), property_info)

    # Generation loop
    df_out = df.copy()
    to_process = df_out[df_out['Generated Listing Description'].isna()].index.tolist()
    if not to_process:
        print("No new properties to process.")
        return df_out

    model = load_model()
    for idx in tqdm(to_process, desc="Generating listings"):
        row = df_out.loc[idx]
        prop_text = format_property_input(row)
        style = random.choice(STYLE_TOKENS)
        instr = random.choice(INSTRUCTIONS)
        desc = generate_listing_description(model, prop_text, style, instr)
        df_out.at[idx, 'Generated Listing Description'] = desc
        if (to_process.index(idx)+1) % save_interval == 0:
            df_out.to_csv(output_csv_path, index=False)
    # Final save
    df_out.to_csv(output_csv_path, index=False)
    # Cleanup
    del model; torch.cuda.empty_cache(); gc.collect()
    return df_out

if __name__ == "__main__":
    generate_batch_descriptions(
        input_csv_path="properties.csv",
        output_csv_path="output.csv",
        model_path="./llama3_real_estate_model",
        max_length=150,
        save_interval=5
    )


We could use the evaluations to train a model to rate listing descriptions. That way you could produce several but just return the highest rated one.