<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/ft_tourism_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Synthetic dataset for tourism travel planning

In [1]:
import json
import random
from datetime import datetime, timedelta

# --- 1. Define your expanded base data pools (these would be much larger in reality) ---
# Each destination should have diverse options for attractions, hotels, etc.
#
# Schema (one entry per destination name):
#   "attractions":      list of {"name", "type", "visit_time", "cost", "transport_notes"}
#                       - "visit_time" is presumably in hours and "cost" in USD — TODO confirm units
#   "hotels":           list of {"name", "price_per_night", "rating", "location_tags"}
#   "restaurants":      list of {"name", "cuisine", "avg_cost", "type"}
#                       - here "type" is a price/style tier (e.g. "luxury", "casual"), not a venue kind
#   "weather_patterns": month name -> {"avg_temp_c", "precip_days", "notes"}
#   "local_transport":  list of {"type", "cost_per_ride", "notes"}
DESTINATIONS_DATA = {
    "Kyoto": {
        "attractions": [
            {"name": "Kinkaku-ji (Golden Pavilion)", "type": "temple", "visit_time": 1.5, "cost": 5, "transport_notes": "Bus from Kyoto Station"},
            {"name": "Fushimi Inari-taisha Shrine", "type": "shrine/hiking", "visit_time": 2.5, "cost": 0, "transport_notes": "JR Nara Line from Kyoto Station"},
            {"name": "Arashiyama Bamboo Grove", "type": "nature/sightseeing", "visit_time": 2, "cost": 0, "transport_notes": "JR Sagano Line from Kyoto Station"},
            {"name": "Gion District", "type": "historic district", "visit_time": 3, "cost": 0, "transport_notes": "Walk or Bus"},
            {"name": "Kiyomizu-dera Temple", "type": "temple", "visit_time": 2, "cost": 4, "transport_notes": "Walk or Bus"},
            {"name": "Nishiki Market", "type": "market/food", "visit_time": 2, "cost": 0, "transport_notes": "Walk or Bus"}
        ],
        "hotels": [
            {"name": "Ryokan Yoshida-sanso", "price_per_night": 200, "rating": 4.7, "location_tags": ["Higashiyama", "Traditional"]},
            {"name": "Hotel Granvia Kyoto", "price_per_night": 150, "rating": 4.5, "location_tags": ["Kyoto Station", "Modern"]},
            {"name": "Capsule Hotel Kyoto", "price_per_night": 50, "rating": 4.0, "location_tags": ["Downtown", "Budget"]}
        ],
        "restaurants": [
            {"name": "Kyoto Gion Karyo", "cuisine": "Kaiseki", "avg_cost": 100, "type": "luxury"},
            {"name": "Menbakaichidai (Flaming Ramen)", "cuisine": "Ramen", "avg_cost": 20, "type": "casual"},
            {"name": "Nishiki Market Food Stalls", "cuisine": "Street Food", "avg_cost": 15, "type": "budget"},
            {"name": "Saryo Tsujiri (Matcha Desserts)", "cuisine": "Desserts", "avg_cost": 12, "type": "cafe"}
        ],
        # Only the months listed here can be sampled as a trip month for this destination.
        "weather_patterns": {
            "October": {"avg_temp_c": 17, "precip_days": 7, "notes": "mild & pleasant"},
            "November": {"avg_temp_c": 11, "precip_days": 6, "notes": "cool, autumn foliage"}
        },
        "local_transport": [
            {"type": "Kyoto City Bus", "cost_per_ride": 2.5, "notes": "Most common for sightseeing"},
            {"type": "Subway", "cost_per_ride": 2, "notes": "Limited lines, good for specific routes"},
            {"type": "JR Trains", "cost_per_ride": 3, "notes": "Efficient for key attractions"}
        ]
    },
    "Rome": {
        "attractions": [
            {"name": "Colosseum", "type": "ancient ruin", "visit_time": 2, "cost": 20, "transport_notes": "Metro B"},
            {"name": "Roman Forum & Palatine Hill", "type": "ancient ruin", "visit_time": 3, "cost": 0, "transport_notes": "Walk from Colosseum"},
            {"name": "Vatican Museums & Sistine Chapel", "type": "museum", "visit_time": 4, "cost": 25, "transport_notes": "Metro A"},
            {"name": "St. Peter's Basilica", "type": "cathedral", "visit_time": 2, "cost": 0, "transport_notes": "Walk from Vatican Museums"},
            {"name": "Pantheon", "type": "ancient temple/church", "visit_time": 1, "cost": 0, "transport_notes": "Walk/Bus"},
            {"name": "Trevi Fountain", "type": "landmark", "visit_time": 0.5, "cost": 0, "transport_notes": "Walk"}
        ],
        "hotels": [
            {"name": "Hotel Artemide", "price_per_night": 180, "rating": 4.6, "location_tags": ["Termini", "Central"]},
            {"name": "Generator Rome", "price_per_night": 60, "rating": 4.0, "location_tags": ["Esquilino", "Budget"]},
            {"name": "Hotel Santa Maria", "price_per_night": 150, "rating": 4.3, "location_tags": ["Trastevere", "Charming"]}
        ],
        "restaurants": [
            {"name": "Trattoria Da Cesare al Casaletto", "cuisine": "Roman", "avg_cost": 40, "type": "mid-range"},
            {"name": "Da Enzo al 29", "cuisine": "Roman", "avg_cost": 30, "type": "mid-range"},
            {"name": "Supplì Roma", "cuisine": "Street food", "avg_cost": 8, "type": "budget"},
            {"name": "Pizzeria Baffetto 2", "cuisine": "Pizza", "avg_cost": 20, "type": "casual"}
        ],
        # Only the months listed here can be sampled as a trip month for this destination.
        "weather_patterns": {
            "April": {"avg_temp_c": 14, "precip_days": 8, "notes": "mild, spring showers"},
            "July": {"avg_temp_c": 26, "precip_days": 2, "notes": "hot, sunny"}
        },
        "local_transport": [
            {"type": "Metro", "cost_per_ride": 1.5, "notes": "Connects major points"},
            {"type": "Bus/Tram", "cost_per_ride": 1.5, "notes": "Extensive network"},
            {"type": "Walk", "cost_per_ride": 0, "notes": "Best for historic center"}
        ]
    },
    # ... Add more destinations with rich data (Orlando, etc.)
}

# --- User Profile Archetypes (based on your questionnaire) ---
# One archetype is drawn at random per generated example. Fields:
#   "traveler_type": lower-cased and inserted into the user query text
#   "budget_level":  "low" | "medium" | "high" — selects hotel/restaurant price filters and the total budget range
#   "interests":     substrings matched against attraction/restaurant descriptors to prioritise items
#   "pace":          "full schedule" | "balanced" | "relaxed" — controls how many items are scheduled per day
#   "app_comfort":   only echoed into the user query text
USER_PROFILE_ARCHETYPES = [
    # Solo Traveler
    {"traveler_type": "Solo", "budget_level": "medium", "interests": ["cultural", "history"], "pace": "balanced", "app_comfort": "Tech-savvy"},
    {"traveler_type": "Solo", "budget_level": "low", "interests": ["food", "local events"], "pace": "relaxed", "app_comfort": "Average"},
    # Couple Traveler
    {"traveler_type": "Couple", "budget_level": "high", "interests": ["art", "fine dining"], "pace": "relaxed", "app_comfort": "Tech-savvy"},
    {"traveler_type": "Couple", "budget_level": "medium", "interests": ["adventure", "nature"], "pace": "full schedule", "app_comfort": "Average"},
    # Family Traveler
    {"traveler_type": "Family", "budget_level": "high", "interests": ["theme parks", "kid-friendly"], "pace": "full schedule", "app_comfort": "Tech-savvy"},
    {"traveler_type": "Family", "budget_level": "medium", "interests": ["educational", "outdoor"], "pace": "balanced", "app_comfort": "Average"},
    # ... Add more archetypes
]

# --- 2. Main Generation Loop (Enhanced Itinerary Logic) ---
# Accumulates one dict per synthetic example; serialised to JSONL once generation finishes.
generated_dataset = []
# Number of examples to generate; also embedded in the output filename.
num_entries_to_generate = 1000

# Helper function to generate a weather forecast for a day (remains the same)
def get_daily_weather(historical_weather):
    """Simulate a one-day forecast string from monthly climate figures.

    Args:
        historical_weather: Mapping with ``'avg_temp_c'`` (monthly average in
            Celsius) and ``'precip_days'`` (rainy days, scaled against a
            30-day month).

    Returns:
        A string of the form ``"<description>, <C>°C/<F>°F"``.
    """
    baseline_c = historical_weather['avg_temp_c']

    # Average three jittered readings so the daily value clusters near the monthly mean.
    jittered_total = 0
    for _ in range(3):
        jittered_total += baseline_c + random.randint(-3, 3)
    temp_c = int(jittered_total / 3)
    temp_f = int(temp_c * 9/5 + 32)

    rain_probability = historical_weather['precip_days'] / 30

    sky_options = ['Clear skies', 'Partly cloudy', 'Sunny', 'Light rain', 'Cloudy']
    sky_weights = [0.4, 0.3, 0.15, 0.1, 0.05]
    (weather_desc,) = random.choices(sky_options, weights=sky_weights)

    # Two independent draws may turn a dry day rainy; a day that is already
    # rainy keeps its current description.
    for share, rainy_label in ((0.7, 'Light rain'), (0.3, 'Heavy rain')):
        if random.random() < rain_probability * share and 'rain' not in weather_desc:
            weather_desc = rainy_label

    return f"{weather_desc}, {temp_c}°C/{temp_f}°F"

# --- Main loop for generating entries ---

# English month names in calendar order. Used to turn the sampled trip month
# into a month number without depending on the process locale.
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)

# Inclusive (min, max) total trip budget in USD per budget level.
# Unknown levels fall back to the "high" range.
_TOTAL_BUDGET_RANGES_USD = {
    "low": (500, 1000),
    "medium": (1500, 2500),
    "high": (3000, 5000),
}

# Inclusive (min, max) number of scheduled items per day for each pace.
# Any other pace value is treated as 'relaxed'.
_DAILY_CAPACITY_RANGES = {
    "full schedule": (2, 4),
    "balanced": (2, 3),
}
_RELAXED_CAPACITY_RANGE = (1, 2)

# Slots of a day, in order, and the hours available in each
# (e.g. morning 9-12, afternoon 1-4, evening 6-10). This is a simplification.
_DAY_SLOTS = ('morning', 'afternoon', 'evening')
_SLOT_HOURS = {'morning': 3, 'afternoon': 3, 'evening': 4}


def _hotel_fits_budget(hotel, budget_level):
    """Return True when the hotel's nightly price suits the budget level."""
    price = hotel['price_per_night']
    if budget_level == "low":
        return price < 100
    if budget_level == "medium":
        return 50 < price < 250
    if budget_level == "high":
        return price > 150
    return False


def _restaurant_fits_budget(restaurant, budget_level):
    """Return True when the restaurant's average cost suits the budget level."""
    cost = restaurant['avg_cost']
    if budget_level == "low":
        return cost < 20
    if budget_level == "medium":
        return 10 < cost < 50
    if budget_level == "high":
        return cost > 30
    return False


def _matches_interests(item, interests):
    """Return True if any interest is a substring of the item's type, cuisine or location tags."""
    searchable = (
        item.get('type', '').lower(),
        item.get('cuisine', '').lower(),
        ' '.join(item.get('location_tags', [])).lower(),
    )
    return any(interest in text for interest in interests for text in searchable)


def _sample_pool(candidates, preferred_minimum):
    """Randomly pick between `preferred_minimum` and all of `candidates`.

    The minimum is capped by the number of candidates, so an empty or short
    candidate list simply yields a smaller (possibly empty) pool.
    """
    available = len(candidates)
    count = random.randint(min(preferred_minimum, available), available)
    return random.sample(candidates, k=count)


for i in range(num_entries_to_generate):
    # a. Randomly select profile, destination, duration
    profile = random.choice(USER_PROFILE_ARCHETYPES)
    budget_level = profile['budget_level']
    interests = profile['interests']
    destination_name, destination_data = random.choice(list(DESTINATIONS_DATA.items()))
    trip_duration_days = random.randint(3, 7)

    # Randomly select a month for the trip from available weather data
    trip_month = random.choice(list(destination_data["weather_patterns"]))
    historical_weather = destination_data["weather_patterns"][trip_month]

    # Keep the itinerary dates inside the month named in the query and used for
    # the weather. Starting no later than the 20th with at most 7 days means
    # the trip never spills into the next month.
    if trip_month in _MONTH_NAMES:
        start_month = _MONTH_NAMES.index(trip_month) + 1
    else:
        # Unrecognised month label: fall back to an arbitrary month.
        start_month = random.randint(1, 12)
    start_date = datetime(2025, start_month, random.randint(1, 20))

    # b. Construct user_query based on profile and selected destination/dates
    user_query = (
        f"Plan a {trip_duration_days}-day {profile['traveler_type'].lower()} trip to {destination_name} "
        f"in {trip_month}. My budget is {profile['budget_level']} and I'm interested in "
        f"{', '.join(profile['interests'])}. I prefer a {profile['pace']} pace and am "
        f"{profile['app_comfort']} with mobile apps."
    )

    # c. Populate available_information

    # Attractions: keep every interest match plus a random ~50% of the rest,
    # then sample a pool that is deliberately larger than what fits in the trip.
    filtered_attractions = [
        a for a in destination_data["attractions"]
        if any(interest in a.get('type', '').lower() for interest in interests)
        or random.random() < 0.5
    ]
    pool_attractions = _sample_pool(filtered_attractions, max(3, trip_duration_days - 1))

    # Hotels: one hotel within budget, or any hotel if none fits.
    # A destination with no hotels at all yields an empty list.
    budget_hotels = [h for h in destination_data["hotels"] if _hotel_fits_budget(h, budget_level)]
    hotel_candidates = budget_hotels or destination_data["hotels"]
    selected_hotels = random.sample(hotel_candidates, k=min(1, len(hotel_candidates)))

    # Restaurants: budget-filtered pool, sized like the attraction pool.
    filtered_restaurants = [
        r for r in destination_data["restaurants"] if _restaurant_fits_budget(r, budget_level)
    ]
    pool_restaurants = _sample_pool(filtered_restaurants, max(3, trip_duration_days))

    budget_low, budget_high = _TOTAL_BUDGET_RANGES_USD.get(
        budget_level, _TOTAL_BUDGET_RANGES_USD["high"]
    )
    available_info = {
        "total_budget_usd": random.randint(budget_low, budget_high),
        "flights": [], # Needs actual flight generation logic
        "hotels": selected_hotels,
        "attractions": pool_attractions, # Use the pool of attractions
        "restaurants": pool_restaurants, # Use the pool of restaurants
        "historical_weather": historical_weather,
        "local_transportation_options": destination_data["local_transport"],
    }

    # d. Construct day_by_day_itinerary
    day_by_day_itinerary = []

    # Names already scheduled on any day of this trip, so nothing is repeated.
    used_item_names = set()

    # Number of attractions/meals to aim for per day, fixed for the whole trip.
    capacity_low, capacity_high = _DAILY_CAPACITY_RANGES.get(profile['pace'], _RELAXED_CAPACITY_RANGE)
    activity_capacity_per_day = random.randint(capacity_low, capacity_high)

    # Interest matches first. The sort is stable, so pool order is kept within
    # each group. It does not depend on the day, so compute it once per trip.
    prioritized_items = sorted(
        pool_attractions + pool_restaurants,
        key=lambda x: _matches_interests(x, interests),
        reverse=True,
    )

    for day_num in range(1, trip_duration_days + 1):
        current_date = start_date + timedelta(days=day_num - 1)
        daily_weather = get_daily_weather(historical_weather)

        day_plan = {
            "day_number": day_num,
            "date": current_date.strftime("%Y-%m-%d"),
            "weather_forecast": daily_weather,
            "focus": f"Day {day_num} in {destination_name}",
            "morning": [],
            "afternoon": [],
            "evening": []
        }

        hours_left = dict(_SLOT_HOURS)   # remaining hours per slot for this day
        open_slots = list(_DAY_SLOTS)    # slots that may still receive activities

        if day_num == 1:
            # Arrival day: the morning is consumed by travel, and check-in
            # takes 1.5 hours out of the afternoon.
            day_plan["morning"].append("Arrival at destination.")
            hours_left['morning'] = 0
            open_slots.remove('morning')
            if selected_hotels:
                day_plan["afternoon"].append(f"Check into {selected_hotels[0]['name']}.")
                hours_left['afternoon'] -= 1.5
        elif day_num == trip_duration_days:
            # Departure day: the evening is consumed by check-out and travel.
            if selected_hotels:
                day_plan["evening"].append(f"Check out from {selected_hotels[0]['name']} and depart.")
            else:
                day_plan["evening"].append("Check out from accommodation and depart.")
            hours_left['evening'] = 0
            open_slots.remove('evening')

        # --- Distribute Attractions and Restaurants ---
        items_placed = 0
        dinner_booked = False
        for item in prioritized_items:
            if items_placed >= activity_capacity_per_day:
                break # Stop if we've assigned enough items for today

            item_name = item.get('name', 'Unnamed Item')
            if item_name in used_item_names:
                continue

            # Classify by record shape, not by keywords in 'type'. Restaurant
            # records carry a 'cuisine' field; everything else is an attraction.
            # Keyword matching left items such as "ancient ruin" or "casual"
            # unschedulable.
            if 'cuisine' in item:
                if dinner_booked:
                    continue # At most one dinner per evening
                candidate_slots = ('evening',)
                duration = 1.5 # Assume fixed time for meals
                item_cost = item.get('avg_cost', 0)
                activity_desc = f"Dinner at {item_name}"
                if item_cost > 0:
                    activity_desc += f" (Budget: ~${item_cost})"
                is_dinner = True
            else:
                candidate_slots = ('morning', 'afternoon')
                duration = item.get('visit_time', 2) # Default visit time
                item_cost = item.get('cost', 0)
                item_notes = item.get('transport_notes', '')
                activity_desc = f"Visit {item_name}"
                if item_notes:
                    activity_desc += f" ({item_notes})"
                if item_cost > 0:
                    activity_desc += f" (Cost: ${item_cost})"
                is_dinner = False

            # Place in the first open candidate slot with enough time left.
            for slot in candidate_slots:
                if slot in open_slots and hours_left[slot] >= duration:
                    day_plan[slot].append(activity_desc)
                    hours_left[slot] -= duration
                    used_item_names.add(item_name)
                    items_placed += 1
                    dinner_booked = dinner_booked or is_dinner
                    break

        # Add generic fillers to open slots that ended up empty. The arrival
        # morning and departure evening are not open, so they get no filler.
        if 'morning' in open_slots and not day_plan['morning']:
            day_plan['morning'].append("Start the day with breakfast and preparation.")
        if 'afternoon' in open_slots and not day_plan['afternoon']:
            # Add a random local transport note as a filler about half the time
            if destination_data["local_transport"] and random.random() < 0.5:
                transport_option = random.choice(destination_data["local_transport"])
                day_plan['afternoon'].append(f"Explore the area using {transport_option['type']}.")
            else:
                day_plan['afternoon'].append("Free time or explore the local area.")
        if 'evening' in open_slots and not day_plan['evening']:
            day_plan['evening'].append("Enjoy dinner at a local restaurant.")
            day_plan['evening'].append("Evening relaxation or optional activity.")

        # Drop any slot that is still empty so the serialized plan has no empty lists.
        for slot in _DAY_SLOTS:
            if not day_plan[slot]:
                del day_plan[slot]

        day_by_day_itinerary.append(day_plan)

    generated_dataset.append({
        "user_query": user_query,
        "available_information": available_info,
        "day_by_day_itinerary": day_by_day_itinerary
    })

# --- 3. Save to JSONL ---
# The entry count is part of the filename, so runs of different sizes do not overwrite each other.
output_filename = f"synthetic_tourism_dataset_{num_entries_to_generate}_entries_enhanced.jsonl" # New filename to avoid overwriting
with open(output_filename, 'w', encoding='utf-8') as f:
    # One JSON object per line; ensure_ascii=False keeps non-ASCII names readable in the file.
    f.writelines(
        json.dumps(entry, ensure_ascii=False) + '\n'
        for entry in generated_dataset
    )

print(f"Generated {len(generated_dataset)} synthetic entries with enhanced itineraries to {output_filename}")
print("\nNOTE: The itinerary generation logic has been enhanced but is still a simulation. More sophisticated planning (like geographical routing, opening hours, booking details, etc.) would require more complex code and data.")

Generated 1000 synthetic entries with enhanced itineraries to synthetic_tourism_dataset_1000_entries_enhanced.jsonl

NOTE: The itinerary generation logic has been enhanced but is still a simulation. More sophisticated planning (like geographical routing, opening hours, booking details, etc.) would require more complex code and data.


## Fine-tuning

In [None]:
# --- 1. Set Up Your Environment ---
!pip install scikit-learn -q # For potential evaluation metrics (optional)
!pip install -U transformers -q
!pip install -U datasets -q
!pip install -U accelerate -q
!pip install -U peft -q
!pip install -U trl -q # For SFTTrainer
!pip install -U bitsandbytes -q
!pip install unsloth -q # Recommended for speed and efficiency
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # For latest Unsloth

In [None]:
# 0. Initial Setup
# Install necessary libraries if running in Colab/Jupyter (ensure these are installed first)
# !pip install -U transformers datasets accelerate peft trl bitsandbytes unsloth scikit-learn
import torch
import io
import pandas as pd
import json
from datasets import load_dataset, Dataset # Added Dataset for potential manual splits
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer, AutoTokenizer
from huggingface_hub import login # Optional: for pushing model to Hub

# Ensure you are logged into Hugging Face if you plan to push models or use private datasets
# login() # Uncomment and run if needed

# 1. Load the Model and Tokenizer
# Loads the 4-bit quantized base checkpoint through Unsloth. `model` and `tokenizer`
# are module-level names used by the formatting function and the trainer below.
print("Loading DeepSeek-R1 model and tokenizer...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length=2048, # Keep in sync with max_seq_length passed to SFTTrainer; raise if prompt + itinerary is longer
    dtype=None, # Automatically chooses bfloat16 or float16 based on GPU
    load_in_4bit=True, # Enable 4-bit quantization for memory efficiency
)
print("Model and tokenizer loaded.")

# 2. Apply LoRA Adapters
# Wraps the base model with trainable low-rank adapters on the attention and MLP
# projections listed in target_modules.
print("Applying LoRA adapters...")
model = FastLanguageModel.get_peft_model(
    model,
    r=16, # Rank of the LoRA matrices (common values: 8, 16, 32, 64)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16, # Scaling factor for LoRA weights
    lora_dropout=0, # No dropout on the adapter layers during training
    bias="none", # Or "all", "lora_only"
    use_gradient_checkpointing=True, # Recommended for memory saving
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
print("LoRA adapters applied.")

# 3. Load Your Synthetic Dataset
print("Loading and preparing synthetic dataset...")

# --- IMPORTANT: `output_filename` is defined by the dataset-generation cell, so that
# --- cell must have been run in this session. To train from a file generated
# --- earlier, assign its path to `dataset_path` instead.

# Load the full dataset from your .jsonl file
dataset_path = output_filename
full_dataset = load_dataset('json', data_files=dataset_path)

# Split the dataset into training and evaluation sets (90% train, 10% eval).
# load_dataset puts a single data file under the 'train' split; the fixed seed
# makes the split reproducible.
raw_dataset_split = full_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset_raw = raw_dataset_split['train']
eval_dataset_raw = raw_dataset_split['test']

print(f"Raw dataset loaded. Training entries: {len(train_dataset_raw)}, Evaluation entries: {len(eval_dataset_raw)}")


# 4. Define the **MODIFIED** Formatting Function
# This function converts your synthetic dataset entries into the
# chat format that the DeepSeek-R1 model will be trained on.

def format_daily_itinerary_example(example):
    """Render one dataset row as a chat-formatted training string.

    The user turn contains the travel request plus a textual dump of the
    available information. The assistant turn is the day-by-day itinerary.
    The rendered conversation is stored under ``example["text"]``.

    Optional fields are tested by value, not by key presence. Rows loaded
    through an Arrow-backed dataset share one schema, so a key that is missing
    in some rows comes back as ``None`` instead of being absent. A plain
    ``key in dict`` test would then emit text such as ``"Morning: None"``.

    Args:
        example: Row with ``"user_query"`` (str), ``"available_information"``
            (dict) and ``"day_by_day_itinerary"`` (list of per-day dicts).

    Returns:
        The same ``example`` mapping with an added ``"text"`` entry.

    Note:
        Relies on the module-level ``tokenizer`` for the chat template.
    """
    user_query = example["user_query"]
    available_info = example["available_information"]
    day_by_day_itinerary = example["day_by_day_itinerary"]

    # --- Construct the 'Available Information' string for the prompt ---
    info_parts = ["\nAvailable Information:\n"]

    # List-valued sections rendered as one JSON object per line; empty or
    # missing sections are omitted entirely.
    json_sections = (
        ("flights", "Flights:\n"),
        ("hotels", "Accommodation:\n"),
        ("attractions", "Attractions:\n"),
        ("restaurants", "Dining:\n"),
    )
    for key, heading in json_sections:
        records = available_info.get(key)
        if records:
            info_parts.append(heading + "\n".join(json.dumps(r) for r in records) + "\n")

    total_budget = available_info.get("total_budget_usd")
    if total_budget is not None:
        info_parts.append(f"Total Budget: ${total_budget}\n")

    historical_weather = available_info.get("historical_weather")
    if historical_weather is not None:
        avg_temp_c = historical_weather.get('avg_temp_c', 'N/A')
        notes = historical_weather.get('notes', 'N/A')
        info_parts.append(f"Historical Weather Context: Avg Temp {avg_temp_c}°C, {notes}\n")

    transport_options = available_info.get("local_transportation_options")
    if transport_options is not None:
        info_parts.append(
            "Local Transportation Options:\n"
            + "\n".join(f"- {t['type']}: {t['notes']}" for t in transport_options)
            + "\n"
        )

    formatted_available_info_str = "".join(info_parts)

    # --- Construct the TARGET 'assistant' output (the per-day itinerary) ---
    itinerary_parts = ["Proposed Travel Itinerary:\n"]
    for day_plan in day_by_day_itinerary:
        itinerary_parts.append(f"\nDay {day_plan['day_number']}: {day_plan['date']}")
        if day_plan.get('weather_forecast') is not None:
            itinerary_parts.append(f" (Weather: {day_plan['weather_forecast']})")
        if day_plan.get('focus') is not None:
            itinerary_parts.append(f"\nFocus: {day_plan['focus']}\n")
        else:
            itinerary_parts.append("\n")
        for slot_key, slot_label in (("morning", "Morning"), ("afternoon", "Afternoon"), ("evening", "Evening")):
            activities = day_plan.get(slot_key)
            if activities:  # skips both missing slots and null/empty ones
                itinerary_parts.append(f"  {slot_label}: {activities}\n")
    generated_itinerary_text = "".join(itinerary_parts)

    # --- Combine into the chat format for training ---
    messages = [
        {"role": "user", "content": f"User's travel request: {user_query}\n\n{formatted_available_info_str}\n\nBased on the user's travel request and the available information, please generate a detailed travel itinerary broken down by day, including flights, activities, restaurant recommendations, and accommodation options. Prioritize minimizing travel time between activities. Format as a day-by-day plan."}
        ,
        {"role": "assistant", "content": generated_itinerary_text}
    ]

    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False)
    return example

# Apply the formatting function to your training and evaluation datasets.
# batched=False means the function receives one row (a dict) per call; each
# output row gains the "text" column consumed by the trainer.
print("Applying formatting function to datasets...")
train_dataset = train_dataset_raw.map(format_daily_itinerary_example, batched=False)
eval_dataset = eval_dataset_raw.map(format_daily_itinerary_example, batched=False)
print("Dataset preparation complete with per-day formatting.")


# 5. Set Up and Configure the Trainer
# Builds the supervised fine-tuning trainer over the formatted datasets. Training
# itself is started in a later cell.
print("Setting up SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset, # Now using the formatted datasets
    eval_dataset=eval_dataset,   # Now using the formatted datasets
    dataset_text_field="text", # This field holds the formatted chat messages
    max_seq_length=2048, # Same value as used when loading the model; ensure it fits your long itineraries
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(), # Use fp16 if bfloat16 not supported
        bf16=torch.cuda.is_bf16_supported(),     # Use bf16 if supported (recommended)
        logging_steps=10,
        output_dir="./deepseek_r1_tourism_planner_finetuned", # Same directory the save step writes to
        optim="adamw_8bit",
        seed=3407,
        save_steps=500, # Kept equal to eval_steps so each checkpoint has a matching evaluation
        save_total_limit=2,
        eval_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True, # Restores the checkpoint with the lowest eval_loss after training
        metric_for_best_model="eval_loss",
        greater_is_better=False, # Lower loss is better
        report_to="none", # Disable logging to Weights & Biases if not needed
    ),
)
print("SFTTrainer configured.")


# 8. Model Evaluation (Conceptual usage after fine-tuning)
# This section demonstrates how you would use the fine-tuned model for inference.
# You'd load the model from `output_dir` and pass new prompts to it.
# Example inference code would be similar to what was outlined in the evaluation code.

In [None]:
# 6. Start Training
print("\n\nStarting training...")
from unsloth import unsloth_train

# unsloth_train(trainer) is used in place of trainer.train(), which is flagged
# as having buggy gradient accumulation -- see https://unsloth.ai/blog/gradient
trainer_stats = unsloth_train(trainer)
print("Training complete.\n\n")


# 7. Save Your Fine-tuned Model
# The model and the tokenizer are written by two separate save_pretrained
# calls into the same directory. The second positional parameter of the
# model's save_pretrained is not a tokenizer, so passing the tokenizer there
# would not write any tokenizer files.
output_dir = "./deepseek_r1_tourism_planner_finetuned"
print(f"Saving fine-tuned model to {output_dir}...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Model saved locally.")



Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 900 | Num Epochs = 3 | Total steps = 1,350
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss,Validation Loss


Unsloth: Will smartly offload gradients to save VRAM!


## Model evaluation

In [None]:
# Install the colab-env package quietly (-q) and import it.
# NOTE(review): presumably the import is what makes /content/gdrive available,
# since later cells read and write under that path -- confirm.
!pip install colab-env -q
import colab_env

In [6]:
# Copy the saved model directory from /content to the Google Drive location
# that the evaluation cell loads the fine-tuned model from.
# NOTE(review): if the destination directory already exists, `cp -r` places the
# source directory *inside* it (nested) instead of overwriting it; the parent
# directory /content/gdrive/MyDrive/model must already exist.
!cp -r /content/deepseek_r1_tourism_planner_finetuned /content/gdrive/MyDrive/model/deepseek_r1_tourism_planner_finetuned

In [None]:
# Install the metric libraries imported by the evaluation cell:
# rouge_score (ROUGE) and sacrebleu (BLEU). -q keeps pip's output quiet.
!pip install rouge_score -q
!pip install sacrebleu -q

In [None]:
# --- Evaluation Code Block ---

import torch
import json
import pandas as pd # For saving results
from datasets import load_dataset # For loading evaluation data

# Import libraries for model loading and generation
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer # TextStreamer is optional
from unsloth import FastLanguageModel # Using Unsloth for efficient loading

# Import libraries for metrics calculation
from rouge_score import rouge_scorer
import sacrebleu
import numpy as np # For calculating averages

# --- Configuration ---
# Sequence-length limit handed to the model loader; it has to equal the
# value that was used during fine-tuning.
max_seq_length = 2048

# How many examples of the evaluation split are run through generation.
# Keep it small for a quick check, or use len(eval_dataset) to cover everything.
num_examples_to_evaluate = 10  # Example: evaluate the first 10 entries

# Synthetic dataset file, in JSONL format.
dataset_path = 'synthetic_tourism_dataset_100_entries.jsonl'

# Directory that contains the saved fine-tuned model files.
# Google Drive copy shown here; for the local copy use
# "./deepseek_r1_tourism_planner_finetuned" instead.
fine_tuned_model_path = "/content/gdrive/MyDrive/model/deepseek_r1_tourism_planner_finetuned"

# --- 1. Load the Fine-tuned Model and Tokenizer ---
print(f"Loading fine-tuned model from {fine_tuned_model_path}...")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=fine_tuned_model_path,  # Load from the saved model directory
        max_seq_length=max_seq_length,     # Must match the max_seq_length used during fine-tuning
        dtype=None,                        # Auto-detect
        load_in_4bit=True,                 # Load in 4-bit for memory efficiency
    )
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please check that the model path is correct and the model files exist.")
    # Re-raise instead of calling exit(): inside a notebook exit() does not
    # reliably stop the running cell, so the code below would continue with
    # `model` undefined. Re-raising halts here and keeps the traceback.
    raise

# No explicit model.to("cuda"): the loader already places the 4-bit weights on
# their device, and transformers can reject .to() on bitsandbytes-quantized
# models, which previously turned a successful load into a reported failure.
if not torch.cuda.is_available():
    print("CUDA not available. Model loading on CPU (will be slower).")

# Put the Unsloth model into inference mode before generate() is called.
FastLanguageModel.for_inference(model)

print("Model and tokenizer loaded.")

# Optional: Set up TextStreamer for real-time output during generation
# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# --- 2. Load the Evaluation Dataset ---
print(f"Loading evaluation dataset from {dataset_path}...")
try:
    full_dataset_for_eval = load_dataset('json', data_files=dataset_path)
except FileNotFoundError:
    print(f"Error: Dataset file not found at {dataset_path}")
    print("Please ensure 'synthetic_tourism_dataset_100_entries.jsonl' is in the correct directory.")
    # Re-raise rather than exit(): in a notebook exit() does not reliably stop
    # the cell, and the statements below would run without a dataset.
    raise
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# A single JSON data file is loaded into a 'train' split by default.
# NOTE(review): test_size and seed are meant to reproduce the split used for
# training so that these test examples were not trained on -- confirm they
# match the training cell.
raw_dataset_split_for_eval = full_dataset_for_eval['train'].train_test_split(test_size=0.1, seed=42)
eval_dataset = raw_dataset_split_for_eval['test']  # Held-out split used for evaluation

print(f"Evaluation dataset loaded with {len(eval_dataset)} entries.")

# Never ask for more examples than the test split contains.
num_examples_to_evaluate = min(num_examples_to_evaluate, len(eval_dataset))
print(f"Evaluating on {num_examples_to_evaluate} examples from the test set.")


# --- 3. Define the PROMPT CONSTRUCTION function for inference ---
# This function should mirror the 'user' part of your formatting function used during training.
# It takes raw data and turns it into the prompt the model expects.

def construct_inference_prompt(user_query, available_information):
    """Build the tokenized chat prompt for one travel-planning request.

    The prompt text consists of the user's request, an "Available Information"
    section assembled from ``available_information``, and a fixed instruction
    asking for a day-by-day itinerary. It is wrapped as a single user turn and
    passed through the module-level ``tokenizer``'s chat template with the
    generation prompt appended, so the model continues with the assistant turn.

    NOTE(review): the text layout is meant to mirror the user part of the
    formatting function used for training -- check that function before
    changing any wording here.

    Args:
        user_query: The user's travel request text.
        available_information: Mapping that may contain ``flights``,
            ``hotels``, ``attractions``, ``restaurants`` (lists of
            JSON-serializable entries), ``total_budget_usd``,
            ``historical_weather`` (mapping) and
            ``local_transportation_options`` (list of mappings). Absent or
            empty sections are left out of the prompt.

    Returns:
        The tensor returned by ``tokenizer.apply_chat_template`` with
        ``return_tensors="pt"``, moved to CUDA when CUDA is available.
    """
    sections = ["\nAvailable Information:\n"]

    # List-valued sections: a heading followed by one JSON object per line.
    for key, heading in (
        ("flights", "Flights"),
        ("hotels", "Accommodation"),
        ("attractions", "Attractions"),
        ("restaurants", "Dining"),
    ):
        entries = available_information.get(key)
        if entries:
            sections.append(
                f"{heading}:\n" + "\n".join(json.dumps(entry) for entry in entries) + "\n"
            )

    if "total_budget_usd" in available_information:
        sections.append(f"Total Budget: ${available_information['total_budget_usd']}\n")

    # A key that is present but None is treated like a missing key instead of
    # raising on .get(). NOTE(review): presumably this can occur when the
    # dataset loader fills fields absent from a record -- confirm.
    weather = available_information.get("historical_weather")
    if weather is not None:
        # Celsius may be stored under either of two key names.
        avg_temp_c = weather.get('avg_temp_c', weather.get('avg_temp_celsius', 'N/A'))
        avg_temp_f = weather.get('avg_temp_fahrenheit', 'N/A')
        notes = weather.get('notes', 'N/A')

        weather_line = "Historical Weather Context:"
        if avg_temp_c != 'N/A':
            weather_line += f" Avg Temp {avg_temp_c}°C"
        if avg_temp_f != 'N/A':
            weather_line += f" ({avg_temp_f}°F)"
        if notes != 'N/A':
            weather_line += f", {notes}"
        sections.append(weather_line + "\n")

    transport_options = available_information.get("local_transportation_options")
    if transport_options is not None:
        sections.append(
            "Local Transportation Options:\n"
            + "\n".join(
                f"- {t.get('type', 'N/A')}: {t.get('notes', 'N/A')}"
                for t in transport_options
            )
            + "\n"
        )

    formatted_available_info_str = "".join(sections)

    # The general instruction for the model to generate a per-day itinerary
    instruction = "Based on the user's travel request and the available information, please generate a detailed travel itinerary broken down by day, including flights, activities, restaurant recommendations, and accommodation options. Prioritize minimizing travel time between activities. Format as a day-by-day plan."

    full_user_prompt = f"User's travel request: {user_query}\n\n{formatted_available_info_str}\n{instruction}"

    messages = [{"role": "user", "content": full_user_prompt}]
    # add_generation_prompt=True appends the assistant-turn header so that
    # generate() produces the reply rather than continuing the user turn.
    # The chat template inserts the special tokens itself, so no
    # add_special_tokens argument is passed.
    tokenized_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # Move input to GPU if CUDA is available
    if torch.cuda.is_available():
        tokenized_input = tokenized_input.to("cuda")

    return tokenized_input


# --- 4. Evaluation Loop with Metrics ---
print("\n--- Starting Evaluation Loop ---")

# Per-example records, plus the raw metric values that feed the averages.
results, rouge_scores, bleu_scores = [], [], []

# Scorer producing ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled.
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Settings forwarded to model.generate() for every example (adjust as needed).
generation_kwargs = dict(
    max_new_tokens=1024,                  # upper bound on the generated itinerary length
    use_cache=True,
    temperature=0.7,                      # lower values give more deterministic output
    top_p=0.95,                           # nucleus-sampling cutoff
    do_sample=True,                       # sample instead of greedy decoding
    pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence token
    # streamer=streamer,                  # uncomment if using TextStreamer
)


def _ground_truth_as_text(day_plans):
    """Flatten a structured day-by-day itinerary into one reference string.

    Each day contributes a "Day N: date" header (with the weather forecast
    when present), a focus line, and whichever of the morning / afternoon /
    evening entries the day contains. The result is stripped and ends with a
    single newline.
    """
    pieces = ["Proposed Travel Itinerary:\n"]
    for plan in day_plans:
        pieces.append(f"\nDay {plan.get('day_number', 'N/A')}: {plan.get('date', 'N/A')}")
        if 'weather_forecast' in plan:
            pieces.append(f" (Weather: {plan['weather_forecast']})")
        pieces.append(f"\nFocus: {plan.get('focus', 'N/A')}\n")
        for slot, label in (('morning', 'Morning'), ('afternoon', 'Afternoon'), ('evening', 'Evening')):
            if slot in plan:
                pieces.append(f"  {label}: {plan.get(slot, 'N/A')}\n")
    return "".join(pieces).strip() + "\n"


for i in range(num_examples_to_evaluate):
    example = eval_dataset[i]
    original_query = example["user_query"]
    reference_info = example["available_information"]
    # Structured reference itinerary stored with the example.
    ground_truth_itinerary_structured = example["day_by_day_itinerary"]

    print(f"\n--- Evaluation Case {i+1}/{num_examples_to_evaluate} ---")
    print(f"Original Query: {original_query}")

    # Text-based metrics need the reference as a single string.
    ground_truth_text = _ground_truth_as_text(ground_truth_itinerary_structured)

    print("\n--- Ground Truth Itinerary (from dataset) ---\n")
    print(ground_truth_text)

    # Tokenized prompt for this example.
    input_ids = construct_inference_prompt(original_query, reference_info)

    try:
        outputs = model.generate(input_ids=input_ids, **generation_kwargs)
        # generate() returns prompt + continuation; keep only the tokens that
        # come after the prompt (input_ids.shape[1] is the prompt length).
        prompt_length = input_ids.shape[1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during text generation for example {i+1}: {e}")
        generated_text = "Error generating itinerary."  # marker for a failed generation

    print("\n--- Generated Itinerary ---\n")
    print(generated_text)

    # --- Metrics for the current example (stay None when not computed) ---
    current_rouge_scores = None
    current_bleu_score = None

    if generated_text == "Error generating itinerary.":
        print("Skipping metric calculation due to generation error.")
    else:
        try:
            # ROUGE: reference first, candidate second.
            current_rouge_scores = rouge_scorer_obj.score(ground_truth_text, generated_text)
            rouge_scores.append(current_rouge_scores)
            rouge1_f, rouge2_f, rougeL_f = (
                current_rouge_scores[metric].fmeasure
                for metric in ('rouge1', 'rouge2', 'rougeL')
            )
            print(f"ROUGE Scores: ROUGE-1: {rouge1_f:.4f}, ROUGE-2: {rouge2_f:.4f}, ROUGE-L: {rougeL_f:.4f}")

            # BLEU: sacrebleu takes a list of hypotheses and a list of
            # reference lists, i.e. [hypothesis] and [[reference]] here.
            current_bleu = sacrebleu.corpus_bleu([generated_text], [[ground_truth_text]])
            current_bleu_score = current_bleu.score
            bleu_scores.append(current_bleu_score)
            print(f"BLEU Score: {current_bleu_score:.4f}")
        except Exception as e:
            # Whatever was not computed for this example remains None.
            print(f"Error calculating metrics for example {i+1}: {e}")

    # Keep everything for the summary and the saved results file.
    results.append({
        "original_query": original_query,
        "generated_itinerary": generated_text,
        "ground_truth_itinerary": ground_truth_text,  # reference as text
        "rouge_scores": current_rouge_scores,         # ROUGE score mapping, or None
        "bleu_score": current_bleu_score,             # BLEU score (float), or None
    })

print("\n--- Evaluation Loop Finished ---")

# --- 5. Analysis and Metrics Summary ---
print("\n--- Overall Evaluation Summary ---")

# Only entries that hold an actual score take part in the averages.
valid_rouge_scores = [score for score in rouge_scores if score is not None]
valid_bleu_scores = [score for score in bleu_scores if score is not None]

if not valid_rouge_scores:
    print("No valid ROUGE scores were calculated.")
else:
    # Mean F-measure per ROUGE variant across the scored examples.
    avg_rouge1, avg_rouge2, avg_rougeL = (
        np.mean([score[metric].fmeasure for score in valid_rouge_scores])
        for metric in ('rouge1', 'rouge2', 'rougeL')
    )
    for label, average in (("ROUGE-1", avg_rouge1), ("ROUGE-2", avg_rouge2), ("ROUGE-L", avg_rougeL)):
        print(f"Average {label} F-measure (over {len(valid_rouge_scores)} examples): {average:.4f}")


if not valid_bleu_scores:
    print("No valid BLEU scores were calculated.")
else:
    avg_bleu = np.mean(valid_bleu_scores)
    print(f"Average BLEU Score (over {len(valid_bleu_scores)} examples): {avg_bleu:.4f}")

# --- Save Results ---
# The per-example records go into a DataFrame and are written out as JSON
# Lines (one record per line). A failure is reported but does not abort.
output_filename = "evaluation_results_with_metrics.jsonl"
try:
    df_results = pd.DataFrame(results)
    df_results.to_json(output_filename, orient="records", lines=True)
    print(f"\nEvaluation complete. Detailed results saved to '{output_filename}'.")
except Exception as e:
    print(f"Error saving results to JSONL: {e}")