In [1]:
# importing the depenendencies

from transformers import pipeline
import pandas as pd
import re
import unicodedata
from transformers import T5ForConditionalGeneration , T5Tokenizer

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# --- 1. Load the NEW Prepared Data ---
print("Loading the prepared training dataset...")
try:
    df = pd.read_csv("/content/final_training_data (1).csv")
except FileNotFoundError:
    print("Error: '/content/final_training_data.csv' not found.")
    print("Please make sure you have successfully run the 'prepare_data.py' script first.")
    exit()

# The new dataset already has 'query' and 'plan' columns.
# We just need to rename them for the training function.
df = df.rename(columns={"query": "input_text", "plan": "target_text"})

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# --- 2. Tokenizer and Model ---
model_name = "t5-small"
print(f"Loading tokenizer and model for '{model_name}'...")
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# --- 3. Tokenize the Dataset ---
def tokenize_function(examples):
    # For T5, it's good practice to add a prefix to the input.
    prefix = "generate travel plan: "
    inputs = [prefix + doc for doc in examples['input_text']]

    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

    # Tokenize the target texts (the 'plan' column)
    labels = tokenizer(text_target=examples['target_text'], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing the dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# --- 4. Set Up Training ---
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,  # Increased epochs for better learning on the new data
    per_device_train_batch_size=4, # Smaller batch size for potentially longer sequences
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# --- 5. Train the Model ---
print("\nStarting model training on the new data...")
trainer.train()
print("Training complete.")

# --- 6. Save the Final Model ---
final_model_path = "./t5-travel-planner-finetuned"
print(f"Saving the fine-tuned model to '{final_model_path}'...")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print("Model saved successfully!")



Loading the prepared training dataset...
Loading tokenizer and model for 't5-small'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing the dataset...


Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]


Starting model training on the new data...


Epoch,Training Loss,Validation Loss
1,11.95,0.532432
2,0.4657,0.252358
3,0.2314,0.121119
4,0.1341,0.0696
5,0.0928,0.047887
6,0.0711,0.03619
7,0.0595,0.029718
8,0.0504,0.026331
9,0.0487,0.024185
10,0.0456,0.023621


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training complete.
Saving the fine-tuned model to './t5-travel-planner-finetuned'...
Model saved successfully!


In [15]:
from transformers import pipeline
import pandas as pd
import re
import unicodedata

# 1. Define Model and Data Paths
model_path = "./t5-travel-planner-finetuned"
data_path = "./cleaned_dataset_travelPlanner.csv"

# 2. Load the Model and Dataset
print("Loading your fine-tuned model and dataset...")
try:
    planner = pipeline("text2text-generation", model=model_path, tokenizer=model_path)
    df = pd.read_csv(data_path)
except (OSError, FileNotFoundError) as e:
    print(f"Error: Could not find a required model or data file. {e}")
    print("Please make sure the model path and data path are correct.")
    exit()

# Utility function for cleaning text
def normalize(s):
    if pd.isnull(s):
        return ""
    s = str(s)
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("utf-8")
    return s.strip().lower()

# 3. Get User Input
query = input("\nEnter your travel query:\n> ")

# 4. Prepare the Prompt for AI
prefix = "generate travel plan: "
prefixed_query = prefix + query

# 5. Generate the AI Plan
generated_plan = planner(prefixed_query, max_length=512, num_beams=5, early_stopping=True)[0]['generated_text']

# Extract details from query using regex
city_match = re.search(r'to\s+([\w\s]+?)\s+for', query, re.IGNORECASE)
destination_city = city_match.group(1).strip() if city_match else "Goa" # Default for example
normalized_city = normalize(destination_city)

days_match = re.search(r'(\d+)\s+days?', query, re.IGNORECASE)
num_days = int(days_match.group(1)) if days_match else 3

budget_match = re.search(r'budget of\s+[₹Rs]*\s*([\d,]+)', query, re.IGNORECASE)
total_budget = int(budget_match.group(1).replace(",", "")) if budget_match else 15000

# Budget allocation
hotel_budget = total_budget * 0.4
food_budget = total_budget * 0.3
attraction_budget = total_budget * 0.3

hotel_budget_per_night = hotel_budget / num_days
food_budget_per_meal = (food_budget / num_days) / 3 # Assuming 3 meals
attraction_budget_per_day = attraction_budget / num_days

# Prepare the single dataset for filtering
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df.dropna(subset=['price', 'rating', 'city', 'category', 'name'], inplace=True)

city_df = df[df['city'].apply(normalize).str.contains(normalized_city, na=False)]
hotels = city_df[(city_df['category'] == 'hotel') & (city_df['price'] <= hotel_budget_per_night)].sort_values('rating', ascending=False).head(5)
attractions = city_df[(city_df['category'] == 'attraction') & (city_df['price'] <= attraction_budget_per_day)].sort_values('rating', ascending=False).head(5)

# First, try to find restaurants within the budget
restaurants = city_df[(city_df['category'] == 'restaurant') & (city_df['price'] <= food_budget_per_meal)].sort_values('rating', ascending=False).head(5)

# A flag to know which message to display later
using_fallback_restaurants = False
# If no restaurants are found within the budget, find the top 5 regardless of price
if restaurants.empty and not city_df.empty:
    print("No restaurants found within budget. Finding top-rated options instead...")
    restaurants = city_df[city_df['category'] == 'restaurant'].sort_values('rating', ascending=False).head(5)
    using_fallback_restaurants = True


# 7. Display the Combined Results
print("      Your Complete Travel Itinerary")

print(f"\nDestination: {destination_city} ({num_days} days, Budget: ₹{total_budget:,})")
print(f"  - Hotel Budget: ~₹{hotel_budget_per_night:,.0f}/night")
print(f"  - Food Budget: ~₹{food_budget_per_meal:,.0f}/meal")
print(f"  - Attractions Budget: ~₹{attraction_budget_per_day:,.0f}/day")

# Display Hotels
if not hotels.empty:
    print("\n🏨 Top Hotel Suggestions (within budget):")
    for _, row in hotels.iterrows():
        print(f"  - {row['name']} | Rating: {row['rating']} | Cost/Night: ₹{row['price']:,.0f}")
else:
    print("\n🏨 No hotels found within budget.")

# Display Restaurants
if not restaurants.empty:
    # Display a different header depending on whether we used the fallback logic
    if using_fallback_restaurants:
        print("\n🍜 Top-Rated Restaurant Suggestions (may be outside budget):")
    else:
        print("\n🍜 Top Restaurant Suggestions (within budget):")

    for _, row in restaurants.iterrows():
        print(f"  - {row['name']} | Rating: {row['rating']} | Cost: ₹{row['price']:,.0f} | Cuisine: {row['cuisine_or_amenities']}")
else:
    # This message now only shows if there are no restaurants at all in the city's data
    print("\n🍜 No restaurants found in this city.")

# Display Attractions
if not attractions.empty:
    print("\n🏛️ Top Attractions (within budget):")
    for _, row in attractions.iterrows():
        print(f"  - {row['name']} | Rating: {row['rating']} | Entry Fee: ₹{row['price']:,.0f}")
else:
    print("\n🏛️ No attractions found within budget.")



Loading your fine-tuned model and dataset...


Device set to use cuda:0
  df = pd.read_csv(data_path)



Enter your travel query:
> plan a trip from goa to ahmedabad for 3 days in budget 20000 Rs


Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


      Your Complete Travel Itinerary

Destination: ahmedabad (3 days, Budget: ₹15,000)
  - Hotel Budget: ~₹2,000/night
  - Food Budget: ~₹500/meal
  - Attractions Budget: ~₹1,500/day

🏨 Top Hotel Suggestions (within budget):
  - Tulsi Residency | Rating: 1.0 | Cost/Night: ₹800
  - Elis Hospitality Service Apartment,Prahladnagar | Rating: 0.0 | Cost/Night: ₹1,600
  - Club O7 Service Apartment | Rating: 0.0 | Cost/Night: ₹800
  - Hotel Palliate | Rating: 0.0 | Cost/Night: ₹800
  - Hotel Rajkamal | Rating: 0.0 | Cost/Night: ₹800

🍜 Top Restaurant Suggestions (within budget):
  - Grameen Kulfi | Rating: 4.9 | Cost: ₹120 | Cuisine: Ice Cream,Desserts
  - Shree Balaji Cafe | Rating: 4.9 | Cost: ₹150 | Cuisine: Indian
  - Ambika Dalwada | Rating: 4.8 | Cost: ₹150 | Cuisine: Fast Food,Snacks
  - R-Andhra Cafe | Rating: 4.8 | Cost: ₹150 | Cuisine: South Indian
  - Olio Pizzeria | Rating: 4.8 | Cost: ₹200 | Cuisine: Pizzas,Continental

🏛️ Top Attractions (within budget):
  - Sabarmati Ashram | R