# Data Preprocessing

## Import Required Libraries

In [55]:
import os
import re
import shutil
import warnings

import pandas as pd
import pycountry
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import pipeline

## Load Reddit Itinerary Data

In [56]:
MODEL_NAME = "t5-small"
MAX_LEN = 512

# Show every column, untruncated, whenever a frame is displayed
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Read the collected itineraries, keep only rows whose content mentions
# "day 1", then strip parenthesised asides from the content text
df = (
    pd.read_csv("../data/raw/raw_itineraries.csv")
    .loc[lambda frame: frame["content"].str.contains(r"\bday 1\b", case = False, na = False)]
    .assign(content = lambda frame: frame["content"].str.replace(r"\(.*?\)", "", regex = True))
)

print("Total Itineraries:", len(df))
print("Columns:", df.columns)

Total Itineraries: 429
Columns: Index(['title', 'content', 'url', 'score', 'created_utc', 'subreddit'], dtype='object')


## Clean Text and Extract Day-Wise Content

In [57]:
# Clean full itinerary text
def clean_text(text):
    """Lower-case an itinerary and strip URLs and redundant whitespace.

    Args:
        text: Raw itinerary text.

    Returns:
        The lower-cased text with every ``http...`` token removed and each
        run of whitespace collapsed to a single space, without leading or
        trailing whitespace.
    """
    text = text.lower()
    # Drop URLs *before* collapsing whitespace: doing it the other way round
    # leaves a double space wherever a URL sat between two words.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Extract Day-wise itinerary sections
def extract_itinerary_sections(text):
    """Keep only the "Day N ..." sections of an itinerary.

    Args:
        text: Itinerary text.

    Returns:
        The day sections, each stripped of surrounding whitespace and joined
        with newlines, or an empty string when no "day <number>" marker is
        found. Text before the first marker is discarded.
    """
    # Normalize line breaks
    text = re.sub(r"\r\n|\r", "\n", text)

    # Match "day" followed by a number and the text up to the next marker,
    # case-insensitive. The leading \b keeps words that merely end in "day"
    # ("today 2", "holiday 3") from being treated as section markers.
    matches = re.findall(
        r"(\bday\s*\d+[:\-\.]?\s*.*?)(?=\bday\s*\d+[:\-\.]?\s*|$)",
        text,
        flags = re.IGNORECASE | re.DOTALL,
    )

    return "\n".join(m.strip() for m in matches) if matches else ""

# Remove asterisks, pipes, and standardize "Day" formatting
def normalize_text(text):
    """Strip markdown markers and standardise day headings to "Day N:".

    Args:
        text: Itinerary text.

    Returns:
        The text with every ``*`` and ``|`` removed and each "day <number>"
        heading, together with one optional trailing separator, rewritten as
        "day <number>:". The original capitalisation of "day" is kept.
    """
    # Remove * and |
    text = re.sub(r"[\*\|]", "", text)

    # Normalize the various heading formats ("Day 1 -", "Day 1.", "Day 1:")
    # to "Day 1:". The colon is part of the separator class so a heading that
    # already ends in ":" is not turned into "Day 1::", and the leading \b
    # keeps words such as "today 5" from being rewritten as headings.
    text = re.sub(r"\b(day\s*\d+)\s*[-–—\.:]?", r"\1:", text, flags = re.IGNORECASE)

    return text

# Remove days of the week and dates
# Month names and their usual abbreviations, spelled out explicitly so that
# ordinary words sharing a month prefix ("market", "decided", "maybe") are
# not mistaken for months.
_MONTH_NAME = (
    r"(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
    r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)"
)


def remove_days_and_dates(text):
    """Delete weekday names and calendar dates from itinerary text.

    Args:
        text: Itinerary text.

    Returns:
        The text with the matched spans deleted. Surrounding whitespace is
        left untouched, so removals can leave double spaces behind.
    """
    # Remove days of the week (long and short forms, case-insensitive)
    text = re.sub(r"\b(monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b", "", text, flags = re.IGNORECASE)
    text = re.sub(r"\b(mon|tue|wed|thu|fri|sat|sun)\b", "", text, flags = re.IGNORECASE)

    # Remove "January 5" / "Jan 5th" and "5th of July" style dates
    text = re.sub(r"\b" + _MONTH_NAME + r"\s+\d{1,2}(st|nd|rd|th)?\b", "", text, flags = re.IGNORECASE)
    text = re.sub(r"\b\d{1,2}(st|nd|rd|th)?\s+of\s+" + _MONTH_NAME + r"\b", "", text, flags = re.IGNORECASE)

    # Remove ISO dates ("2023-05-12") before the generic numeric pattern:
    # the generic pattern would otherwise consume the "05-12" tail and leave
    # a dangling "2023-" behind.
    text = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "", text)
    text = re.sub(r"\b\d{1,2}[/-]\d{1,2}([/-]\d{2,4})?\b", "", text)

    return text

# Scrub the raw content in one chain: drop weekday/date mentions, standardise
# the day headings, then collapse any doubled colon
df["content"] = (
    df["content"]
    .apply(remove_days_and_dates)
    .apply(normalize_text)
    .str.replace("::", ":")
)

# Keep only the day-by-day sections of the cleaned, lower-cased content
df["itinerary_text"] = df["content"].apply(
    lambda raw: extract_itinerary_sections(clean_text(raw))
)



#df.head(2)

## Split Data into Daily Chunks

In [58]:
# Split into daily chunks
def split_into_days(text):
    """Split itinerary text into one lower-cased chunk per "day N" marker.

    Args:
        text: Itinerary text containing "day <number>" markers.

    Returns:
        A list of chunks, each starting at a marker and running up to the next
        marker or the end of the text. Empty when there is no marker.
    """
    # Accept "day1" as well as "day 1" (day\s*\d+), and require a word
    # boundary before "day" so "today 2" does not start a new chunk.
    return re.findall(r"(\bday\s*\d+.*?)(?=\bday\s*\d+|$)", text.lower(), flags = re.DOTALL)

# Clean days
def clean_day(text):
    """Tidy one day chunk.

    Collapses every whitespace run to a single space, trims the ends, rewrites
    a leading "day" (any case) as "Day", and returns the result through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).
    """
    single_spaced = re.sub(r"\s+", " ", text)
    trimmed = single_spaced.strip()
    relabelled = re.sub(r"^\s*day", "Day", trimmed, flags = re.I)
    return relabelled.capitalize()

# Split each itinerary into day chunks and tidy every chunk in a single pass
df["itinerary_by_day"] = df["itinerary_text"].apply(
    lambda itinerary: [clean_day(chunk) for chunk in split_into_days(itinerary)]
)

#df.head(2)

## Summarize Day Activities

In [59]:
# Silence Python warnings raised while the summarizer runs
warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model = "t5-small")

def summarize_day_model(text):
    """Summarize one day's text with the module-level ``summarizer`` pipeline.

    Args:
        text: The text of a single itinerary day.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline,
        generated without sampling and limited to 3-15 tokens.
    """
    # truncation=True asks the pipeline to cut inputs that exceed the model's
    # maximum input length instead of feeding over-long sequences to it.
    summary = summarizer(
        text,
        max_length = 15,
        min_length = 3,
        do_sample = False,
        truncation = True,
    )
    return summary[0]["summary_text"]

# NOTE: the summaries overwrite the column they were read from, so re-running
# this cell on its own feeds the summaries back into the summarizer.
df["itinerary_by_day"] = df["itinerary_by_day"].apply(
    lambda days: [summarize_day_model(day) for day in days]
)

Device set to use cpu
Your max_length is set to 15, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 15, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 15, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_

## Extract Key Itinerary Information

In [60]:
# Lower-cased ``name`` of every entry in pycountry's country database.
# NOTE(review): these look like the official ISO names, which may differ from
# everyday spellings used in post titles (e.g. "Viet Nam") — confirm whether
# alternative names such as pycountry's ``common_name`` should be added.
country_list = [country.name.lower() for country in pycountry.countries]

# Extract country name
def extract_country(text, countries=None):
    """Return the title-cased country mentioned in ``text``, or "Unknown".

    Args:
        text: Text to search (the post title in this notebook).
        countries: Optional iterable of lower-cased country names to look
            for. Defaults to the module-level ``country_list``.

    Returns:
        The matching country name passed through ``str.title``, or the string
        "Unknown" when no name occurs in the text.
    """
    if countries is None:
        countries = country_list

    text = text.lower()
    # Try longer names first so "papua new guinea" wins over "guinea", and
    # require the name to stand alone so "niger" does not match "nigeria" and
    # "oman" does not match "romania" or "woman". Lookarounds are used rather
    # than \b because some names end in punctuation.
    for country in sorted(countries, key = len, reverse = True):
        if re.search(r"(?<!\w)" + re.escape(country) + r"(?!\w)", text):
            return country.title()
    return "Unknown"

# Add the country found in each title and the number of day chunks per
# itinerary (0 when the value is not a list)
df = df.assign(
    country = df["title"].apply(extract_country),
    trip_length = df["itinerary_by_day"].apply(
        lambda days: len(days) if isinstance(days, list) else 0
    ),
)

# Extract season
def extract_season(text):
    """Return the first season mentioned in ``text``.

    Seasons are checked in the order spring, summer, fall, autumn, winter.

    Args:
        text: Text to search.

    Returns:
        The first season from that list that appears in the text as a whole
        word (optionally followed by "time", e.g. "springtime"), or
        "any season" when none does.
    """
    text = text.lower()
    for season in ("spring", "summer", "fall", "autumn", "winter"):
        # Whole-word match: a plain substring test tags "waterfall" as fall
        # and "hot springs" as spring.
        if re.search(rf"\b{season}(?:time)?\b", text):
            return season
    return "any season"

# Tag each itinerary with the season extract_season reports for its content text
df["season"] = df["content"].apply(extract_season)

# Extract preference details
# Preference label -> keywords whose presence (as a substring) selects it.
# Order here is the order of the labels in the returned list.
_PREFERENCE_KEYWORDS = (
    ("local food", ("food", "restaurant")),
    # "hiking" is listed explicitly because it does not contain "hike".
    ("hiking", ("hike", "hiking", "trail", "nature")),
    ("museums", ("museum", "history")),
    ("beaches", ("beach",)),
    ("shopping", ("shopping",)),
)


def extract_preferences(text):
    """Derive activity-preference labels from itinerary text.

    Args:
        text: Text to search; matching is case-insensitive.

    Returns:
        A list of the labels whose keywords occur in the text, or
        ``["varied interests"]`` when no keyword occurs.
    """
    text = text.lower()
    preferences = [
        label
        for label, keywords in _PREFERENCE_KEYWORDS
        if any(keyword in text for keyword in keywords)
    ]
    return preferences if preferences else ["varied interests"]

# Attach the list of preference labels extract_preferences derives from the content text
df["preferences"] = df["content"].apply(extract_preferences)


## Generate Prompts for Each Itinerary

In [61]:
# prompt generation
def make_prompt(row):
    """Build the generation prompt for one itinerary row.

    Reads "country", "trip_length", "season" and "preferences" from ``row``
    and substitutes a generic phrase for any that is missing (or, for
    "preferences", not a list).
    """
    if "trip_length" in row:
        days = f"{row['trip_length']} day"
    else:
        days = "multi-day"

    chosen = row.get("preferences")
    if isinstance(chosen, list):
        preferences = ", ".join(chosen)
    else:
        preferences = "varied interests"

    location = row.get("country", "a destination")
    season = row.get("season", "any season")

    return (
        f"Generate a {days} itinerary in {location} during {season}. "
        f"Activity preferences include {preferences}."
    )

# Put itinerary in custom format
def format_response(row):
    """Render one itinerary row as a title followed by its day lines.

    The title is "<trip_length> Day Itinerary for <country>", with generic
    fallbacks when either field is missing. A blank line separates it from
    the body.
    """
    if "trip_length" in row:
        days = f"{row['trip_length']}"
    else:
        days = "multi-day"

    location = row.get("country", "the destination")
    title = f"{days} Day Itinerary for {location}"

    # One day per line: the chunks are joined with a single line break
    body = "\n".join(row["itinerary_by_day"])

    return f"{title}\n\n{body}"

# Build the prompt / response pair for every itinerary.
# format_response reads "itinerary_by_day" directly, so no intermediate
# "response" column is needed before it runs (the earlier space-joined
# assignment was overwritten immediately and has been dropped).
df["prompt"] = df.apply(make_prompt, axis = 1)
df["response"] = df.apply(format_response, axis = 1)

# Preview the training pairs
test_df = df[["prompt", "response"]]
test_df.head(10)

Unnamed: 0,prompt,response
2,"Generate a 5 day itinerary in El Salvador during fall. Activity preferences include hiking, beaches.","5 Day Itinerary for El Salvador\n\nplan to land at 9am but can't check into my air\nday 2: cafe albania in the morning then artesania\nday 3: santa ana volcano hike, lake coat\nday 4: san salvador walking tour in the morning\ni need to be there no later than 9:30am to check"
3,Generate a 6 day itinerary in Unknown during any season. Activity preferences include shopping.,6 Day Itinerary for Unknown\n\narrival arrive to hnd 4pm customs/esim\nasakusa kaminarimon gate sensoji temple n\nday 3: western tokyo meiji shrine takeshita \nteam borderless plan was to do some extra shopping/catch anything\nsumo experience in sumida city- 8am-12pm su\nday 6: departure early morning train . depart early morning .
4,"Generate a 11 day itinerary in Peru during any season. Activity preferences include local food, hiking, museums.","11 Day Itinerary for Peru\n\nday 1:arrive in lima early am, local sightseeing in\nday 2:fly to cusco, train to urub\nday 3:day trip to aguas caliente\nday 4:sacred valley exploration - pisac\nday 5:train to cusco, local sightseeing in cusco \nday 6:day trip from cusco - maybe to the sac\nday 7:chill daycusco .\nday 8:fly to lima in the pmlima\nday 9:local sightseeing in lima - food tourslim\nday 10:day trip to paracas lima .\ni have purposely left out amazon region for next visit ."
8,"Generate a 13 day itinerary in Japan during fall. Activity preferences include local food, hiking, shopping.",13 Day Itinerary for Japan\n\nday 1: 31st march arriving at narita airport\nday 2: 1st april grab some breakfast in\nnintendo store/pokemon center then make\nday 3: 2nd april senso-ji\nday 4:3rd april mount fuji\nday 5: 4th april is our 1 year anniversary\nbreakfast shinkansen from tokyo to ky\nday 7: 6 april early start by heading to \nkodaiji temple hanamikoji-d\nday 9: 8 april i struggled planning this\ncandeo hotels osaka the tower- is this\nday 11: 10th april osaka castle\nday 12: we fly out to south korea!
12,Generate a 7 day itinerary in New Zealand during any season. Activity preferences include hiking.,7 Day Itinerary for New Zealand\n\nqueenstown arrive around 3pm- trying to find earlier flight \nmilford sound day trip- going by bus could stay in queens\nday 3: drive to glenorchy- walk along lagoon\ncheck out of queenstown drive to mount cook national park .\nhike hooker valley track and the church of good shepherd lake tek\ndrive to christchurch a bit of a mess after day\nlookin at pictures didn't feel like to be worth it
13,Generate a 13 day itinerary in Spain during any season. Activity preferences include local food.,"13 Day Itinerary for Spain\n\ncheck in to airbnb, arc de triumpf\nday 2: sagrada familia, casa mila,\nday 3: las ramblas, walking tour goth\nday 4: day trip to montserrat . day 4\nday 5: travel to seville .\nroyal alcazar, barrio santa cru\nseville cathedral, maria luisa park, plaza esp\nday 8: travel to lisbon .\nday 9: walking tour, praca de comercio,\nday 10: time out market, st george castle\nday 11: day trip to sintra .\nday 12: day trip to evora .\nbelem, lx factory we are hoping to keep the"
16,Generate a 10 day itinerary in Unknown during any season. Activity preferences include varied interests.,"10 Day Itinerary for Unknown\n\nhoi an land in da nang , marble mountains drive\nday 2: full day in hoi an.\nday 3: hoi an -> da nang head from\nday 4: full day in hanoi .\nday 5: full day in hanoi .\nday 6: hanoi to cat ba island morning bus for cat ba\nday 7: full day in cat ba island .\nday 8: cat ba -> ninh binh morning\nday 9 in ninh binh evening bus or train to\nday 10: fly home in the morning from hanoi . fly"
17,"Generate a 17 day itinerary in Unknown during fall. Activity preferences include local food, hiking, shopping.","17 Day Itinerary for Unknown\n\nday 1:focus on hanami off to meguro,\nday 2:chureito on the way to ishi\nday 3:see sensouji and hikan inari team\nday 4: head to nagano, zenkoji\nday 5:shinkansen to osaka,\nday 6:osaka castle find somewhere nice to eat see\nday 7:leave luggage at hotel, make our way to\nday 8:nachi jinja rest of kuman\nryokan here shikitei explore some of \nday 10:arrive to our hotel sanraku drop reservation\nday 11:early fushimi inari, top and\nday 12:kiyomizudera strut\nday 13:day trip to himeji and kino\nday 14:daytrip to hiroshima and kurash\nday 15:get to our hotel ueno first city day\nday 16:day trip kamakura shopping around .\nday 17:departure spend the morning in sensooji leave"
19,Generate a 6 day itinerary in Unknown during any season. Activity preferences include hiking.,6 Day Itinerary for Unknown\n\nday 1: arrive in cusco at 10am .\nDay 2: explore cusco .\nday 3: inca trail trek is much lower in elevation compared to\nday 4: inca trail trek & machu picchu \nday 5: humantay lake hike through alpaca expedition\ni have limited pto but really want to hike the inca
21,Generate a 12 day itinerary in Japan during any season. Activity preferences include museums.,12 Day Itinerary for Japan\n\nday 1: arrival in tokyo explore ginza visit\nday 2: tokyo harajuku and takeshita\nday 3: tokyo asakusa and senso-\nday 4: travel to nagano train from tokyo\nnagano bus from togakushi nag\nshinkansen hakutaka from nag\nday 7: kyoto arashiyama bamboo grove \nhiroshima day trip shinkansen n\nday 9: kyoto nintendo museum \nday 10: travel to hakone train: 9:56 am\nhakone ropeway lake ashi cruise owakudani\ndeparture from tokyo private transfer to haneda airport


## Convert Data to a Hugging Face Dataset

In [62]:
# Convert the prompt/response columns to a Hugging Face Dataset.
# NOTE(review): df was row-filtered when it was loaded, so its index is no
# longer a plain 0..n-1 range; from_pandas may carry that index along as an
# extra column — confirm, and pass preserve_index=False if it is unwanted.
final_df = Dataset.from_pandas(df[["prompt", "response"]])

## Tokenize Data

In [63]:
# Tokenize data for the transformer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Only fall back to EOS when the checkpoint defines no padding token of its
# own; overriding an existing pad token makes padding indistinguishable from
# the real end-of-sequence marker.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    """Tokenize one prompt/response pair for seq2seq training.

    Args:
        example: Mapping with "prompt" and "response" strings.

    Returns:
        The prompt encoding (padded/truncated to ``MAX_LEN``) with an added
        "labels" list: the response token ids, where every padding position
        is replaced by -100.
    """
    model_inputs = tokenizer(
        example["prompt"],
        truncation = True,
        max_length = MAX_LEN,
        padding = "max_length"
    )
    targets = tokenizer(
        text_target = example["response"],
        truncation = True,
        max_length = MAX_LEN,
        padding = "max_length"
    )
    # Mask label padding with -100 (the index the loss ignores) so the model
    # is not trained to predict padding. The attention mask is used to find
    # the padding, which stays correct even if pad and EOS share an id.
    model_inputs["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(targets["input_ids"], targets["attention_mask"])
    ]
    return model_inputs

tokenized_dataset = final_df.map(tokenize)

Map: 100%|██████████| 429/429 [00:00<00:00, 727.56 examples/s] 


## Split and Save Dataset

In [64]:
OUTPUT_DIR = "../data/processed/tokenized_dataset"
SPLIT_SEED = 42  # fixed so the train/test split is the same on every run

# Remove folder if it exists
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

# Split (80/20, seeded) and save dataset
tokenized_dataset = tokenized_dataset.train_test_split(test_size = 0.2, seed = SPLIT_SEED)
tokenized_dataset.save_to_disk(OUTPUT_DIR)

print("Tokenized dataset saved to:", OUTPUT_DIR)

Saving the dataset (1/1 shards): 100%|██████████| 343/343 [00:00<00:00, 11980.33 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 86/86 [00:00<00:00, 9101.26 examples/s] 

Tokenized dataset saved to: ../data/processed/tokenized_dataset



