**1. Data Preprocessing**

In [None]:
import os
import re
import json

# -------------------------------
# 1. Load raw text files
# -------------------------------
# Folder scanned for the raw .txt documents; "." is the current working directory.
DATA_DIR = "."

def load_documents(folder):
    """Read every ``.txt`` file directly inside ``folder`` into memory.

    Args:
        folder: Path of the directory to scan (sub-directories are not searched).

    Returns:
        dict mapping each file name (not the full path) to its UTF-8 decoded
        contents, in alphabetical file-name order.
    """
    docs = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # downstream chunk numbering stable between runs and machines.
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        # Skip directories that merely happen to be named "*.txt".
        if file.endswith(".txt") and os.path.isfile(path):
            with open(path, "r", encoding="utf-8") as f:
                docs[file] = f.read()
    return docs

# Read the corpus from DATA_DIR and report which files were picked up.
documents = load_documents(DATA_DIR)

print("Loaded files:", list(documents))

# -------------------------------
# 2. Basic cleaning
# -------------------------------
def clean_text(text):
    """Flatten ``text`` onto one line with single-space separation.

    Newlines and tabs become spaces, runs of spaces are squeezed to a single
    space, and surrounding whitespace is removed.
    """
    flattened = text.translate(str.maketrans({"\n": " ", "\t": " "}))
    return re.sub(r" +", " ", flattened).strip()

# Replace each document's raw text with its cleaned form; file-name keys are kept.
documents = {name: clean_text(documents[name]) for name in documents}

# -------------------------------
# 3. Semantic Chunking
# Chunk size: at most 500 whitespace-separated words per chunk
# (a rough proxy for the 300–500 token range suited to LLM retrieval)
# -------------------------------

def smart_chunk(text, max_length=500):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` words.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` followed by
    one or more spaces, and are never split internally. Length is measured in
    whitespace-separated words, not model tokens.

    Args:
        text: Text to chunk (expected to be single-line, space-separated).
        max_length: Maximum number of words per chunk. A single sentence longer
            than this is emitted as its own, oversized chunk.

    Returns:
        List of chunk strings without leading/trailing whitespace. Empty or
        whitespace-only input yields an empty list.
    """
    # split on sentence boundaries, dropping blank fragments
    sentences = [s for s in re.split(r"(?<=[.!?]) +", text) if s.strip()]

    chunks = []
    current = []        # sentences collected for the chunk being built
    current_words = 0   # running word count of `current` (avoids re-splitting)

    for s in sentences:
        n_words = len(s.split())
        # Flush only when something is buffered, so an oversized first
        # sentence never produces an empty chunk.
        if current and current_words + n_words > max_length:
            chunks.append(" ".join(current).strip())
            current = []
            current_words = 0
        current.append(s)
        current_words += n_words

    if current:
        chunks.append(" ".join(current).strip())

    return chunks


# Build the structured chunk records: flatten every document into
# (source file, chunk text) pairs, then number the pairs globally so each
# record gets a unique id.
source_and_text = [
    (filename, piece)
    for filename, text in documents.items()
    for piece in smart_chunk(text)
]

all_chunks = [
    {"chunk_id": f"chunk_{position}", "source": filename, "text": piece}
    for position, (filename, piece) in enumerate(source_and_text)
]
chunk_id = len(all_chunks)  # next unused id, i.e. the number of chunks created

print("Total chunks created:", len(all_chunks))

# -------------------------------
# 4. Save chunks to JSON
# -------------------------------
# Make sure the output folder exists, then persist the chunk records as
# pretty-printed JSON.
os.makedirs("processed", exist_ok=True)

with open("processed/travel_chunks.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(all_chunks, indent=2))

print("Saved: processed/travel_chunks.json")


Loaded files: ['customer_feedbacks.txt', 'flight_cancellations.txt', 'airline_policies.txt', 'travel_trends_2024.txt']
Total chunks created: 13
Saved: processed/travel_chunks.json


**2. Embeddings + FAISS Index Code**

In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json

# Load chunks (the preprocessing step writes this file as UTF-8, so read it
# back with the same explicit encoding instead of the platform default)
with open("processed/travel_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [c["text"] for c in chunks]

# Fail early with a clear message: an empty corpus would otherwise surface
# as an IndexError on `embeddings.shape[1]` below.
if not texts:
    raise ValueError("processed/travel_chunks.json contains no chunks to embed")

# -------------------------------
# Create embeddings
# -------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# FAISS works on float32 vectors.
embeddings = np.array(embeddings).astype("float32")

# -------------------------------
# Build FAISS index
# -------------------------------
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Row i of the index must correspond to chunks[i]; retrieval relies on it.
assert index.ntotal == len(chunks), "FAISS index and chunk list are out of sync"

faiss.write_index(index, "processed/travel_faiss.index")

# Save metadata (mapping)
with open("processed/travel_metadata.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=2)

print("FAISS index and metadata saved!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index and metadata saved!


In [9]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


**3. Retrieval + Multi-Hop RAG Pipeline**

In [None]:
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from openai import OpenAI

# -------------------------------
# Load FAISS + metadata
# -------------------------------
index = faiss.read_index("processed/travel_faiss.index")

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open("processed/travel_metadata.json", encoding="utf-8") as f:
    chunks = json.load(f)

# Same embedding model that was used to build the index, so query vectors and
# stored vectors live in the same space.
model = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------------------
# Retrieval function
# -------------------------------
def retrieve(query, k=3):
    """Return the chunk records most similar to ``query``.

    The query is embedded with the module-level ``model`` and searched against
    the module-level FAISS ``index``; hits are mapped back to ``chunks`` by
    position.

    Args:
        query: Natural-language search string.
        k: Maximum number of chunks to return.

    Returns:
        List of chunk dicts, best match first. Fewer than ``k`` items are
        returned when the index holds fewer than ``k`` vectors; an empty index
        (or ``k <= 0``) yields an empty list.
    """
    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = model.encode([query]).astype("float32")
    _, idx = index.search(q_emb, k)

    # FAISS pads missing neighbours with -1; indexing chunks[-1] would silently
    # return the last chunk as a bogus hit, so drop those slots.
    return [chunks[i] for i in idx[0] if i >= 0]

# -------------------------------
# Multi-hop RAG
# -------------------------------
def rag_answer(query, k=5):
    """Answer ``query`` with an LLM grounded in retrieved chunks.

    Performs one retrieval pass via ``retrieve(query, k)``, joins the retrieved
    chunks (each prefixed with its source file name) into a single context, and
    asks ``gpt-4o-mini`` to answer using only that context.

    Args:
        query: Natural-language question.
        k: Number of chunks to retrieve for the context.

    Returns:
        The message content of the first completion choice.

    Note:
        The API key is read from the ``OPENAI_API_KEY`` environment variable by
        the OpenAI client; the client is expected to raise if it is not set.
    """
    retrieved = retrieve(query, k=k)

    context = "\n\n".join(
        [f"[{c['source']}] {c['text']}" for c in retrieved]
    )

    prompt = f"""
You are a travel analytics assistant. Use ONLY the context below.

Context:
{context}

Question: {query}

Answer clearly with combined insights across documents.
"""

    # Credentials come from the OPENAI_API_KEY environment variable (the
    # client's default), so no key is ever stored in the notebook.
    client = OpenAI()

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )

    return completion.choices[0].message.content


In [18]:
# Ask the RAG pipeline a question and print the generated answer.
print(rag_answer("Which airline had the most cancellations in 2024 and why?"))

The airline with the most cancellations in 2024 was IndiGo, with a total of 8,920 cancellations. The primary reasons for these cancellations were weather events, particularly dense fog, which accounted for a significant portion of the cancellations (48% of total cancellations in the year). Additionally, technical and maintenance issues contributed to 31% of the cancellations, while crew shortages and operational bottlenecks accounted for 15%. The worst-affected routes included Delhi–Mumbai and Delhi–Srinagar, with the heavy prevalence of fog significantly impacting operations during winter months. This trend continued throughout the year, leading to numerous cancellations primarily driven by adverse weather conditions.


In [19]:
# Question that combines two facts (early-year bookings, later payload cuts); print the answer.
print(rag_answer("Which airline both boasted record bookings early in the year and later reduced payload due to extreme heat?"))


The airline that both boasted record bookings early in the year and later reduced payload due to extreme heat is Indigo. In January, Indigo claimed "record bookings," but by May, due to extreme heat conditions reaching temperatures above 45°C, the airline issued an internal memo instructing crews to reduce payload by 800 kg on high-temperature days. This meant fewer passengers, less baggage, and more cancellations.


In [20]:
# Comparative question (large resorts vs. small boutique hotels); print the answer.
print(rag_answer("How did the narrative around Goa tourism differ between large resorts and small boutique hotels?"))


The narrative around Goa tourism highlighted a stark contrast between large resorts and small boutique hotels. 

Large resorts, commonly blamed for "overtourism," were experiencing low occupancy rates, around 42%. They had built large establishments anticipating a return to pre-pandemic levels of tourism, which did not materialize, leading to frustrations over empty rooms. Their marketing focused on attracting traditional vacationers, families, and leisure tourists, which ultimately did not translate into expected demand.

In contrast, small boutique hotels were benefiting significantly, with one owner noting a remarkable 68% increase in bookings. These establishments attracted a different demographic—primarily tech professionals from Bengaluru who were traveling for workations. They were flexible, preferring to fly in on weekends to work remotely while enjoying the beach. This shift had prompted the boutique hotel to adapt by converting spaces for co-working, thus catering to the need