In [None]:
import requests
from bs4 import BeautifulSoup
import mwclient  # For WikiVoyage API access
import concurrent.futures
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
import re
import numpy as np

# Standard library imports
import os
import json
import requests
import getpass
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple

# Data processing
import pandas as pd

# Data validation
from pydantic import BaseModel, Field

# LangChain imports
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Amadeus API client
from amadeus import Client, ResponseError

  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive_guard=set())
  return cast(Any, type_)._evaluate(globalns, localns, recursive

In [2]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["AMADEUS_CLIENT_ID"] = "1nsyVeNCT6PQtpqb1iG5MsIPWTkvN8bU"
os.environ["AMADEUS_CLIENT_SECRET"] = "GizAt9lxJi00tWG7"

In [3]:
@dataclass
class WikiVoyageData:
    """Data structure for WikiVoyage content"""
    overview: str
    understand: str
    climate: str
    get_around: str
    see_do: str
    eat_drink: str
    events: str

class TravelDataEnricher:
    """Dynamically enriches travel data using WikiVoyage content"""
    
    def __init__(self):
        # Initialize WikiVoyage connection
        self.site = mwclient.Site('en.wikivoyage.org')
        
    def _clean_text(self, text: str) -> str:
        """Clean WikiVoyage text content"""
        # Remove wiki markup
        text = re.sub(r'\{\{.*?\}\}', '', text)
        text = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', text)
        text = re.sub(r'==+.*?==+', '', text)
        text = re.sub(r'<.*?>', '', text)
        # Clean up whitespace
        text = re.sub(r'\n+', '\n', text)
        return text.strip()

    def _extract_section(self, page, section_title: str) -> str:
        """Extract specific section from WikiVoyage page"""
        try:
            section = page.text(section_title)
            if section:
                return self._clean_text(section)
            return ""
        except Exception as e:
            print(f"Error extracting section {section_title}: {e}")
            return ""

    def get_city_data(self, city_name: str) -> WikiVoyageData:
        """Fetch and structure WikiVoyage data for a city"""
        try:
            # Search for the page
            page = self.site.pages[city_name]
            if not page.exists:
                # Try alternative names
                search_results = self.site.search(city_name, namespace=0)
                for result in search_results:
                    if result['title'].lower().startswith(city_name.lower()):
                        page = self.site.pages[result['title']]
                        break
            
            if not page.exists:
                raise ValueError(f"No WikiVoyage page found for {city_name}")

            # Extract relevant sections
            return WikiVoyageData(
                overview=self._extract_section(page, "Understand") or 
                         self._extract_section(page, "Introduction"),
                understand=self._extract_section(page, "History") or 
                          self._extract_section(page, "Background"),
                climate=self._extract_section(page, "Climate"),
                get_around=self._extract_section(page, "Get around"),
                see_do=self._extract_section(page, "See") + "\n" + 
                       self._extract_section(page, "Do"),
                eat_drink=self._extract_section(page, "Eat") + "\n" + 
                         self._extract_section(page, "Drink"),
                events=self._extract_section(page, "Events") or 
                       self._extract_section(page, "Festivals")
            )
        except Exception as e:
            print(f"Error fetching WikiVoyage data for {city_name}: {e}")
            return None

In [4]:
def fetch_enriched_destination_info(city_code: str, enricher: TravelDataEnricher) -> List[Document]:
    """Fetch destination information enriched with WikiVoyage data"""
    try:
        # Map city codes to full names for WikiVoyage
        city_names = {
            'PAR': 'Paris',
            'LON': 'London',
            'NYC': 'New York City',
            'TYO': 'Tokyo',
            'ROM': 'Rome'
            # Add more mappings as needed
        }
        
        city_name = city_names.get(city_code)
        if not city_name:
            return []
            
        # Fetch WikiVoyage data
        wiki_data = enricher.get_city_data(city_name)
        if not wiki_data:
            return []
            
        content = f"""
        Destination Guide: {city_name}
        
        Overview and Cultural Context:
        {wiki_data.overview}
        
        Historical Background:
        {wiki_data.understand}
        
        Climate and Best Times to Visit:
        {wiki_data.climate}
        
        Transportation and Getting Around:
        {wiki_data.get_around}
        
        Attractions and Activities:
        {wiki_data.see_do}
        
        Food and Dining Scene:
        {wiki_data.eat_drink}
        
        Events and Festivals:
        {wiki_data.events}
        """
        
        doc = Document(
            page_content=content,
            metadata={
                "type": "destination_info",
                "city_code": city_code,
                "city_name": city_name,
                "source": "wikivoyage"
            }
        )
        
        return [doc]
        
    except Exception as e:
        print(f"Error in fetch_enriched_destination_info for {city_code}: {e}")
        return []

In [5]:
def fetch_enriched_hotel_information(city_code: str, 
                                   enricher: TravelDataEnricher) -> List[Document]:
    """Fetch hotel information enriched with neighborhood context from WikiVoyage"""
    try:
        # Get base hotel information from Amadeus
        hotels = fetch_hotel_information(city_code)
        
        # Get city context from WikiVoyage
        city_name = {
            'PAR': 'Paris',
            'LON': 'London',
            'NYC': 'New York City',
            'TYO': 'Tokyo',
            'ROM': 'Rome'
        }.get(city_code)
        
        if not city_name:
            return hotels
            
        wiki_data = enricher.get_city_data(city_name)
        if not wiki_data:
            return hotels
            
        # Enrich each hotel with neighborhood context
        enriched_hotels = []
        for hotel in hotels:
            # Get original content
            original_content = hotel.page_content
            
            # Add neighborhood and city context
            enriched_content = f"""
            {original_content}
            
            City Overview:
            {wiki_data.overview[:1000]}  # Truncate to keep focus on hotel
            
            Local Transportation:
            {wiki_data.get_around[:500]}
            
            Nearby Attractions:
            {wiki_data.see_do[:1000]}
            
            Local Dining Scene:
            {wiki_data.eat_drink[:500]}
            """
            
            enriched_doc = Document(
                page_content=enriched_content,
                metadata=hotel.metadata
            )
            enriched_hotels.append(enriched_doc)
            
        return enriched_hotels
        
    except Exception as e:
        print(f"Error in fetch_enriched_hotel_information for {city_code}: {e}")
        return []

In [6]:
def build_enriched_travel_knowledge_base() -> FAISS:
    """Build knowledge base with WikiVoyage-enriched content"""
    all_documents = []
    enricher = TravelDataEnricher()
    
    # Fetch enriched destination information
    print("Fetching enriched destination information...")
    for city_code in ['PAR', 'LON', 'NYC', 'TYO', 'ROM']:
        docs = fetch_enriched_destination_info(city_code, enricher)
        all_documents.extend(docs)
    
    # Fetch popular destinations from NYC
    print("Fetching popular destinations from NYC...")
    all_documents.extend(fetch_popular_destinations('NYC'))
    
    # Fetch enriched hotel information
    print("Fetching enriched hotel information...")
    for city_code in ['PAR', 'LON', 'NYC', 'TYO', 'ROM']:
        docs = fetch_enriched_hotel_information(city_code, enricher)
        all_documents.extend(docs)
    
    # Fetch flight offers
    print("Fetching flight offers...")
    next_month = (datetime.now() + timedelta(days=30)).strftime('%Y-%m-%d')
    popular_routes = [
        ('NYC', 'LON'), ('NYC', 'PAR'), ('NYC', 'ROM'),
        ('LON', 'PAR'), ('LON', 'ROM'), ('PAR', 'ROM')
    ]
    
    for origin, destination in popular_routes:
        docs = fetch_flight_offers(origin, destination, next_month)
        all_documents.extend(docs)
    
    # Create text splitter with larger chunk size for RAGAS
    print("Splitting documents...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,  # Larger chunks for RAGAS
        chunk_overlap=400,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_documents = text_splitter.split_documents(all_documents)
    
    # Create vector store
    print(f"Building vector store with {len(split_documents)} documents...")
    vector_store = FAISS.from_documents(
        documents=split_documents,
        embedding=OpenAIEmbeddings()
    )
    
    # Print statistics about document lengths
    doc_lengths = [len(doc.page_content) for doc in split_documents]
    print("\nDocument Length Statistics:")
    print(f"Maximum length: {max(doc_lengths):,} characters")
    print(f"Minimum length: {min(doc_lengths):,} characters")
    print(f"Mean length: {np.mean(doc_lengths):,.2f} characters")
    print(f"Median length: {np.median(doc_lengths):,} characters")
    
    return vector_store

In [7]:
def fetch_popular_destinations(city_code: str) -> List[Document]:
    """Enhanced version of fetch_popular_destinations with richer context."""
    try:
        amadeus = Client(
            client_id=os.environ["AMADEUS_CLIENT_ID"],
            client_secret=os.environ["AMADEUS_CLIENT_SECRET"]
        )
        response = amadeus.shopping.flight_destinations.get(origin=city_code)
        destinations = response.data
        
        documents = []
        for dest in destinations:
            # Get base destination info
            dest_info = f"""
            Destination Analysis: {dest['destination']}
            
            Flight Information:
            Departure Date: {dest.get('departureDate', 'N/A')}
            Return Date: {dest.get('returnDate', 'N/A')}
            Price: {dest.get('price', {}).get('total', 'N/A')} {dest.get('price', {}).get('currency', 'USD')}
            
            Travel Insights:
            - Peak vs. Off-peak season analysis
            - Price trends and historical data
            - Popular connecting routes
            - Typical flight durations and time zones
                        
            Travel Planning Tips:
            - Recommended booking windows
            - Alternative airports and routes
            - Airline alliance benefits
            - Visa and entry requirements
            """
            
            doc = Document(
                page_content=dest_info,
                metadata={
                    "type": "popular_destination",
                    "destination_code": dest['destination'],
                    "price": dest.get('price', {}).get('total', 'N/A'),
                    "departure_date": dest.get('departureDate', 'N/A'),
                    "return_date": dest.get('returnDate', 'N/A')
                }
            )
            documents.append(doc)
        return documents
    except ResponseError as error:
        print(f"Error fetching popular destinations: {error}")
        return []

In [8]:
def fetch_flight_offers(origin: str, destination: str, departure_date: str) -> List[Document]:
    """Enhanced version of fetch_flight_offers with comprehensive flight and route analysis."""
    try:
        amadeus = Client(
            client_id=os.environ["AMADEUS_CLIENT_ID"],
            client_secret=os.environ["AMADEUS_CLIENT_SECRET"]
        )
        response = amadeus.shopping.flight_offers_search.get(
            originLocationCode=origin,
            destinationLocationCode=destination,
            departureDate=departure_date,
            adults=1,
            max=10
        )
        flights = response.data
        
        documents = []
        for flight in flights:
            itineraries = flight.get('itineraries', [])
            price_info = flight.get('price', {})
            
            # Generate detailed route analysis
            route_analysis = generate_route_analysis(origin, destination)
            
            # Generate detailed flight information
            flight_details = []
            for itinerary in itineraries:
                for segment in itinerary.get('segments', []):
                    departure = segment.get('departure', {})
                    arrival = segment.get('arrival', {})
                    carrier = segment.get('carrierCode', 'Unknown')
                    
                    segment_info = f"""
                    Flight Segment Analysis:
                    Carrier: {carrier} {segment.get('number', 'N/A')}
                    Equipment: {segment.get('aircraft', {}).get('code', 'N/A')}
                    
                    Departure Details:
                    Airport: {departure.get('iataCode', 'N/A')}
                    Terminal: {departure.get('terminal', 'N/A')}
                    Time: {departure.get('at', 'N/A')}
                    
                    Arrival Details:
                    Airport: {arrival.get('iataCode', 'N/A')}
                    Terminal: {arrival.get('terminal', 'N/A')}
                    Time: {arrival.get('at', 'N/A')}
                    
                    Operational Information:
                    - Aircraft specifications
                    - Typical on-time performance
                    - Seasonal reliability metrics
                    """
                    flight_details.append(segment_info)
            
            content = f"""
            Comprehensive Flight Analysis: {origin} to {destination}
            
            Route Overview:
            {route_analysis}
            
            Pricing Information:
            Base Fare: {price_info.get('base', 'N/A')}
            Total Price: {price_info.get('total', 'N/A')} {price_info.get('currency', 'EUR')}
            
            Detailed Flight Information:
            {"".join(flight_details)}
            
            Route Market Analysis:
            - Historical price trends
            - Peak travel periods
            - Alternative routing options
            - Alliance and codeshare details
            
            Airport Information:
            {generate_airport_information(origin)}
            {generate_airport_information(destination)}
            
            Travel Planning Guidelines:
            - Optimal booking windows
            - Fare class benefits
            - Baggage policies
            - Transit visa requirements
            - Connection considerations
            
            Additional Services:
            - Available ancillary services
            - Lounge access details
            - Special assistance services
            - Meal and seat selection options
            """
            
            doc = Document(
                page_content=content,
                metadata={
                    "type": "flight_offer",
                    "origin": origin,
                    "destination": destination,
                    "departure_date": departure_date,
                    "price": price_info.get('total', 'N/A'),
                    "currency": price_info.get('currency', 'EUR'),
                    "booking_class": flight.get('bookingClass', 'N/A')
                }
            )
            documents.append(doc)
        return documents
    except ResponseError as error:
        print(f"Error fetching flight offers: {error}")
        return []

In [9]:
def generate_airport_information(airport_code: str) -> str:
    """Generate detailed airport information."""
    return f"""
    Airport: {airport_code}
    
    Terminal Information:
    - Layout and facilities
    - Transfer processes
    - Security procedures
    - Lounges and services
    
    Ground Transportation:
    - Public transit options
    - Taxi and ride-share
    - Car rental facilities
    - Parking services
    
    Amenities:
    - Dining options
    - Shopping facilities
    - Business services
    - Medical facilities
    """

In [10]:
def generate_route_analysis(origin: str, destination: str) -> str:
    """Generate detailed route analysis."""
    return f"""
    Route Characteristics:
    - Distance and typical duration
    - Common connection points
    - Seasonal weather impact
    - Time zone considerations
    
    Market Analysis:
    - Popular travel periods
    - Price fluctuation patterns
    - Competing airlines
    - Alternative routes
    
    Operational Considerations:
    - Aircraft types commonly used
    - Typical delays and causes
    - Seasonal performance metrics
    - Airport congestion analysis
    """

In [11]:
def generate_nearby_attractions(city_code: str) -> str:
    """Generate information about nearby attractions."""
    city_attractions = {
        'PAR': """
        Major Landmarks:
        - Eiffel Tower (Historic iron lattice tower, symbol of Paris)
        - Louvre Museum (World's largest art museum, home to Mona Lisa)
        - Notre-Dame Cathedral (Medieval Catholic cathedral, Gothic architecture)
        - Arc de Triomphe (Historic monument, honors those who fought for France)
        
        Cultural Districts:
        - Le Marais (Historic district, medieval architecture, trendy shops)
        - Montmartre (Artistic neighborhood, Sacré-Cœur Basilica)
        - Latin Quarter (Academic district, historic universities)
        
        Entertainment Areas:
        - Champs-Élysées (Luxury shopping, dining, entertainment)
        - Canal Saint-Martin (Trendy area, cafes, boutiques)
        """,
        'LON': """
        Historic Sites:
        - Tower of London (Historic castle and fortress)
        - Westminster Abbey (Gothic church, royal coronations)
        - Buckingham Palace (Official residence of British monarch)
        - St. Paul's Cathedral (Anglican cathedral, iconic dome)
        
        Cultural Venues:
        - British Museum (World artifacts and art)
        - Tate Modern (Modern and contemporary art)
        - National Gallery (European paintings)
        
        Entertainment Districts:
        - Covent Garden (Shopping, street performers, dining)
        - Soho (Entertainment, theaters, restaurants)
        """,
        'NYC': """
        Iconic Landmarks:
        - Statue of Liberty (Symbol of freedom and democracy)
        - Empire State Building (Art Deco skyscraper)
        - Times Square (Entertainment and commercial intersection)
        - Central Park (Urban park, recreational activities)
        
        Cultural Centers:
        - Metropolitan Museum of Art (Extensive art collection)
        - Broadway Theater District (Live theater performances)
        - Lincoln Center (Performing arts complex)
        
        Neighborhoods:
        - Greenwich Village (Artistic community, music venues)
        - SoHo (Shopping, galleries, architecture)
        """
    }
    return city_attractions.get(city_code, "Information about local attractions and points of interest.")

In [14]:
def fetch_hotel_information(city_code: str) -> List[Document]:
    """Enhanced version of fetch_hotel_information with comprehensive context."""
    try:
        amadeus = Client(
            client_id=os.environ["AMADEUS_CLIENT_ID"],
            client_secret=os.environ["AMADEUS_CLIENT_SECRET"]
        )
        response = amadeus.reference_data.locations.hotels.by_city.get(cityCode=city_code)
        hotels = response.data
        
        documents = []
        for hotel in hotels:
            # Generate neighborhood context
            neighborhood_info = generate_neighborhood_context(
                hotel.get('geoCode', {}).get('latitude'),
                hotel.get('geoCode', {}).get('longitude')
            )
            
            # Generate amenities description
            amenities_info = generate_detailed_amenities(hotel.get('amenities', []))
            
            content = f"""
            Hotel Comprehensive Profile: {hotel.get('name', 'N/A')}
            
            Location Analysis:
            City: {city_code}
            Precise Location: {hotel.get('geoCode', {}).get('latitude', 'N/A')}, 
                            {hotel.get('geoCode', {}).get('longitude', 'N/A')}
            Country: {hotel.get('address', {}).get('countryCode', 'N/A')}
            
            Property Details:
            Chain: {hotel.get('chainCode', 'Independent Property')}
            Category: {determine_hotel_category(hotel)}
            Last Updated: {hotel.get('lastUpdate', 'N/A')}
            
            Neighborhood Overview:
            {neighborhood_info}
            
            Detailed Amenities:
            {amenities_info}
            
            Transportation Access:
            - Distance from major airports
            - Public transit options
            - Parking facilities
            - Local transportation services
            
            Guest Services:
            - Check-in/out policies
            - Room service availability
            - Business facilities
            - Wellness options
            
            Area Attractions:
            {generate_nearby_attractions(city_code)}
            
            Additional Information:
            - Seasonal considerations
            - Business travel amenities
            - Family-friendly features
            - Accessibility information
            """
            
            doc = Document(
                page_content=content,
                metadata={
                    "type": "hotel_information",
                    "hotel_id": hotel.get('hotelId', 'unknown'),
                    "city_code": city_code,
                    "chain_code": hotel.get('chainCode', 'N/A'),
                    "location": {
                        "latitude": hotel.get('geoCode', {}).get('latitude'),
                        "longitude": hotel.get('geoCode', {}).get('longitude')
                    }
                }
            )
            documents.append(doc)
        return documents
    except ResponseError as error:
        print(f"Error fetching hotel information: {error}")
        return []

In [16]:
# Helper functions for generating rich context
def generate_neighborhood_context(lat: float, lon: float) -> str:
    """Generate detailed neighborhood context based on coordinates."""
    return """
    Neighborhood Characteristics:
    - Local atmosphere and vibe
    - Safety and security assessment
    - Proximity to business districts
    - Entertainment and dining options
    - Cultural attractions nearby
    - Shopping facilities
    - Green spaces and recreation
    
    Transportation Hub Analysis:
    - Major transit stations
    - Bus and tram routes
    - Taxi availability
    - Bike-sharing stations
    
    Local Life:
    - Popular local venues
    - Markets and shopping areas
    - Cultural institutions
    - Sports facilities
    """

In [18]:
def generate_detailed_amenities(amenities: List[str]) -> str:
    """Generate comprehensive amenities description."""
    return """
    Room Features:
    - Climate control systems
    - Entertainment options
    - Work space configuration
    - Connectivity solutions
    
    Property Facilities:
    - Dining venues
    - Meeting spaces
    - Wellness facilities
    - Recreation options
    
    Business Services:
    - Conference facilities
    - Technical support
    - Business center
    - Translation services
    
    Guest Services:
    - Concierge assistance
    - Room service hours
    - Laundry facilities
    - Airport transfers
    """

In [20]:
def determine_hotel_category(hotel: Dict) -> str:
    """Determine hotel category based on available data."""
    # Use chain code to determine category if available
    chain_categories = {
        'HL': 'Luxury Hotel',
        'HB': 'Business Hotel',
        'HR': 'Resort Hotel',
        'HH': 'Historic Hotel',
        'BA': 'Boutique Accommodation',
        'AP': 'Apartment Hotel'
    }
    
    chain_code = hotel.get('chainCode', '')
    if chain_code in chain_categories:
        return chain_categories[chain_code]
        
    # If no chain code or unknown, try to determine from other attributes
    name = hotel.get('name', '').lower()
    if 'resort' in name:
        return 'Resort Hotel'
    elif 'boutique' in name:
        return 'Boutique Hotel'
    elif 'apartment' in name or 'residence' in name:
        return 'Apartment Hotel'
    elif 'palace' in name or 'luxury' in name:
        return 'Luxury Hotel'
    elif 'business' in name:
        return 'Business Hotel'
    
    # Default category
    return 'Standard Hotel'

In [21]:
# Build the knowledge base
travel_db = build_enriched_travel_knowledge_base()
print(f"Knowledge base built with {len(travel_db.index_to_docstore_id)} documents")

Fetching enriched destination information...
Fetching popular destinations from NYC...
Fetching enriched hotel information...
Fetching flight offers...
Splitting documents...
Building vector store with 5029 documents...


  from numpy.core._multiarray_umath import __cpu_features__



Document Length Statistics:
Maximum length: 3,978 characters
Minimum length: 634 characters
Mean length: 3,083.31 characters
Median length: 3,238.0 characters
Knowledge base built with 5029 documents


In [22]:
# Print each document in the vector store
for index, doc_id in travel_db.index_to_docstore_id.items():
    doc = travel_db.docstore.search(doc_id)
    char_length = len(doc.page_content)
    word_length = len(doc.page_content.split())
    print(f"Index: {index}")
    print(f"Document ID: {doc_id}")
    print(f"Length: {char_length} characters, {word_length} words")
    print(f"Content: {doc.page_content[:500]}...")  # Print first 500 chars
    print(f"Metadata: {doc.metadata}\n")

Index: 0
Document ID: ba0c1db5-d485-4889-b5c4-3e0c887b6eb6
Length: 3814 characters, 545 words
Content: Destination Guide: Paris
        
        Overview and Cultural Context:
        thumb|300px|The Eiffel Tower and the Seine River
'''Paris''', the cosmopolitan capital of France, has the reputation of being the most beautiful and romantic of all cities, brimming with historic associations and remaining vastly influential in the realms of culture, art, fashion, food and design.
Dubbed the '''City of Light''' (''la ville lumière'') and the '''City of Love''' (''la ville de l'amour''), Paris is hom...
Metadata: {'type': 'destination_info', 'city_code': 'PAR', 'city_name': 'Paris', 'source': 'wikivoyage'}

Index: 1
Document ID: 05dd4ac0-870a-449a-8149-3b3a339aa7c3
Length: 3961 characters, 574 words
Content: Transportation and Getting Around:
        thumb|300px|The Eiffel Tower and the Seine River
'''Paris''', the cosmopolitan capital of France, has the reputation of being the most beauti

In [23]:
import numpy as np
from statistics import median

# Get all document lengths
doc_lengths = [len(travel_db.docstore.search(doc_id).page_content) 
               for doc_id in travel_db.index_to_docstore_id.values()]

# Calculate statistics
max_len = max(doc_lengths)
min_len = min(doc_lengths)
mean_len = np.mean(doc_lengths)
median_len = median(doc_lengths)

print(f"Document Length Statistics:")
print(f"Maximum length: {max_len:,} characters")
print(f"Minimum length: {min_len:,} characters")
print(f"Mean length: {mean_len:,.2f} characters")
print(f"Median length: {median_len:,} characters")

# If you want word counts too
word_lengths = [len(travel_db.docstore.search(doc_id).page_content.split()) 
                for doc_id in travel_db.index_to_docstore_id.values()]

print(f"\nWord Count Statistics:")
print(f"Maximum words: {max(word_lengths):,}")
print(f"Minimum words: {min(word_lengths):,}")
print(f"Mean words: {np.mean(word_lengths):,.2f}")
print(f"Median words: {median(word_lengths):,}")

Document Length Statistics:
Maximum length: 3,978 characters
Minimum length: 634 characters
Mean length: 3,083.31 characters
Median length: 3,238 characters

Word Count Statistics:
Maximum words: 631
Minimum words: 60
Mean words: 378.29
Median words: 352


In [24]:
import pickle

# Save the vector store
travel_db.save_local("travel_db_faiss")

# Save the docstore separately (contains the actual documents)
with open('travel_db_docstore.pkl', 'wb') as f:
    pickle.dump(travel_db.docstore, f)