# Exploratory Analysis for LLM Travel Chatbot

- This notebook contains exploratory analysis to inform the development of a large language model (LLM) chatbot designed to assist users with questions about their personal travel itineraries.  
   
   
---
   
## Objectives
- Extract the most recent Gmail message from the dafamtripbot@gmail.com account
- Use an LLM to extract flight info from the email text


In [None]:
import base64
import json
import os
from datetime import datetime, date, time
from dotenv import load_dotenv
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional

import gradio as gr
from bs4 import BeautifulSoup  
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build, Resource
from google.auth.transport.requests import Request
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate
from langchain.schema import HumanMessage, AIMessage, SystemMessage
from langchain_core.output_parsers import PydanticOutputParser
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

In [None]:
# Load environment variables from the `.env` file in the user's home directory.
# Presumably this is where the TRAVELBOT_GMAIL_* values and the OpenAI key live — confirm.
load_dotenv(dotenv_path=Path.home() / ".env")

In [None]:
# OAuth scope requested for the Gmail API (the read-only Gmail scope);
# passed to both the saved-token loader and the OAuth flow in get_gmail_service().
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [None]:
# Data model for flight parser
# Passenger: one traveler named on a flight (both names are required strings).
class Passenger(BaseModel):
    first_name: str
    last_name: str

# FlightDetails: one flight extracted from a confirmation email.
# Date and time fields are optional (default None); flight number, airline,
# origin, destination and the passenger list are required.
class FlightDetails(BaseModel):
    flight_number: str
    airline_name: str
    departure_date: Optional[date] = None
    departure_time: Optional[time] = None
    # Arrival fields use the same types as the departure fields so both ends
    # of a flight are validated and serialized consistently.
    arrival_date: Optional[date] = None
    arrival_time: Optional[time] = None
    origin: str
    destination: str
    passengers: List[Passenger]

# FlightManifest: top-level container holding every flight parsed from one email.
class FlightManifest(BaseModel):
    flights: List[FlightDetails]

# Output parser bound to FlightManifest; it supplies the format instructions for
# the extraction prompt and is the last step of the extraction chain.
flight_parser = PydanticOutputParser(pydantic_object=FlightManifest)


In [None]:
def get_gmail_service() -> Resource:
    """
    Create and return an authenticated Gmail API service instance.

    Credentials are looked up in a local `token.json` file first. If that file
    holds valid credentials they are used directly. If the saved credentials
    are expired but carry a refresh token, they are refreshed. Otherwise an
    OAuth 2.0 installed-app flow is started on a local server, using client
    information from these environment variables:
        - TRAVELBOT_GMAIL_CLIENT_ID
        - TRAVELBOT_GMAIL_CLIENT_SECRET

    Whenever credentials are refreshed or newly obtained they are written back
    to `token.json` for reuse in future runs.

    Returns:
        googleapiclient.discovery.Resource:
            An authorized Gmail API service instance for making Gmail API calls.

    Raises:
        RuntimeError: If the OAuth flow is needed but one of the client
            environment variables is missing or empty.

    Note:
        Errors raised by the Google auth libraries during refresh or the OAuth
        flow are not caught here and propagate to the caller.
    """
    token_path = Path('token.json')
    creds: Optional[Credentials] = None

    # Load saved credentials if available
    if token_path.exists():
        creds = Credentials.from_authorized_user_file(str(token_path), SCOPES)

    # If no valid credentials, refresh them or run the OAuth flow
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            # Get client credentials from environment variables and fail fast
            # with a clear message instead of starting a flow with None values.
            client_settings = {
                "TRAVELBOT_GMAIL_CLIENT_ID": os.getenv("TRAVELBOT_GMAIL_CLIENT_ID"),
                "TRAVELBOT_GMAIL_CLIENT_SECRET": os.getenv("TRAVELBOT_GMAIL_CLIENT_SECRET"),
            }
            missing = [name for name, value in client_settings.items() if not value]
            if missing:
                raise RuntimeError(
                    "Missing required environment variable(s): " + ", ".join(missing)
                )

            client_config = {
                "installed": {
                    "client_id": client_settings["TRAVELBOT_GMAIL_CLIENT_ID"],
                    "client_secret": client_settings["TRAVELBOT_GMAIL_CLIENT_SECRET"],
                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                    "token_uri": "https://oauth2.googleapis.com/token",
                    "redirect_uris": ["http://localhost"]
                }
            }

            flow = InstalledAppFlow.from_client_config(client_config, SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the credentials for next run
        with open(token_path, 'w', encoding='utf-8') as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)

In [None]:
def get_latest_email_id(client: Optional[Resource] = None) -> Optional[str]:
    """
    Retrieve the message ID of the most recent message in the user's Gmail account.

    The Gmail API is asked for a single message (`maxResults=1`) for the
    authenticated user (`userId='me'`). No label filter is applied, so the
    result is whatever message the API lists first. If a Gmail API client is
    not provided, one is created using `get_gmail_service()`.

    Args:
        client (Optional[googleapiclient.discovery.Resource]):
            An authenticated Gmail API client. If `None`, a new client
            is created internally.

    Returns:
        Optional[str]:
            The message ID of the first listed message, or `None` (after
            printing "No messages found.") when the listing is empty.

    Raises:
        googleapiclient.errors.HttpError:
            If the Gmail API request fails.
    """
    # Create a Gmail client only when the caller did not supply one
    if client is None:
        client = get_gmail_service()

    # Ask for a single message; the response may omit 'messages' entirely
    results = client.users().messages().list(userId='me', maxResults=1).execute()
    messages = results.get('messages', [])

    if not messages:
        print("No messages found.")
        return None

    return messages[0]['id']

 

In [None]:
def extract_gmail_as_json(service: Resource, message_id: str) -> Dict[str, Optional[str]]:
    """
    Extract email metadata and body content from the Gmail API and return it as JSON-like data.

    The message is fetched by ID in `full` format. The From, To, Date and
    Subject headers are looked up case-insensitively, and the body is taken
    from the first non-empty `text/plain` part anywhere in the MIME tree. Only
    when no plain-text part has content is the first `text/html` part used,
    converted to plain text with BeautifulSoup.

    Args:
        service (googleapiclient.discovery.Resource):
            An authenticated Gmail API service client.
        message_id (str):
            The unique Gmail message ID of the email to retrieve.

    Returns:
        Dict[str, Optional[str]]:
            A dictionary containing the email metadata and body content:
            {
                "from": str or None,
                "to": str or None,
                "date": str or None,
                "subject": str or None,
                "message_id": str,
                "body": str or None
            }

    Raises:
        googleapiclient.errors.HttpError:
            If the Gmail API request fails.

    Example:
        >>> service = get_gmail_service()
        >>> email_data = extract_gmail_as_json(service, "17c6932b2b4f1a2c")
        >>> print(email_data["subject"])
        'Your Flight Itinerary'
    """
    msg: Dict[str, Any] = service.users().messages().get(userId='me', id=message_id, format='full').execute()

    payload = msg.get("payload", {})
    headers = payload.get("headers", [])

    def get_header(name):
        # Header names are compared case-insensitively; first match wins
        return next((h['value'] for h in headers if h['name'].lower() == name.lower()), None)

    def decode_part(part):
        # A part may have no body or no inline data; treat both as empty text
        data = (part.get("body") or {}).get("data") or ""
        # Re-add any stripped '=' padding so unpadded base64url data still decodes
        data += "=" * (-len(data) % 4)
        return base64.urlsafe_b64decode(data).decode("utf-8", errors="ignore")

    def find_text(part, mime_type):
        # Depth-first search for the first part of the requested MIME type
        if part.get("mimeType") == mime_type:
            return decode_part(part)
        for child in part.get("parts") or []:
            found = find_text(child, mime_type)
            if found:
                return found
        return None

    # Prefer text/plain over text/html regardless of the order of the parts
    body = find_text(payload, "text/plain")
    if not body:
        html = find_text(payload, "text/html")
        if html is not None:
            body = BeautifulSoup(html, "html.parser").get_text()

    return {
        "from": get_header("From"),
        "to": get_header("To"),
        "date": get_header("Date"),
        "subject": get_header("Subject"),
        "message_id": message_id,
        "body": body,
    }

In [None]:
# Extract flight info from most recent gmail
# Build the Gmail client once and reuse it, so authentication runs a single time
gmail_service = get_gmail_service()
msg_id = get_latest_email_id(gmail_service)
# get_latest_email_id returns None when no message is listed; skip the fetch then
data = extract_gmail_as_json(gmail_service, msg_id) if msg_id else {}
# "body" can be present with a None value, which `.get('body', '')` would pass through
flight_email = data.get('body') or ''

In [None]:
# Chat model shared by the flight-extraction chain and the chatbot chain.
# Uses environment variable to authenticate (presumably OPENAI_API_KEY — confirm)
client = ChatOpenAI(model="gpt-4o-mini")

In [None]:
# Create chain for extracting flight data from email in JSON format
# Human-message template; `{email}` and `{format_instructions}` are filled in at invoke time.
# NOTE(review): the text says "triple quotes" but the email is delimited by triple backticks.
extract_flight = """I am building a trip itinerary for a family vacation. The following text in triple quotes contains flight information from the email confirmation I received from the airline.  There can be multiple passengers and flights in a single email confirmation. Extract the following relevant passenger and flight information.

The following pieces of information should be collected for each passenger that is traveling. Remember, there can be multiple passengers on each flight
- first_name 
- last_name 

The following pieces of information should be collected for each flight. There are usually multiple flights per email. 
- departure_date
- departure_time
- arrival_date
- arrival_time 
- origin
- destination
- flight_number
- airline_name

 ```{email}```

{format_instructions}
"""

# Two-message prompt: a fixed system instruction plus the human template above
extract_flight_prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract structured flight and passenger information from the text and convert it to JSON using ISO 8601 format for all datetime fields."),
    ("human", extract_flight)
])
# Pipeline: prompt -> chat model -> flight_parser
flight_chain = extract_flight_prompt | client | flight_parser

In [None]:
# Run the extraction chain on the email body fetched earlier.
# The prompt needs two inputs: the email text and the parser's format instructions.
flight_chainparams = dict(
    email=flight_email,
    format_instructions=flight_parser.get_format_instructions(),
)
flight_response = flight_chain.invoke(flight_chainparams)

In [None]:
# Display the parsed result as a plain dictionary (notebook cell output).
# NOTE(review): `.dict()` may be deprecated depending on the installed pydantic version — confirm.
flight_response.dict()

In [None]:
# Sample itinerary for testing
# Raw trip-planning notes (Day 1 through Day 9, plus expense and packing notes).
# The text is interpolated into the chatbot system prompt, so it is kept verbatim,
# including the typos in the original notes.
itinerary_txt = """
Agenda Call July 9, 2025 

Day 1 (August 2 Saturday):  

Arti and Jay arrive at 11:00am; Kiran and Craig arrive at 10:30pm; rest arrive 11:30pm 

Westin Calgory to Hotel: Airport Shuttle on request: +1 403-452-5406 

7 Rooms: (Utpal family), (Manisha’s family), (Beena, mami, tinu and Priya), (Kiran, Craig, and Avi), (Arti and two boys), (Jay and Iyla), (Dad and Quentin).  

Pick up cars from Budget after 8pm. Arti to pick up one and Kiran and Craig to pick up other two. 

Day 2 (August 3 Sunday): 9:00am start 

No entry fee for Banff or Jasper in August 

Canmore Check-in is after 4:00pm 

Can someone check on where we should park to get to Banff Gondola. Should we take shuttle or park with cars and 15 passenger van? (https://www.banfflakelouise.com/getting-around) 

Banff Gondola and Upper Hot Springs (Book through Pursuit group rates for gondola and cruise (Adult $102/ $67 child; $56 for just gondola) 

Lunch: Eat out? Farm & Fire is a vegetarian-friendly wood-fired pizza place. 

Bow Falls River Trail to Bow Falls (easy 30 minutes): Trailhead is south end of Banff Pedestrian Bridge. Also can access from parking lot for Bow Falls 

Tunnel Mountain Trial to Tunnel Mountain (easy to moderate 2 hours): Trainhead is on St. Julien Road. There is a parking lot. For a slightly shorter trail start from Tunnel Mountain Dr. 

Try to Park at Bear Street Parkade. If full, try Beaver Street Lot. If both full, park at Banff Train Park and Ride 

Roam Transit 1 goes to Gondola (Can catch from Elk Street Transit Hub) 

Roam Transit 2 goes to Tunnel Mountain (Catch from Elk Street Transit Hub) 

Roam Transit 4 goes to Bow Falls but walking is easier 

Parking Lot at Bow Falls or Bow Avenue Lot 

Day 3 (August 4 Monday): 8:00 am start 

Lake Minnewanka in morning 

For those not doing Via Ferrata, Lake Cruise 1 hr is $55 fits 15 (Good option for those not doing Alpine via ferrata). Also option for Explorer crew. 

Need to figure out if we should drive (which is what I am inclined to do) or take Roam Bus #6.  

Norquay Mountain 1:00pm (Alpine) and 3:00pm (Explorer): chairlift to Cliffhouse Bistro and do Upper Stoney Trial at base. 

Day 4 (August 5 Tuesday): 9:00am start 

Lake Louise & Moraine. 

Looking into driving which suggests early departure (7 am) for a parking space. Maybe shuttle?  

Book shuttle reservations – 48hours in advance (Craig to organize but everyone will need to create accounts to try and book spots) 

Hike to Lake Agnes Tea House is 3 hours 

Can rent Canoes/kayaks 

Possibly visit Vermillion Lakes/ Johnston Canyon if time 

Where to park or how to get there by shuttle? 

Big Group Dinner (Craig and Kiran to research) 

Day 5 (August 6 Wednesday): 9:00am start 

https://maps.app.goo.gl/G9TDuDHji41qiqLD9 

Vermilion Lakes 

Johnston Canyon, Lower Falls 

Bow Lake Viewpoint 

Peyton Lake 

Saskatchewan River Crossing 

Athabasca Glacier 

Sunwapta Falls 

Hiton Home 

Without stops, drive is 5 hours.  

Day 6 (August 7 Thursday): Sleep in.  

Miette Hot Springs 

Jasper telescope @ 10:30pm 

Can decide if we want to take it easy or go anything in the morning. May be nice to just have a relaxing day without anything planned. 

Day 7 (August 8 Friday): 10:00am start 

Rafting @ 12 or Maligne Lake to Ferry to Spirit Island 

Look into Ferry Ticket to Spirit Island for those interested and not doing rafting trip. 

Maligne Canyon Hike 

Medicine Lake 

Pyramid Lake &  Pyramid Mountain 

Big Group Dinner (Craig and Kiran to research) 

https://www.tandoorikonahinton.com/ 

https://www.rojomarron.com/ 

Day 8 (August 9 Saturday): 10:00am checkout 

5.5 hour drive through Obed. 

Check-out downtown Calgary. 

Drop off car by 8:00pm  

Westin Calgary. 

Day 9 (August 10 Sunday) 

Airport shuttle 

9:15am – Jersey Crew flight time 

8:30am – LA Crew flight time 

Kiran?? 

Split Wise Inputting Expenses 

Please create an account and enter any expenses you have incurred and attribute them to the correct people on the application 

Weather and Packing Notes 

Temperature between 50 degree and 75 degree 

Rain (Wed–Fri), so rain jacket/ umbrella  

Hiking shoes 

Layers 

Bug spray 

Sun block (UV index 7)/ hat/ sunglasses 

Swim suit 

slippers """

In [None]:
# Create system prompt that uses CAG to insert reference info
# The whole itinerary text is embedded directly in the prompt via the f-string.
# NOTE(review): the prompt text contains typos ("questons", "informatioon", "bulllet")
# and says "triple quotes" although the itinerary is delimited by triple backticks.
# NOTE(review): the tone/formatting paragraph sits between the sentence introducing
# the itinerary and the itinerary itself — consider reordering.
system_instructions = f"""You are a chatbot for a travel app that answers questons about the travel itinerary. You have a trip itinerary for a family vacation to Banff. The following text in triple quotes contains the travel itinerary for all the family members that are traveling. Reference the trip itinerary for information to answer questions.  If there isn't enough details in the question, ask for additional information. If the information doesn't exist in the itinerary, let the user know that not enough informatioon was in the travel itinerary to answer their question. If the question can be accurately answered, respond with an answer.

The following pieces of information might be useful to extract from the travel itinerary:
Traveler Info
- first_name 
- last_name
- email 

Flight Info
- departure_date
- departure_time
- arrival_date
- arrival_time 
- origin
- destination
- flight_number
- airline_name

Activity Info
- activity name
- activity description
- date
- start time
- url

Hotels and Restaurants
- business_name
- address
- url
- date or date range
- business_description

Here is a travel itinerary for the group vacation. Reference information from this itinerary to answer questions.

Use a polite and concise tone when responding. Format the responses so it is intuitive and easy for users to read from a text messaging app.  Organize the information in an easy to read format (e.g., use bulllet points, format as outlines, include url links, format data in tables when appropriate.)
```{itinerary_txt}```
"""

In [None]:
# Create memory for chat
# History is exposed to the prompt under the "chat_history" key and, because
# return_messages=True, is returned as message objects rather than one string.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
# Create chain for chat
# Cell-local import so this cell is self-contained
from langchain.prompts import MessagesPlaceholder

# Prompt layout: system instructions, then the prior conversation, then the new question.
# The memory returns a list of message objects (return_messages=True), so the history
# is inserted with MessagesPlaceholder; formatting it into an ("ai", "{chat_history}")
# string template would turn the whole list into one AI message containing its repr.
question_prompt = ChatPromptTemplate.from_messages([
    ("system", "{system_instructions}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}")
])
# system_instructions is pre-filled with partial(), leaving "chat_history" (from
# memory) and "question" (the chain input key) as the only prompt inputs.
question_chain = ConversationChain(
    llm=client, 
    prompt=question_prompt.partial(system_instructions=system_instructions),
    input_key='question',
    memory=memory,
    verbose=True
)

In [None]:
# Create chat response 
def ask_question(question: str):
    """
    Send a user question to the chat chain and return the updated conversation.

    Args:
        question: The text typed by the user. Empty or whitespace-only input
            is ignored, so no model call is made and nothing is added to memory.

    Returns:
        A 2-tuple `(conversation_text, "")`: the whole conversation rendered by
        `format_history`, and an empty string used to clear the input textbox.
    """
    if question and question.strip():
        # The reply is stored in the chain's memory, so the return value is not needed
        question_chain.predict(question=question)
    raw_history = question_chain.memory.load_memory_variables({})["chat_history"]
    return format_history(raw_history), ""

In [None]:
# Converts instances of messages into single str for displaying
def format_history(messages):
    """
    Render a sequence of chat messages as one SMS-style string.

    Human messages are prefixed with "You: ", AI messages with "TravelBot: "
    and followed by a dashed separator line, system messages are left out,
    and any other message is prefixed with its capitalized `type`.

    Example output:
    You: Hello
    TravelBot: Hi there!
    ----------
    You: What's the weather?
    TravelBot: It's sunny and 75°F.
    ----------
    """
    def render(message):
        # Returns the display line for one message, or None to omit it
        if isinstance(message, HumanMessage):
            return f"You: {message.content}"
        if isinstance(message, AIMessage):
            return f"TravelBot: {message.content}\n----------"
        if isinstance(message, SystemMessage):
            return None
        return f"{message.type.capitalize()}: {message.content}"

    rendered = (render(message) for message in messages)
    return "\n".join(line for line in rendered if line is not None)

In [None]:
def clear_memory():
    """
    Reset the chatbot's conversation memory.

    Returns a 2-tuple: a notice to show in the conversation box and an
    empty string for the input textbox.
    """
    memory.clear()
    notice = "Chat memory cleared. Starting a new conversation!"
    return notice, ""

In [None]:
# --- Gradio Interface ---
# Layout: read-only conversation box, message input, and Send / Clear Memory buttons.
with gr.Blocks() as demo:
    gr.Markdown("## 📱 Chatbot with Memory (iPhone-like UI)")
    
    with gr.Row():
        chat_display = gr.Textbox(
            label="Conversation",
            interactive=False,
            value="",
            lines=15,
            elem_id="chat-history-box"
        )
    
    with gr.Row():
        user_input = gr.Textbox(label="Type your message")
    
    with gr.Row():
        send_btn = gr.Button("Send")
        clear_btn = gr.Button("Clear Memory")
    
    # ask_question and clear_memory both return two values (conversation text,
    # new input-box text), so every handler lists both output components.
    # The Send button previously listed only chat_display, which did not match
    # the two returned values and left the typed message in the input box.
    send_btn.click(
        ask_question, 
        inputs=user_input, 
        outputs=[chat_display, user_input]
    )
    # Pressing Enter in the input box behaves the same as clicking Send
    user_input.submit(
        ask_question, 
        inputs=user_input, 
        outputs=[chat_display, user_input]
    )
    clear_btn.click(
        clear_memory, 
        outputs=[chat_display, user_input]
    )


In [None]:
# Test chat response function
# Manual smoke test: pick one question, then uncomment the ask_question call
# to exercise the chain without the Gradio UI.
question ="What is the schedule of activities for each day of the trip?" 
# question ="What is the schedule for August 4, 2025? Include times, event names and travelers' names when appropriate." 
# question ="What was the first question I asked?" 
# question ="What are some of the names of the people going on the trip?" 
# ask_question(question)



In [None]:
# Start the Gradio app.
# NOTE(review): share=True presumably publishes a publicly reachable link — confirm this
# is intended, since the prompt embeds names and a phone number from the itinerary.
demo.launch(share=True)

In [None]:
# Shut the Gradio app down when finished testing
demo.close()