In [23]:
# Import Splinter, BeautifulSoup and dependencies
import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient
from splinter import Browser
import time
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt

In [24]:
# Connect to the MongoDB server on port 27017
mongo = MongoClient(port=27017)

# Two handles onto the same 'kayak' database
db = mongo['kayak']
dbtwo = mongo['kayak']

# Collection handles used by the notebook.
# NOTE: the name `airports` is rebound to a plain Python list by the
# Wikipedia scraping cell further down, which hides this collection handle.
flight_search = db['flight_search']
airports = dbtwo['airports']

In [None]:
# flight_search.delete_many({}) #empty the collection

# Airport Code Web Scrape

In [4]:
# Open a Chrome session, pause briefly, then load the Wikipedia page
# listing the busiest airports by passenger traffic
url = "https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic"
browser = Browser('chrome')
time.sleep(2)
browser.visit(url)


In [5]:
# Parse the HTML currently rendered in the browser and grab the first table
# whose class attribute is exactly 'wikitable sortable jquery-tablesorter'
html = browser.html
soup = bs(html, 'html.parser')
table = soup.find(
    'table',
    class_='wikitable sortable jquery-tablesorter',
)

In [10]:
# Build one list of cell texts per table row.
# `airports` ends up as a list of lists, one inner list per <tr> that
# contains at least one <td>.
airports = []
rows = table.find_all('tr', class_=False)
for row in rows:
    cells = row.find_all('td')
    # Rows without any <td> (e.g. header rows built from <th>) carry no
    # data; skipping them avoids appending empty records.
    if not cells:
        continue
    # Strip surrounding whitespace so trailing newlines from the HTML do not
    # leak into the values (and later into the split airport codes).
    airports.append([cell.text.strip() for cell in cells])


In [12]:
# Column labels, assigned positionally to the cells of each scraped row
airport_columns = [
    "Rank",
    "Airport",
    "Location",
    "Country",
    "A/P Code",
    "Total Passengers",
    "Rank Change",
    "Rank pct change",
]

# Turn the list of scraped rows into a DataFrame
airports_df = pd.DataFrame(data=airports, columns=airport_columns)

In [13]:
# Discard the two rank-change columns; they are not used afterwards
airports_df = airports_df.drop(["Rank Change", "Rank pct change"], axis=1)

In [14]:
# Remove every row that has a missing value in any column
airports_df = airports_df.dropna(axis="index", how="any")

In [15]:
# Split the "IATA/ICAO" pair stored in A/P Code into two separate columns.
# - .str.strip() removes surrounding whitespace first, so a trailing newline
#   from the scraped cell text does not end up in 'ICAO Code'.
# - n=1 splits on the first '/' only, so the result never has more than the
#   two columns being assigned.
airports_df[['IATA', 'ICAO Code']] = (
    airports_df['A/P Code']
    .str.strip()
    .str.split('/', n=1, expand=True)
)

In [22]:
# Write the airports table to CSV (without the index) so collaborators can use it
airports_df.to_csv(path_or_buf='../Output/airports_df.csv', index=False)

# Kayak.com Web Scrape For Airports

In [None]:
# Complete loop for destination airports and kayak flights
#
# For each destination airport, load the Kayak round-trip results page from
# `departure`, parse every result card into a flat record, and bulk-insert
# all records into the `flight_search` MongoDB collection.

MAX_DESTINATIONS = 40   # number of destination airports to query
PAGE_LOAD_SECONDS = 5   # pause after each page visit before reading the HTML
RESULT_CARD_CLASS = "nrc6 nrc6-mod-pres-multi-fare"


def _parse_leg(leg_info, leg_summary):
    """Extract the details of one flight leg.

    Parameters
    ----------
    leg_info : element with class 'VY2U' (holds the time spans and airline)
    leg_summary : element with class 'xdW8' (holds duration and route)

    Returns
    -------
    dict with keys departure_time, arrival_time, airline, duration, route.

    Raises
    ------
    IndexError / AttributeError when an expected element is missing.
    """
    spans = leg_info.find_all('span')
    return {
        "departure_time": spans[0].text.strip(),
        # spans[1] is deliberately skipped, presumably a separator between
        # the two times -- TODO confirm against the live markup
        "arrival_time": spans[2].text.strip(),
        "airline": leg_info.find('div', class_='c_cgF c_cgF-mod-variant-default').text.strip(),
        "duration": leg_summary.find('div', class_='vmXl vmXl-mod-variant-default').text.strip(),
        "route": leg_summary.find('div', class_='EFvI').text.strip(),
    }


def _parse_trip_card(card, trip_id, from_date, to_date):
    """Build one trip record (outbound leg, return leg, price, tier) from a result card.

    Raises IndexError / AttributeError when the card lacks an expected element.
    """
    leg_infos = card.find_all('div', class_='VY2U')
    leg_summaries = card.find_all('div', class_='xdW8')
    outbound = _parse_leg(leg_infos[0], leg_summaries[0])
    inbound = _parse_leg(leg_infos[1], leg_summaries[1])
    return {
        "trip_id": trip_id,
        "outbound_date": from_date,
        "return_date": to_date,
        "outbound_departure_time": outbound["departure_time"],
        "outbound_arrival_time": outbound["arrival_time"],
        "outbound_flight_duration": outbound["duration"],
        "outbound_route": outbound["route"],
        "outbound_airline": outbound["airline"],
        "return_departure_time": inbound["departure_time"],
        "return_arrival_time": inbound["arrival_time"],
        "return_flight_duration": inbound["duration"],
        "return_route": inbound["route"],
        "return_airline": inbound["airline"],
        "price": card.find('div', class_="f8F1-price-text").text.strip(),
        "tier": card.find('div', class_="aC3z-name aC3z-mod-ellipsis").text.strip(),
    }


airport_codes = airports_df['IATA'].tolist()
departure = 'MCI'

from_date = input("Enter departure date (YYYY-MM-DD): ")
to_date = input("Enter return date (YYYY-MM-DD): ")

trip_data = []
browser = Browser('chrome')
try:
    # Slice, don't index: airport_codes[40] is a single string, and looping
    # over it would visit one URL per *character* of that airport code.
    for destination in airport_codes[:MAX_DESTINATIONS]:
        url = f"https://www.kayak.com/flights/{departure}-{destination}/{from_date}/{to_date}"
        browser.visit(url)
        time.sleep(PAGE_LOAD_SECONDS)
        print(url)

        # Create a Beautiful Soup object from the rendered page
        soup = bs(browser.html, 'html.parser')

        # The results container can be absent (no flights, error page, ...);
        # skip this destination instead of crashing the whole run.
        list_wrapper = soup.find(id="listWrapper")
        if list_wrapper is None:
            print(f"No results list found for {destination}; skipping")
            continue

        # trip_id restarts at 0 for every destination
        for trip_id, card in enumerate(list_wrapper.find_all('div', class_=RESULT_CARD_CLASS)):
            try:
                trip_data.append(_parse_trip_card(card, trip_id, from_date, to_date))
            except (AttributeError, IndexError) as err:
                # One malformed card must not discard everything scraped so far
                print(f"Skipping trip {trip_id} for {destination}: {err!r}")
finally:
    # Always release the Chrome session, even if scraping failed part-way
    browser.quit()

# Insert all trip data into the flight_search MongoDB collection.
# insert_many() rejects an empty list, so only call it when there is data.
if trip_data:
    flight_search.insert_many(trip_data)
else:
    print("No trips were scraped; nothing inserted")

# Testing: Kayak Web Scraping and HTML Investigation

In [None]:
# kayak URL testing
# Print the Kayak search URL that would be visited for every destination
# airport, without opening a browser.
# The IATA codes come from `airports_df`, the frame built earlier in the
# notebook; no `airport_codes_df` is ever defined.
airport_codes = airports_df['IATA'].tolist()
departure = 'MCI'

from_date = input("Enter departure date (YYYY-MM-DD): ")
to_date = input("Enter return date (YYYY-MM-DD): ")

# Loop through each airport code
for airport in airport_codes:
    destination = airport
    url = f"https://www.kayak.com/flights/{departure}-{destination}/{from_date}/{to_date}"
    print(url)


In [None]:
# single URL test for web scraping elements
# Load one fixed Kayak results page, wait five seconds, then parse the HTML
test_url = 'https://www.kayak.com/flights/MCI-SAN/2024-07-04/2024-07-11'
browser = Browser('chrome')
browser.visit(test_url)
time.sleep(5)
soup = bs(browser.html, 'html.parser')

In [None]:
# single URL test
# Print the full text of every result card inside the first #listWrapper element
result_cards = soup.find_all(id="listWrapper")[0].find_all(
    'div', class_="nrc6 nrc6-mod-pres-multi-fare"
)
for card in result_cards:
    print(card.text)

In [None]:
# single URL test
# Collect departure time, arrival time and airline for every 'VY2U' div on the page
flight_times_list = []

for leg in soup.find_all('div', class_='VY2U'):
    spans = leg.find_all('span')
    airline_div = leg.find('div', class_='c_cgF c_cgF-mod-variant-default')
    # The first span is read as the departure time and the third as the arrival time
    flight_times_list.append(dict(
        departure_time=spans[0].text.strip(),
        arrival_time=spans[2].text.strip(),
        airline=airline_div.text.strip(),
    ))

print(flight_times_list)

In [None]:
# single URL test
# Collect the outbound and return flight duration of every result card
flight_durations_list = []
duration_class = 'vmXl vmXl-mod-variant-default'

for card in soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare"):
    # First 'xdW8' div is read as the outbound leg, second as the return leg
    legs = card.find_all('div', class_='xdW8')
    flight_durations_list.append({
        "outbound_flight_duration": legs[0].find('div', class_=duration_class).text.strip(),
        "return_flight_duration": legs[1].find('div', class_=duration_class).text.strip(),
    })

print(flight_durations_list)

In [None]:
# single URL test
# Collect the outbound and return route text of every result card
flight_routes_list = []

for card in soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare"):
    # First 'xdW8' div is read as the outbound leg, second as the return leg
    legs = card.find_all('div', class_='xdW8')
    route_pair = {
        "outbound_route": legs[0].find('div', class_='EFvI').text.strip(),
        "return_route": legs[1].find('div', class_='EFvI').text.strip(),
    }
    flight_routes_list.append(route_pair)
print(flight_routes_list)

In [None]:
# single URL test
# Collect the text of every price element found inside the result cards
price_list = []
price_list.extend(
    price_div.text.strip()
    for card in soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare")
    for price_div in card.find_all('div', class_="f8F1-price-text")
)

print(price_list)

In [None]:
# single URL test
# Collect the text of every fare-tier element found inside the result cards
tier_list = []
tier_list.extend(
    tier_div.text.strip()
    for card in soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare")
    for tier_div in card.find_all('div', class_="aC3z-name aC3z-mod-ellipsis")
)

print(tier_list)

In [None]:
# single URL test
# Combine all text elements into one for loop.
# flight_data collects, across all result cards:
#   flight_times     - one entry per 'VY2U' div inside each card
#   flight_durations - one entry per card (outbound + return duration)
#   flight_routes    - one entry per card (outbound + return route)
#   prices / tiers   - one entry per matching element inside each card
flight_data = {
    "flight_times": [],
    "flight_durations": [],
    "flight_routes": [],
    "prices": [],
    "tiers": []
}

for i in soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare"):
    # Search inside the current card (i), not the whole page (soup);
    # searching soup would re-append every flight on the page for every card.
    for flight in i.find_all('div', class_='VY2U'):
        flight_times = flight.find_all('span')
        departure_time = flight_times[0].text.strip()
        arrival_time = flight_times[2].text.strip()
        airline_name = flight.find('div', class_='c_cgF c_cgF-mod-variant-default')
        airline = airline_name.text.strip()
        flight_data["flight_times"].append({
            "departure_time": departure_time,
            "arrival_time": arrival_time,
            "airline": airline
        })

    # Index the two 'xdW8' divs directly (first = outbound, second = return).
    # Looping over them while indexing [0]/[1] would append the same
    # duration/route record once per leg.
    duration_route = i.find_all('div', class_='xdW8')
    outbound_flight_duration = duration_route[0].find('div', class_='vmXl vmXl-mod-variant-default').text.strip()
    return_flight_duration = duration_route[1].find('div', class_='vmXl vmXl-mod-variant-default').text.strip()
    flight_data["flight_durations"].append({
        "outbound_flight_duration": outbound_flight_duration,
        "return_flight_duration": return_flight_duration
    })

    outbound_route = duration_route[0].find('div', class_='EFvI').text.strip()
    return_route = duration_route[1].find('div', class_='EFvI').text.strip()
    flight_data["flight_routes"].append({
        "outbound_route": outbound_route,
        "return_route": return_route
    })

    # Extract all price text elements
    for price_text in i.find_all('div', class_="f8F1-price-text"):
        flight_data["prices"].append(price_text.text.strip())

    # Extract all tier text elements
    for tier_text in i.find_all('div', class_="aC3z-name aC3z-mod-ellipsis"):
        flight_data["tiers"].append(tier_text.text.strip())

pprint(flight_data)


In [None]:
# single URL test
# Build one flat record per result card and collect the records in a list
single_url_scrape = []

airline_class = 'c_cgF c_cgF-mod-variant-default'
duration_class = 'vmXl vmXl-mod-variant-default'

result_cards = soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare")
for trip_id, card in enumerate(result_cards):
    # Outbound leg: first 'VY2U' div and first 'xdW8' div of the card
    out_info = card.find_all('div', class_='VY2U')[0]
    out_spans = out_info.find_all('span')
    out_departure = out_spans[0].text.strip()
    out_arrival = out_spans[2].text.strip()
    out_airline = out_info.find('div', class_=airline_class).text.strip()
    out_duration = card.find('div', class_='xdW8').find('div', class_=duration_class).text.strip()
    out_route = card.find('div', class_='xdW8').find('div', class_='EFvI').text.strip()

    # Return leg: second 'VY2U' div and second 'xdW8' div of the card
    ret_info = card.find_all('div', class_='VY2U')[1]
    ret_spans = ret_info.find_all('span')
    ret_departure = ret_spans[0].text.strip()
    ret_arrival = ret_spans[2].text.strip()
    ret_airline = ret_info.find('div', class_=airline_class).text.strip()
    ret_duration = card.find_all('div', class_='xdW8')[1].find('div', class_=duration_class).text.strip()
    ret_route = card.find_all('div', class_='xdW8')[1].find('div', class_='EFvI').text.strip()

    # Price and fare tier shown on the card
    price = card.find('div', class_="f8F1-price-text").text.strip()
    tier = card.find('div', class_="aC3z-name aC3z-mod-ellipsis").text.strip()

    # Key order determines the column order of the DataFrame built from these records
    single_url_scrape.append({
        "trip_id": trip_id,
        "outbound_departure_time": out_departure,
        "outbound_arrival_time": out_arrival,
        "outbound_flight_duration": out_duration,
        "outbound_route": out_route,
        "outbound_airline": out_airline,
        "return_departure_time": ret_departure,
        "return_arrival_time": ret_arrival,
        "return_flight_duration": ret_duration,
        "return_route": ret_route,
        "return_airline": ret_airline,
        "price": price,
        "tier": tier,
    })

In [None]:
# single URL test
# Tabulate the scraped trip records; the bare name on the last line
# renders the frame as the cell output
kayak_df = pd.DataFrame(data=single_url_scrape)
kayak_df

In [None]:
# single URL test
# Print the text of every 'M_JD-large-display' div inside the first #listWrapper element
large_display_divs = soup.find_all(id="listWrapper")[0].find_all(
    'div', class_="M_JD-large-display"
)
for block in large_display_divs:
    print(block.text)

In [None]:
# single URL test
# Extract booking URL
# Collect the booking link of every result card: the href of the first <a>
# in the card, prefixed with the Kayak base URL when it is not already absolute.
kayak_url = 'https://kayak.com'
booking_links = []

for i in soup.find_all(id="listWrapper")[0].find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare"):
    booking_link_tag = i.find("a")
    # Skip cards that have no anchor or an anchor without an href
    if booking_link_tag is None or not booking_link_tag.get('href'):
        continue
    flight_booking_url = booking_link_tag['href']
    # assumes non-absolute hrefs are site-relative paths starting with '/' -- TODO confirm
    if flight_booking_url.startswith('http'):
        booking_link = flight_booking_url
    else:
        booking_link = f"{kayak_url}{flight_booking_url}"
    booking_links.append(booking_link)

pprint(booking_links)

In [None]:
# Close the browser session held in `browser`
browser.quit()