In [23]:
# Import Splinter, BeautifulSoup and dependencies
import time
from pathlib import Path
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient
from splinter import Browser


In [24]:
# Connection settings for the MongoDB instance that stores scraped flights
MONGO_PORT = 27017
DB_NAME = 'kayak'
COLLECTION_NAME = 'flight_search'

# Connect to MongoDB on the configured port
mongo = MongoClient(port=MONGO_PORT)

# Handle to the kayak database
db = mongo[DB_NAME]

# Handle to the collection that receives the scraped trips
flight_search = db[COLLECTION_NAME]


In [None]:
# flight_search.delete_many({}) #empty the collection

# Airport Code Web Scrape

In [4]:
# Open a Chrome session and load the Wikipedia list of busiest airports
url = "https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic"
browser = Browser('chrome')
time.sleep(2)  # short pause between launching the browser and navigating
browser.visit(url)


In [5]:
# Parse the rendered page and locate the airport ranking table
html = browser.html
soup = bs(html, 'html.parser')
# Select on individual CSS classes rather than the exact class string, so the
# lookup does not depend on the order or full set of classes on the <table>.
table = soup.select_one('table.wikitable.sortable')
if table is None:
    # Fail here with a clear message instead of an AttributeError in a later cell
    raise ValueError("Could not find the airport table (table.wikitable.sortable) on the page")

In [10]:
# Collect the cell text of every data row in the scraped table
airports = []
rows = table.find_all('tr', class_=False)
for table_row in rows:
    cells = table_row.find_all('td')
    # Rows without any <td> (e.g. header rows built from <th>) carry no data
    if not cells:
        continue
    # Strip surrounding whitespace so trailing newlines do not end up in the data
    airports.append([cell.text.strip() for cell in cells])


In [12]:
# Build the airports DataFrame, labelling the scraped columns in table order
airport_columns = [
    "Rank",
    "Airport",
    "Location",
    "Country",
    "A/P Code",
    "Total Passengers",
    "Rank Change",
    "Rank pct change",
]
airports_df = pd.DataFrame(airports, columns=airport_columns)

In [13]:
# Remove unnecessary fields.
# errors="ignore" keeps this cell re-runnable: airports_df is reassigned here,
# so on a second run the columns are already gone and a plain drop would raise KeyError.
airports_df = airports_df.drop(columns=["Rank Change", "Rank pct change"], errors="ignore")

In [14]:
# Discard every row that has a missing value in any column
airports_df = airports_df.dropna(axis=0, how="any")

In [15]:
# Split the combined "A/P Code" value into its two parts around the "/".
# - strip() removes surrounding whitespace/newlines left over from the scraped cell text
# - n=1 caps the split at two parts so the result always fits the two target columns
code_parts = airports_df['A/P Code'].str.strip().str.split('/', n=1, expand=True)
airports_df[['IATA', 'ICAO Code']] = code_parts

In [22]:
# Export for collaboration purposes
output_path = Path('../Output/airports_df.csv')
# to_csv does not create missing folders, so make sure the target directory exists
output_path.parent.mkdir(parents=True, exist_ok=True)
airports_df.to_csv(output_path, index=False)

# Kayak.com Web Scrape & MongoDB Insertion

In [None]:
# Scrape round-trip results from Kayak for every destination airport and
# store the collected trips in the flight_search MongoDB collection.
airport_codes = airports_df['IATA'].tolist()
departure = 'MCI'
PAGE_LOAD_SECONDS = 10  # pause after each visit so the page can render before its HTML is read


def _read_date(prompt):
    """Prompt for a date and return it as a zero-padded YYYY-MM-DD string.

    Raises:
        ValueError: if the entered text is not a valid YYYY-MM-DD date.
    """
    parsed = time.strptime(input(prompt).strip(), "%Y-%m-%d")
    return time.strftime("%Y-%m-%d", parsed)


def _parse_leg(card, leg_index, prefix):
    """Extract the details of one flight leg from a single result card.

    Args:
        card: BeautifulSoup tag holding one search result.
        leg_index: position of the leg inside the card (0 = outbound, 1 = return).
        prefix: key prefix for the returned fields ("outbound" or "return").

    Returns:
        dict mapping "<prefix>_departure_time", "<prefix>_arrival_time",
        "<prefix>_flight_duration", "<prefix>_route" and "<prefix>_airline"
        to the stripped text scraped from the card.

    Raises:
        IndexError, AttributeError: if the card lacks the expected elements.
    """
    leg_info = card.find_all('div', class_='VY2U')[leg_index]
    leg_times = leg_info.find_all('span')
    leg_summary = card.find_all('div', class_='xdW8')[leg_index]
    return {
        f"{prefix}_departure_time": leg_times[0].text.strip(),
        # index 2, not 1 -- presumably a separator span sits between the two times (TODO confirm)
        f"{prefix}_arrival_time": leg_times[2].text.strip(),
        f"{prefix}_flight_duration": leg_summary.find('div', class_='vmXl vmXl-mod-variant-default').text.strip(),
        f"{prefix}_route": leg_summary.find('div', class_='EFvI').text.strip(),
        f"{prefix}_airline": leg_info.find('div', class_='c_cgF c_cgF-mod-variant-default').text.strip(),
    }


from_date = _read_date("Enter departure date (YYYY-MM-DD): ")
to_date = _read_date("Enter return date (YYYY-MM-DD): ")
# Both dates are zero-padded ISO strings, so string comparison orders them chronologically
if to_date < from_date:
    raise ValueError("Return date must not be earlier than the departure date")

browser = Browser('chrome')
trip_data = []

try:
    # Loop through each airport code
    for airport in airport_codes:
        # Skip missing/blank codes and the departure airport itself
        if not isinstance(airport, str) or not airport.strip():
            continue
        destination = airport.strip()
        if destination == departure:
            continue

        url = f"https://www.kayak.com/flights/{departure}-{destination}/{from_date}/{to_date}"
        browser.visit(url)
        time.sleep(PAGE_LOAD_SECONDS)
        print(url)

        # Create a Beautiful Soup object
        soup = bs(browser.html, 'html.parser')

        # A page without the results container must not abort the whole run
        results = soup.find(id="listWrapper")
        if results is None:
            print(f"No results list found for {destination}; skipping")
            continue

        # Loop through the result cards and extract key metadata.
        # No pause is needed per card: parsing works on HTML that is already loaded.
        cards = results.find_all('div', class_="nrc6 nrc6-mod-pres-multi-fare")
        for trip_id, card in enumerate(cards):
            trip = {
                "trip_id": trip_id,
                # trip_id restarts at 0 for every destination, so record the airports too
                "departure_airport": departure,
                "destination_airport": destination,
                "outbound_date": from_date,
                "return_date": to_date,
            }
            try:
                trip.update(_parse_leg(card, 0, "outbound"))
                trip.update(_parse_leg(card, 1, "return"))
                trip["price"] = card.find('div', class_="f8F1-price-text").text.strip()
                trip["tier"] = card.find('div', class_="aC3z-name aC3z-mod-ellipsis").text.strip()
            except (AttributeError, IndexError) as exc:
                # One card with unexpected markup should not discard everything else
                print(f"Skipping result {trip_id} for {destination}: {exc!r}")
                continue

            trip_data.append(trip)
finally:
    # Always close the browser, even when a page visit or parse step fails
    browser.quit()

# Insert all trip data into the flight_search MongoDB collection
# (insert_many rejects an empty list, so only call it when something was scraped)
if trip_data:
    flight_search.insert_many(trip_data)
else:
    print("No trips were scraped; nothing inserted into MongoDB")