In [1]:
#Import Dependencies
import datetime
import random
import time
from pathlib import Path

import numpy as np
import pandas as pd
import regex as re
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def button_press(action):
    """Return a randomized pause length (seconds) for a simulated user action.

    Args:
        action: "keystroke", "enter_search" or "fare_click" select a dedicated
            range; any other value (callers pass 0) selects the short default range.

    Returns:
        A float drawn uniformly from the range associated with ``action``.
    """
    delay_ranges = (
        ("keystroke", (.091315816564, .15978136531)),
        # This range needs to stay above 30 seconds to accommodate for the
        # pause the website imposes when performing a search
        ("enter_search", (30.2551845131843, 33.3189651876)),
        ("fare_click", (1.9098741610684, 2.3498135130)),
    )

    for name, (low, high) in delay_ranges:
        if action == name:
            return random.uniform(low, high)

    # Anything unrecognized gets the generic short pause
    return random.uniform(.2098741610684, .3498135130)

In [3]:
def get_dates(depart_day,trip_length,number_of_weeks,start_date=None):
    """Build weekly departure/return date strings starting from the next departure day.

    Args:
        depart_day: Weekday to depart on, using ``date.weekday()`` numbering
            (Monday is 0, Sunday is 6).
        trip_length: Number of days between the departure and the return.
        number_of_weeks: How many consecutive weekly trips to generate.
        start_date: ``datetime.date`` to count from. Defaults to today's date;
            pass an explicit date to get a reproducible schedule.

    Returns:
        A tuple ``(depart_, return_)`` of two equally long lists of
        ``'%m/%d/%Y'`` strings. If ``start_date`` already falls on
        ``depart_day`` it is used as the first departure.
    """
    if start_date is None:
        start_date = datetime.date.today()

    # Days until the next occurrence of depart_day (0 when start_date is that weekday)
    days_until_departure = (depart_day - start_date.weekday()) % 7
    first_departure = start_date + datetime.timedelta(days=days_until_departure)
    stay = datetime.timedelta(days=trip_length)

    depart_ = []
    return_ = []
    for week in range(0,number_of_weeks):
        departure = first_departure + datetime.timedelta(weeks=week)
        depart_.append(departure.strftime('%m/%d/%Y'))
        return_.append((departure + stay).strftime('%m/%d/%Y'))

    return depart_,return_

In [4]:
def move_and_click(element):
    """Move the pointer onto ``element`` and then click it.

    Each of the two gestures is performed through the module-level ``action``
    chain and is followed by a short default pause from ``button_press``.
    """
    for gesture in (action.move_to_element, action.click):
        gesture(element).perform()
        time.sleep(button_press(0))

In [5]:
def set_time_of_day(element):
    """Set a time-of-day control to its top entry ("All Day").

    The control is clicked to open it, UP is pressed seven times to reach the
    top of the list, and the control is clicked again.

    Args:
        element: The time-of-day WebElement to operate on.

    Uses the module-level ``action`` chain; returns nothing.
    """
    # Select the element
    move_and_click(element)

    # Move up to "All Day".
    # (An earlier variant pressed DOWN four times instead to reach "Early Evening".)
    for _ in range(0,7):
        # send_keys only queues the key press; perform() is what actually sends
        # it, so it has to be called per key for the pauses to pace the typing.
        action.send_keys(Keys.UP).perform()
        time.sleep(button_press("keystroke"))

    action.click(element).perform()
    time.sleep(button_press(0))

In [6]:
def enter_airport(airport,element):
    """Replace the contents of an airport field with ``airport``.

    Args:
        airport: Airport code to type, one character at a time.
        element: The airport input WebElement to click into.

    Uses the module-level ``action`` chain; returns nothing.
    """
    def _press(key):
        # One key press followed by a keystroke-length pause
        action.send_keys(key).perform()
        time.sleep(button_press("keystroke"))

    move_and_click(element)

    # Delete any 3-letter airport code that may currently be in the field
    for _ in range(3):
        _press(Keys.BACKSPACE)

    # Write in the new airport code
    for character in airport:
        _press(character)

    time.sleep(button_press(0))

In [7]:
def enter_dates(date,element):
    """Replace the contents of a date field with ``date``.

    Args:
        date: Date text to type, one character at a time.
        element: The date input WebElement to click into.

    Uses the module-level ``action`` chain; returns nothing.
    """
    move_and_click(element)

    # Ten BACKSPACE presses wipe the existing entry, then the new text follows
    erase_count = 10
    while erase_count > 0:
        action.send_keys(Keys.BACKSPACE).perform()
        time.sleep(button_press("keystroke"))
        erase_count -= 1

    for character in date:
        action.send_keys(character).perform()
        time.sleep(button_press("keystroke"))

    time.sleep(button_press(0))

In [8]:
def execute_search(home_airports,visit_airports,depart_dates,return_dates):
    """Search every origin/destination/date combination and record one row per search.

    For each origin in ``home_airports``, each destination in ``visit_airports``
    and each (depart, return) pair obtained by zipping ``depart_dates`` with
    ``return_dates``, the search form is filled in and submitted, the fares are
    read with ``read_fares`` and a row is written through ``build_df``.

    Args:
        home_airports: Iterable of origin airport codes.
        visit_airports: Iterable of destination airport codes.
        depart_dates: Departure dates as text to type into the form
            (e.g. the first list returned by ``get_dates``).
        return_dates: Return dates, paired positionally with ``depart_dates``;
            ``zip`` stops at the shorter of the two.

    Relies on the module-level ``driver`` and ``action`` objects and returns nothing.

    NOTE(review): the page is reloaded after every search, while the time-of-day
    controls are set only once and the airports only in the outer loops. This
    presumably relies on the site keeping form values across reloads - confirm.
    """

    # navigate to the website
    driver.get("https://www.aa.com/booking/find-flights")
    time.sleep(button_press(0))
    
    # Set Time of Day for Departure and Return
    set_time_of_day(driver.find_element(by=By.XPATH,value='//*[@id="segments0.travelTime"]'))
    set_time_of_day(driver.find_element(by=By.XPATH,value='//*[@id="segments1.travelTime"]'))
    
    for from_airport in home_airports:

        # Set the home airports
        enter_airport(from_airport,driver.find_element(by=By.XPATH,value=('//*[@id="segments0.origin"]')))

        for to_airport in visit_airports:

            # Set the destination airport
            enter_airport(to_airport,driver.find_element(by=By.XPATH,value=('//*[@id="segments0.destination"]')))
            
            for (depart_date,return_date) in zip(depart_dates,return_dates):

                # Set the travel dates
                enter_dates(depart_date,driver.find_element(by=By.XPATH,value='//*[@id="segments0.travelDate"]'))
                enter_dates(return_date,driver.find_element(by=By.XPATH,value='//*[@id="segments1.travelDate"]'))

                # ENTER is sent right after the return date has been typed, so focus is
                # presumably still in that date field; this is what submits the search.
                # The long "enter_search" pause then waits for the results.
                action.send_keys(Keys.ENTER).perform()
                time.sleep(button_press("enter_search"))
                
                # Read the fares that have been generated by the search
                departing_fare_element,returning_fare_element = read_fares()
                
                # Create a list to be the new line of the dataframe:
                # scrape timestamp, trip id, then the four search inputs
                new_df_line = [datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                               f"{from_airport}-{to_airport}-{depart_date}-{return_date}",
                               from_airport,
                               to_airport,
                               depart_date,
                               return_date
                              ]
                
                # Build the rest of the line in the new df
                # (build_df appends the fare fields and writes the row to the global df)
                build_df(new_df_line,departing_fare_element,returning_fare_element)
                
                # navigate back to the search form for the next combination
                driver.get("https://www.aa.com/booking/find-flights")
                time.sleep(button_press(0))




In [9]:
def get_trip_length(x):
    """Return the ``data-triptime`` value of a flight ``<li>`` element as a float.

    Args:
        x: An object whose ``prettify()`` yields the element's HTML
            (the callers pass BeautifulSoup ``<li>`` tags).

    Returns:
        The first run of digits inside the ``data-triptime`` attribute of the
        first ``<li ...>`` opening tag, converted to ``float``.

    Raises:
        IndexError: If the opening tag, the attribute or the digits are missing.
    """
    text = str(x.prettify())
    # Narrow step by step: opening tag -> attribute -> digits
    for pattern in ("<li.*?>", "data-triptime=\".*?\"", "[0-9]+"):
        text = re.findall(pattern, text)[0]
    return float(text)

In [10]:
def get_max_trip_length(list_elements):
    """Return the longest acceptable trip time: twice the shortest one found.

    Args:
        list_elements: Flight elements accepted by ``get_trip_length``.

    Returns:
        Two times the smallest trip time among ``list_elements``. The search
        starts from a ceiling of 2**15 minutes, so an empty input (or one with
        no trip below that ceiling) yields ``2 * 2**15``.
    """
    ceiling = 2**15 # minutes
    trip_times = [get_trip_length(element) for element in list_elements]

    # The ceiling goes first so it wins ties, exactly like a strict "<" scan
    quickest = min([ceiling] + trip_times)

    # The maximum trip length is 2x the shortest
    return quickest * 2

In [11]:
def trip_duration_filter(list_elements):
    """Keep only the flights whose trip time is below the allowed maximum.

    Args:
        list_elements: Flight elements accepted by ``get_trip_length``.

    Returns:
        A new list, in the original order, of the elements whose trip time is
        strictly less than ``get_max_trip_length(list_elements)``.
    """
    limit = get_max_trip_length(list_elements)
    return [element for element in list_elements if get_trip_length(element) < limit]

In [12]:
def get_lowest_fare(list_elements):
    """Return the flight element with the smallest ``data-lowestfare`` value.

    Args:
        list_elements: Flight elements whose ``prettify()`` output starts with
            an ``<li ...>`` tag carrying a ``data-lowestfare`` attribute.

    Returns:
        The element with the lowest fare; on a tie the earliest one wins.

    Raises:
        ValueError: If ``list_elements`` is empty.
        IndexError: If an element lacks the tag, the attribute or a number.
    """
    lowest_fare = float("inf")
    lowest_fare_element = None
    for element in list_elements:
        opening_tag = re.findall("<li.*?>",str(element.prettify()))[0]
        fare_attribute = re.findall("data-lowestfare=\".*?\"",opening_tag)[0]
        # Keep the decimal part: matching digits only would turn "199.99" into
        # 199 and make fares within the same dollar indistinguishable.
        fare = float(re.findall(r"[0-9]+(?:\.[0-9]+)?",fare_attribute)[0])
        if fare < lowest_fare:
            lowest_fare = fare
            lowest_fare_element = element

    if lowest_fare_element is None:
        raise ValueError("get_lowest_fare received no flight elements to compare")
    return lowest_fare_element

In [13]:
def get_fare_button(departing_fare_element):
    """Locate the live "MainCabin" fare button that belongs to a parsed flight element.

    Args:
        departing_fare_element: Parsed flight element whose ``prettify()``
            output contains a ``<button ...MainCabin...>`` tag.

    Returns:
        The WebElement found by the module-level ``driver`` for the button's
        ``id="slice<digit>Flight<number>MainCabin"`` attribute.

    Raises:
        IndexError: If no matching button tag or id attribute is present.
    """
    button_tag = re.findall("<button.*?MainCabin.*?>",str(departing_fare_element.prettify()))[0]

    # Raw string: "\d" is not a valid escape in an ordinary string literal,
    # so the pattern must be raw for the backslash to reach the regex engine cleanly.
    button_id_attribute = re.findall(r'id="slice\dFlight[0-9]+MainCabin"',button_tag)[0]

    # The matched text is already of the form id="...", so it drops straight into the XPath predicate
    lowest_fare_id = f"//*[@{button_id_attribute}]"

    return driver.find_element(by=By.XPATH,value=lowest_fare_id)

In [14]:
def sort_by_price():
    """Switch the search results to the "Price (low to high)" sort order.

    Opens the sort menu, presses UP eight times to reach the top of the list,
    presses DOWN twice, and clicks the menu again.

    Uses the module-level ``driver`` and ``action`` objects; returns nothing.
    """
    # Select the sorting menu
    sort_menu_element = driver.find_element(By.XPATH,value='//*[@id="searchResultsSortOptions"]')
    move_and_click(sort_menu_element)

    # Move to "Price (low to high)". Scroll to top and then back down 2 lines.
    # send_keys only queues a key press; perform() sends it, so it is called
    # per key to make the pauses between keystrokes take effect.
    for _ in range(0,8):
        action.send_keys(Keys.UP).perform()
        time.sleep(button_press("keystroke"))

    for _ in range(0,2):
        action.send_keys(Keys.DOWN).perform()
        time.sleep(button_press("keystroke"))

    action.click(sort_menu_element).perform()
    time.sleep(button_press(0))

In [15]:
def build_df(new_df_line,departing_fare_element,returning_fare_element):
    """Complete a result row with fare data and append it to the global ``df``.

    Seven values are appended to ``new_df_line`` in place: the trip price from
    the returning element, then departure time, trip time and stops from the
    departing element, then the same three from the returning element. The
    finished list becomes the next row of ``df``.

    Args:
        new_df_line: List holding the leading columns of the row; it is mutated.
        departing_fare_element: Chosen outbound flight element.
        returning_fare_element: Chosen return flight element.

    Raises:
        IndexError: If one of the ``data-*`` attributes is not found.
    """
    # Pattern for finding everything between quotation marks
    quoted_value = "\"(.*?)\""

    def _attribute_value(attribute_pattern, fare_element):
        # First occurrence of the attribute in the element's HTML, then its quoted value
        attribute_text = re.findall(attribute_pattern, str(fare_element))[0]
        return re.findall(quoted_value, attribute_text)[0]

    per_flight_patterns = (
        "data-departuretime=\".*?\"",
        "data-triptime=\".*?\"",
        "data-tripstops=\".*?\"",
    )

    # Total price comes from the return flight
    new_df_line.append(_attribute_value("data-tripprice=\".*?\"", returning_fare_element))

    # Departing flight data first, returning flight data second
    for fare_element in (departing_fare_element, returning_fare_element):
        for attribute_pattern in per_flight_patterns:
            new_df_line.append(_attribute_value(attribute_pattern, fare_element))

    # Write the new line to the next row of the dataframe
    df.loc[len(df)] = new_df_line
#     df.head()

In [16]:
def read_fares():
    """Pick the cheapest acceptable outbound flight, select it, then pick the cheapest return.

    The outbound results are parsed from the current page, filtered by trip
    duration and reduced to the lowest fare. The results are then sorted by
    price, that fare's button is clicked, and the same filter/lowest-fare
    selection is applied to the return results.

    Uses the module-level ``driver``.

    Returns:
        A tuple ``(departing_fare_element, returning_fare_element)`` of parsed
        flight elements.
    """
    # Get the html from the page
    html = driver.page_source
    html_soup = soup(html,'html.parser')
    
    # Identifying the list elements that contain flight and price information
    depart_elements = trip_duration_filter(html_soup.find_all("li",id=re.compile("slice0Flight[0-9]+")))
    
    # Get the element for the lowest fare on the departure page
    departing_fare_element = get_lowest_fare(depart_elements)
    
    # Get the button for the lowest fare element
    departing_fare_button = get_fare_button(departing_fare_element)

    # sort the results by price, so that the lowest price is no longer hidden
    sort_by_price()
    
    # Click the button for the lowest fare flight option
    move_and_click(departing_fare_button)
    # Actually wait for the "fare_click" delay: calling button_press alone only
    # computes a duration, and the page is read immediately afterwards.
    time.sleep(button_press("fare_click"))
        
    # Get the html from the page
    html = driver.page_source
    html_soup = soup(html,'html.parser')
    
    # Identifying the list elements that contain flight and price information
    return_elements = trip_duration_filter(html_soup.find_all("li",id=re.compile("slice1Flight[0-9]+")))
    
    # Get the element for the lowest fare on the return page
    returning_fare_element = get_lowest_fare(return_elements)
    
    return departing_fare_element,returning_fare_element

In [17]:
# Results table: the first six columns are filled in by execute_search and the
# remaining seven by build_df, in exactly this order.
columns = ["timestamp","trip_id","depart_airport","return_airport","depart_date","return_date","total_cost","depart_datetime","depart_duration","depart_stops","return_datetime","return_duration","return_stops"]
# columns = ["trip_id"]
df = pd.DataFrame(columns=columns)

# create webdriver object (ChromeDriverManager supplies the chromedriver binary)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

# create action chain object; the helper functions share this single instance
action = ActionChains(driver)

#Define Airports to Search
home_airports = ["SNA","LAX"]
visit_airports = ["PHX","ORD"]

# depart_day uses date.weekday() numbering (Monday is 0):
# "3" represents Thursday
# "4" represents Friday
depart_day = 3
trip_length = 4 # This value minus 1 gives the total number of non-travel days spent at the destination
number_of_weeks = 26
depart_dates,return_dates = get_dates(depart_day,trip_length,number_of_weeks)
# print(depart_dates)

# The search itself is launched from the next cell
# #Execute the search
# execute_search(home_airports,visit_airports,depart_dates,return_dates)




[WDM] - Current google-chrome version is 102.0.5005
INFO:WDM:Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
INFO:WDM:Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\morroe1\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache
INFO:WDM:Driver [C:\Users\morroe1\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache


In [18]:
# Get the timestamp for the save file
analysis_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

try:
    # Execute the search; rows collected so far stay in df even if this raises
    execute_search(home_airports,visit_airports,depart_dates,return_dates)
finally:
    # Always end the WebDriver session, even when the search fails part-way.
    # quit() shuts the browser and the driver process down, whereas close()
    # only closes the current window.
    driver.quit()

In [19]:
# Preview the first ten rows collected by the search
df.head(10)

Unnamed: 0,timestamp,trip_id,depart_airport,return_airport,depart_date,return_date,total_cost,depart_datetime,depart_duration,depart_stops,return_datetime,return_duration,return_stops
0,2022-06-23 04:40:11,SNA-PHX-06/23/2022-06/27/2022,SNA,PHX,06/23/2022,06/27/2022,353.0,06-23-2022 12:56:00,78,0,06-27-2022 10:13:00,88,0
1,2022-06-23 04:41:02,SNA-PHX-06/30/2022-07/04/2022,SNA,PHX,06/30/2022,07/04/2022,269.0,06-30-2022 12:56:00,78,0,07-04-2022 10:18:00,88,0
2,2022-06-23 04:41:52,SNA-PHX-07/07/2022-07/11/2022,SNA,PHX,07/07/2022,07/11/2022,239.0,07-07-2022 13:11:00,79,0,07-11-2022 10:18:00,88,0
3,2022-06-23 04:42:43,SNA-PHX-07/14/2022-07/18/2022,SNA,PHX,07/14/2022,07/18/2022,300.0,07-14-2022 13:11:00,79,0,07-18-2022 14:00:00,79,0
4,2022-06-23 04:43:34,SNA-PHX-07/21/2022-07/25/2022,SNA,PHX,07/21/2022,07/25/2022,300.0,07-21-2022 16:44:00,84,0,07-25-2022 10:18:00,88,0
5,2022-06-23 04:44:30,SNA-ORD-06/23/2022-06/27/2022,SNA,ORD,06/23/2022,06/27/2022,2939.0,06-23-2022 16:44:00,636,1,06-27-2022 09:55:00,251,0
6,2022-06-23 04:45:25,SNA-ORD-06/30/2022-07/04/2022,SNA,ORD,06/30/2022,07/04/2022,582.0,06-30-2022 16:44:00,330,1,07-04-2022 11:52:00,486,1
7,2022-06-23 04:46:42,SNA-ORD-07/07/2022-07/11/2022,SNA,ORD,07/07/2022,07/11/2022,634.0,07-07-2022 10:17:00,330,1,07-11-2022 05:00:00,420,1
8,2022-06-23 04:47:37,SNA-ORD-07/14/2022-07/18/2022,SNA,ORD,07/14/2022,07/18/2022,690.0,07-14-2022 12:36:00,253,0,07-18-2022 05:00:00,387,1
9,2022-06-23 04:48:32,SNA-ORD-07/21/2022-07/25/2022,SNA,ORD,07/21/2022,07/25/2022,771.0,07-21-2022 16:44:00,330,1,07-25-2022 05:00:00,387,1


In [20]:
# Concatenate the airport codes for the file name (e.g. "SNALAX" and "PHXORD")
ha = "".join(home_airports)
va = "".join(visit_airports)
path = f"Data/{analysis_time}_{ha}_{va}.csv"

# to_csv does not create missing folders, so make sure the target folder
# exists before writing the results.
Path(path).parent.mkdir(parents=True,exist_ok=True)
df.to_csv(path,index=False)

In [21]:
# columns = ["timestamp","trip_id","depart_airport","return_airport","depart_date","return_date",
#            "total_cost",
#            "depart_datetime","depart_duration","depart_stops",
#            "return_datetime","return_duration","return_stops"]
# blah = re.findall("<li[^>]+flight-search-results[^>]+>",str(returning_fare_element))[0]
# # print(blah)
# tripprice = re.findall("data-tripprice=\".*?\"",blah)[0]
# new_df_line = ["2022-06-23 2:43:23",
#                "LAX-ORD-6/23/2022",
#                "LAX",
#                "ORD",
#                "6/23/2022",
#                "6/26/2022"
#               ]

# # Set the pattern for finding everything between quotation marks
# pattern = "\"(.*?)\""

# # Add the total price data from the return flight to the list
# new_df_line.append(re.findall(pattern,re.findall("data-tripprice=\".*?\"",str(returning_fare_element))[0])[0])

# # Add the departing flight data to the list
# new_df_line.append(re.findall(pattern,re.findall("data-departuretime=\".*?\"",str(departing_fare_element))[0])[0])
# new_df_line.append(re.findall(pattern,re.findall("data-triptime=\".*?\"",str(departing_fare_element))[0])[0])
# new_df_line.append(re.findall(pattern,re.findall("data-tripstops=\".*?\"",str(departing_fare_element))[0])[0])

# # Add the returning flight data to the list
# new_df_line.append(re.findall(pattern,re.findall("data-departuretime=\".*?\"",str(returning_fare_element))[0])[0])
# new_df_line.append(re.findall(pattern,re.findall("data-triptime=\".*?\"",str(returning_fare_element))[0])[0])
# new_df_line.append(re.findall(pattern,re.findall("data-tripstops=\".*?\"",str(returning_fare_element))[0])[0])
# new_df_line

In [22]:
# # Identifying the list elements that contain flight and price information
# list_elements = html_soup.find_all("li",id=re.compile("slice0Flight[0-9]+"))
# len(list_elements)

In [23]:
# duration_filtered_elements = trip_duration_filter(html_soup.find_all("li",id=re.compile("slice0Flight[0-9]+")))

In [24]:
# # Identifying the list elements that contain flight and price information
# list_elements = duration_filtered_elements
# len(list_elements)

In [25]:
# #     lowest_fare_element = re.findall("<li.*?flight-search-results.*?>",str(departing_fare_element.prettify()))[0]
#     lowest_fare_element = re.findall("<button.*?MainCabin.*?>",str(departing_fare_element.prettify()))[0]
# #     return lowest_fare_element
#     lowest_fare_id = re.findall("id=\"slice\dFlight[0-9]+MainCabin\"",lowest_fare_element)[0]
#     lowest_fare_id = f"//*[@{lowest_fare_id}]"

#     lowest_fare_element = driver.find_element(
#         by=By.XPATH,
#         value=lowest_fare_id
#     )

In [26]:


# i = 0
# lowest_fare = 2**15
# lowest_fare_element = []
# for x in list_elements:
#     list_elements[i] = re.findall("<li.*?>",str(x.prettify()))[0]
#     new_lowest_fare = re.findall("data-lowestfare=\".*?\"",list_elements[i])[0]
#     new_lowest_fare = float(re.findall("[0-9]+",new_lowest_fare)[0])
#     if new_lowest_fare < lowest_fare:
#         lowest_fare = new_lowest_fare
#         lowest_fare_element.append(x)
# #     print(f"{i}: {list_elements[i]}")
# #     print(lowest_fare_element)
#     i += 1
    
# i = 0
# lowest_fare = 2**15
# for x in list_elements:
#     x = re.findall("<li.*?>",str(x.prettify()))[0]
#     new_lowest_fare = re.findall("data-lowestfare=\".*?\"",x)[0]
#     new_lowest_fare = float(re.findall("[0-9]+",new_lowest_fare)[0])
#     if new_lowest_fare < lowest_fare:
#         lowest_fare = new_lowest_fare
#         lowest_fare_element = list_elements[i]
# #     print(f"{i}: {list_elements[i]}")
#     i += 1
# print(lowest_fare_element)

# lowest_fare_element = re.findall("<button.*?MainCabin.*?>",str(lowest_fare_element.prettify()))[0]
# # print(lowest_fare_element)

# lowest_fare_id = re.findall("id=\"slice\dFlight[0-9]+MainCabin\"",lowest_fare_element)[0]
# # print(lowest_fare_id)
# lowest_fare_id = f"//*[@{lowest_fare_id}]"
# # print(lowest_fare_id)

# lowest_fare_element = driver.find_element(
#     by=By.XPATH,
#     value=lowest_fare_id
# )
# print(lowest_fare_element)

In [27]:
# # sort the results by price, so that the lowest price is no longer hidden
# sort_menu_element = driver.find_element(By.XPATH,value='//*[@id="searchResultsSortOptions"]')
# move_and_click(sort_menu_element)

# # Move down to "Price (low to high)"
# for x in range(0,2):
#     action.send_keys(Keys.DOWN)
#     time.sleep(button_press("keystroke"))

# action.click(sort_menu_element).perform()
# time.sleep(button_press(0))
    
# # Click the button for the lowest flight option
# move_and_click(lowest_fare_element)

In [28]:
# Identifying the list elements that contain flight and price information for return flights
# # Get the html from the page
# html_return = driver.page_source
# html_soup_return = soup(html_return,'html.parser')

# return_elements = html_soup_return.find_all("li",id=re.compile("slice1Flight\d"))

# i = 0
# for x in return_elements:
# #     list_elements[i] = re.search("<li\s*>\w",str(x.prettify()))
#     return_elements[i] = re.findall("<li.*?>",str(x.prettify()))[0]
#     print(f"{i}: {return_elements[i]}")
#     i += 1

In [29]:
#This tool was used solely in early development for identifying HTML elements that contain information


# button_elements = html_soup.findAll('button')
# i = 0
# for x in button_elements:
#     print(f"{i}: {x}")
#     i += 1

In [30]:
# driver.close()

In [31]:
# return_elements = html_soup_return.find_all("li",id=re.compile("slice1Flight\d"))
# i = 0
# for x in return_elements:
# #     list_elements[i] = re.search("<li\s*>\w",str(x.prettify()))
# #     return_elements[i] = re.findall("<li.*?>",str(x.prettify()))[0]
#     return_elements[i] = re.findall("<li[^>]+flight-search-results[^>]+>",str(x))
#     print(f"{i}: {return_elements[i]}")
#     i += 1
    
# return_elements = html_soup_return.find_all("li",id=re.compile("slice1Flight\d"))

In [32]:
# '''
# <li
# data-arrivaltime="06-26-2022 08:25:00"
# data-departuretime="06-26-2022 07:25:00"
# data-flightindex="6"
# data-lowestfare="666.00"
# data-tripprice="666.00"
# data-tripstops="0"
# data-triptime="180"
# '''
# all_keys = [
#     "data-arrivaltime",
#     "data-departuretime",
#     "data-flightindex",
#     "data-lowestfare",
#     "data-tripprice",
#     "data-tripstops",
#     "data-triptime",
# ]
# # x.find('li')["aria-describedby"]
# # return_elements[2].find_all('li', class_="flight-search-results")
# dict_vals = []
# for i in return_elements:
#     a = {}
#     for x in all_keys:
#         a[x] = i[x]
#     dict_vals.append(a)
# #     print(return_elements[2][x])