-
Notifications
You must be signed in to change notification settings - Fork 3
/
flight_data_scraperv1.1.py
237 lines (200 loc) · 11.5 KB
/
flight_data_scraperv1.1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from playwright.sync_api import sync_playwright,TimeoutError
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime,timedelta
# Landing page of the ITA Matrix search form that the scraper drives.
url = "https://matrix.itasoftware.com"
# Workbook the scraped fare tables are written to (relative to the working directory).
out_path = "output.xlsx"
def origin_input():
    """
    Prompt for the origin airport until a well-formed code is entered.

    Whitespace around the answer is ignored. The answer must be exactly three
    ASCII letters (IATA format). It is NOT checked against a list of real airports.

    Returns:
        str: the accepted code, upper-cased.
    """
    while True:
        origin = input("Origin (3 Char IATA Format): ").strip()
        # str.isalpha() on its own also accepts accented and non-Latin letters,
        # which can never be part of an IATA code, so ASCII is required explicitly.
        if len(origin) == 3 and origin.isascii() and origin.isalpha():
            return origin.upper()
        print("Origin input is invalid. Please provide in the suggested form.")
def destination_input():
    """
    Prompt for the destination airport until a well-formed code is entered.

    Whitespace around the answer is ignored. The answer must be exactly three
    ASCII letters (IATA format). It is NOT checked against a list of real airports.

    Returns:
        str: the accepted code, upper-cased.
    """
    while True:
        destination = input("Destination (3 Char IATA Format): ").strip()
        # str.isalpha() on its own also accepts accented and non-Latin letters,
        # which can never be part of an IATA code, so ASCII is required explicitly.
        if len(destination) == 3 and destination.isascii() and destination.isalpha():
            return destination.upper()
        print("Destination input is invalid. Please provide in the suggested form.")
def start_date_input():
    """
    Ask for the start date until the answer parses as MM/DD/YY.

    Only the format and calendar validity are checked (via datetime.strptime).
    The date is not checked for being in the past or too far in the future.

    Returns:
        str: the text exactly as the user typed it (not re-formatted).
    """
    date_format = "%m/%d/%y"
    while True:
        typed = input("Start Date ( Format : MM/DD/YY) ")
        try:
            parsed = datetime.strptime(typed, date_format)
        except ValueError:
            print("Error! Please provide dates in suggested format (MM/DD/YY).")
        else:
            # Echo the parsed value back so the user can confirm what was understood.
            print("Succesful, date you provided is: ", parsed.strftime(date_format))
            return typed
def end_date_input():
    """
    Ask for the end date until the answer parses as MM/DD/YY.

    Only the format and calendar validity are checked (via datetime.strptime).
    The date is not compared with the start date, and is not checked for being
    in the past or too far in the future.

    Returns:
        str: the text exactly as the user typed it (not re-formatted).
    """
    date_format = "%m/%d/%y"
    while True:
        typed = input("End Date ( Format : MM/DD/YY) ")
        try:
            parsed = datetime.strptime(typed, date_format)
        except ValueError:
            print("Error! Please provide dates in suggested format (MM/DD/YY).")
        else:
            # Echo the parsed value back so the user can confirm what was understood.
            print("Succesful, date you provided is: ", parsed.strftime(date_format))
            return typed
def wait_for_table_load(page, max_tries=5, timeout=10000):
    """
    Wait for the results container to appear after a search.

    Each attempt waits on the 'div[id="cdk-accordion-child-0"]' locator. A TimeoutError
    (the name imported at the top of the file) from that wait is caught and the wait is
    retried, up to max_tries attempts in total.
    Ver 1.1 -- the number of attempts is bounded because sometimes the page never loads
    and an unbounded retry loop would hang the program.

    Args:
        page: object exposing locator(selector).wait_for(timeout=...); the caller passes
            the Playwright page it created.
        max_tries: maximum number of wait attempts. Values below 1 mean no attempt is
            made and True is returned.
        timeout: value forwarded to wait_for(timeout=...) on every attempt.

    Returns:
        bool: False when the element showed up, True when every attempt timed out
        (the caller treats True as "give up and restart").
    """
    for attempt in range(1, max_tries + 1):
        print("Waiting for tables to load...")
        try:
            page.locator('div[id="cdk-accordion-child-0"]').wait_for(timeout=timeout)
        except TimeoutError:
            print(f"TimeoutError... Retrying...Total Tries :{attempt}")
        else:
            print("Tables loaded... Parsing the screen...")
            return False
    print(f"This took more than {max_tries} tries. Breaking out and restarting.")
    return True
def wait_for_carrier_table_load(page, carrier, max_tries=5, timeout=10000):
    """
    Wait for a carrier's table header to appear after its logo was clicked.

    Each attempt waits on the 'thead[class="ng-star-inserted"]' locator. A TimeoutError
    (the name imported at the top of the file) from that wait is caught and the wait is
    retried, up to max_tries attempts in total.
    Ver 1.1 -- the number of attempts is bounded because sometimes the page never loads
    and an unbounded retry loop would hang the program.

    Args:
        page: object exposing locator(selector).wait_for(timeout=...); the caller passes
            the Playwright page it created.
        carrier: carrier name, used only in the progress messages.
        max_tries: maximum number of wait attempts. Values below 1 mean no attempt is
            made and True is returned.
        timeout: value forwarded to wait_for(timeout=...) on every attempt.

    Returns:
        bool: False when the element showed up, True when every attempt timed out.
    """
    for attempt in range(1, max_tries + 1):
        try:
            page.locator('thead[class="ng-star-inserted"]').wait_for(timeout=timeout)
        except TimeoutError:
            print(f"a problem occured while loading the tables for {carrier}... Retrying...{attempt}")
        else:
            print(f"got the table for {carrier}... ")
            return False
    return True
def read_screen(page,starting_date,ending_date,dataframes):
    """
    Read the table currently shown on the page and append it to the collected results.

    The first HTML table with role="table" is parsed with pandas. Rows that have no value
    in the "Airline filter_alt" column are dropped, and the search dates are attached as
    "start_date" / "end_date" columns before the rows are appended.

    Args:
        page: object whose content() returns the page HTML as text.
        starting_date: start date string of the current search; also used for the sheet name.
        ending_date: end date string of the current search.
        dataframes: DataFrame holding the rows collected so far (may be empty).

    Returns:
        tuple: (sheet_name, dataframes) where sheet_name is starting_date with "/" replaced
        by "." and dataframes is a new DataFrame (re-indexed from 0) with the rows appended.
    """
    from io import StringIO  # function-scope import so the file's import block is untouched

    # Passing literal HTML text to read_html is deprecated since pandas 2.1; the supported
    # way to parse an in-memory document is to hand over a file-like object.
    tables = pd.read_html(StringIO(page.content()), attrs={"role": "table"})
    fares = tables[0]
    # .copy() makes the filtered rows an independent frame, so the column assignments
    # below do not write into a slice of the parsed table (SettingWithCopyWarning).
    fares = fares[fares["Airline filter_alt"].notnull()].copy()
    fares["start_date"] = starting_date
    fares["end_date"] = ending_date
    dataframes = pd.concat([dataframes, fares], ignore_index=True)
    # Presumably dots are used because "/" is not allowed in Excel sheet names.
    sheet_name = starting_date.replace("/", ".")
    return sheet_name, dataframes
def flight_data_scraper(origin,destination,start_date,end_date,search_days=180):
    """
    Run one search per departure day and store each day's carrier tables in the workbook.

    Starting at start_date, one search is made for every consecutive day up to and including
    start_date + search_days. The trip length (end_date - start_date) is kept constant, so the
    end date moves with the start date. For every search the carriers listed in the upper table
    header are clicked one by one and the table shown for each is collected through read_screen.
    Each day's rows are written to a sheet of the module-level out_path workbook.

    Args:
        origin: origin airport code typed into the form (upper-cased here).
        destination: destination airport code typed into the form (upper-cased here).
        start_date: first departure date, "MM/DD/YY".
        end_date: return date belonging to start_date, "MM/DD/YY". Only its distance from
            start_date is used. No check is made that it is not before start_date.
        search_days: number of days after start_date that are also searched (default 180).

    Returns:
        pandas.DataFrame: the rows collected for the last date that was processed
        (empty if nothing could be collected).

    Raises:
        ValueError: if start_date or end_date does not match "MM/DD/YY".
    """
    import os  # function-scope import: only needed for the workbook existence check below

    # Upper bound on click attempts per carrier, so a header that never becomes clickable
    # cannot hang the run.
    max_click_tries = 5

    # Dates are parsed once up front; the difference is the trip length reused for every search.
    start_date_formatted = datetime.strptime(start_date, "%m/%d/%y")
    end_date_formatted = datetime.strptime(end_date, "%m/%d/%y")
    time_delta = end_date_formatted - start_date_formatted
    date_list = pd.date_range(start_date_formatted, start_date_formatted + timedelta(days=search_days), freq="D")

    # Defined before the loop so the final return is valid even if every date is skipped.
    dataframes = pd.DataFrame()

    with sync_playwright() as p:
        for starting_date in date_list:
            # The form expects MM/DD/YYYY strings, so both dates are re-formatted per iteration.
            print("Starting Scraper...")
            ending_date_in_dt = starting_date + timedelta(days=time_delta.days)
            ending_date = datetime.strftime(ending_date_in_dt, "%m/%d/%Y")
            starting_date = datetime.strftime(starting_date, "%m/%d/%Y")
            # Same naming rule read_screen applies; set here so the name exists even when
            # no carrier table could be read for this date.
            sheet_name = starting_date.replace("/", ".")

            browser = None
            while True:
                try:
                    # Fill the form and start the search. When the page takes too long, the
                    # TimeoutError is caught below, the browser is closed and the attempt restarts.
                    print("Starting Browser...")
                    browser = p.firefox.launch(headless=True, timeout=10000)
                    page = browser.new_page()
                    page.goto(url)
                    print("Page is now loaded...")
                    page.type("input[id=mat-mdc-chip-list-input-1]", destination.upper())
                    page.type("input[id=mat-mdc-chip-list-input-0]", origin.upper())
                    page.get_by_placeholder("Start Date").fill(starting_date)
                    page.get_by_placeholder("End Date").fill(ending_date)
                    print("Filled parameters...")
                    page.keyboard.down(key="Enter")
                    page.keyboard.up(key="Enter")
                    print("Clicking search button...")
                    # Ver 1.1 -- no_wait_after=True so page readiness is checked manually below.
                    page.click('//button[@value="Search"]', no_wait_after=True)
                    print("Search completed...")
                    # Ver 1.1 -- a True result means the results never showed up; raising here
                    # reuses the restart path of the except block.
                    if wait_for_table_load(page):
                        raise TimeoutError("Timelimit exceeded")
                    break
                except TimeoutError:
                    print("Loading took longer than expected... Closing the page and retrying...")
                    # The launch itself can time out, in which case there is nothing to close.
                    if browser is not None:
                        browser.close()
                        browser = None
                    print("Page is now closed... Restarting the process...")

            try:
                # Carrier names come from the header row of the upper table; clicking a
                # carrier's logo shows a separate table with only that carrier's prices.
                soup = BeautifulSoup(page.content(), "lxml")
                upper_table_content = soup.find("mat-table", attrs={"role": "table"})
                headers = None
                if upper_table_content is not None:
                    headers = upper_table_content.find("mat-header-row")
                if headers is None:
                    print(f"No carrier table found for date {sheet_name}, skipping this date.")
                    continue
                carriers = [header_cell.text for header_cell in headers.find_all("mat-header-cell")]

                # Fresh frame per date: every sheet holds only that date's rows.
                dataframes = pd.DataFrame()
                # The first header cell is skipped, as it is not clicked as a carrier.
                for carrier in carriers[1:]:
                    print("Iterating over carriers... Next carrier is: ", carrier)
                    table_loaded = False
                    for _ in range(max_click_tries):
                        try:
                            page.get_by_role("columnheader", name=f"{carrier} Carrier logo", exact=True).get_by_role("img", exact=True).click(no_wait_after=True)
                            # The table does not appear instantly; wait for it to be present.
                            if wait_for_carrier_table_load(page, carrier):
                                # Ver 1.1 -- reload and skip this carrier for the day instead of
                                # retrying forever.
                                # NOTE(review): assumes the results are still shown after reload -- confirm.
                                print("Couldn't get the table for the carrier, skipping.")
                                page.reload()
                            else:
                                table_loaded = True
                            break
                        except TimeoutError:
                            print("TimeoutError...Retrying...")
                    else:
                        print(f"Giving up on {carrier} after {max_click_tries} failed attempts, skipping.")
                    if not table_loaded:
                        continue
                    sheet_name, dataframes = read_screen(page, starting_date, ending_date, dataframes)

                if dataframes.empty:
                    # Nothing was collected; do not replace an existing sheet with a blank one.
                    print(f"No data collected for date {sheet_name}, nothing written.")
                else:
                    # Append mode needs an existing workbook, and if_sheet_exists is only
                    # accepted in append mode, so the first write creates the file instead.
                    if os.path.exists(out_path):
                        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
                    else:
                        writer_options = {"mode": "w"}
                    with pd.ExcelWriter(out_path, engine="openpyxl", **writer_options) as writer:
                        dataframes.to_excel(writer, sheet_name=f"{sheet_name}", index=False)
                print(f"Completed the search for date {sheet_name}... Continuing for the next day...")
            finally:
                # Always release the browser for this date, also on unexpected errors.
                browser.close()
        print("All searches are now completed... Please go ahead and check your excel file...")
        return dataframes
# Script entry: ask for the route first, then the dates, and run the scraper with the answers.
origin, destination = origin_input(), destination_input()
start_date, end_date = start_date_input(), end_date_input()
flight_data_scraper(origin, destination, start_date, end_date)