-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
225 lines (188 loc) · 9.12 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
from random import randint
from time import sleep, strftime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# ---------------------------------------------------------------------------
# Settings for the scheduled run performed by main().
# ---------------------------------------------------------------------------
BACKUP_DIR = 'search_backups'        # folder that receives one .xlsx per search
SEARCH_ROUNDS = 5                    # how many times the search is repeated
ROUND_PAUSE_SECONDS = 60 * 60 * 4    # wait 4 hours after each search

# Shared Selenium session. It is created by main() and read as a module
# global by every scraping function below.
driver = None


def parse_price(text):
    """Convert a scraped price label such as ``'$1,234'`` into an ``int``.

    The dollar sign, thousands separators and surrounding whitespace are
    removed first. Returns ``None`` when nothing but digits is expected and
    the remaining text is empty or not purely numeric (blank cell, "Info").
    """
    digits = text.replace('$', '').replace(',', '').strip()
    return int(digits) if digits.isdigit() else None


def _texts(xpath):
    """Return the visible text of every element matching *xpath*."""
    return [element.text for element in driver.find_elements(By.XPATH, xpath)]


def _split_legs(values):
    """Split an alternating list into (outbound, return) halves.

    The result page lists the outbound leg on even indexes and the return
    leg on odd indexes.
    """
    return values[::2], values[1::2]


def _token(text, index):
    """Return the whitespace-separated token at *index*, or '' if absent."""
    tokens = text.split()
    return tokens[index] if index < len(tokens) else ''


def _durations_and_routes(sections):
    """Separate the flight time from the cities for each section text."""
    durations = []
    routes = []
    for text in sections:
        tokens = text.split()
        durations.append(''.join(tokens[0:2]))
        routes.append(''.join(tokens[2:5]))
    return durations, routes


def load_more():
    """Click the "more results" button once and wait for the page to load.

    Best effort: when the button is missing or cannot be clicked, the
    WebDriver error is ignored and the function returns without sleeping.
    """
    more_results = '//a[@class = "moreButton"]'
    try:
        driver.find_element(By.XPATH, more_results).click()
    except WebDriverException:
        return
    # Printing these notes during the program helps me quickly check what it is doing
    print('sleeping.....')
    sleep(randint(45, 60))


def page_scrape():
    """Scrape the flight results currently shown by the global ``driver``.

    Returns:
        A DataFrame with one row per result. Outbound columns are prefixed
        ``Out``, inbound columns ``Return``; ``Price`` holds the price as an
        int and ``timestamp`` the scrape time formatted ``%Y%m%d-%H%M``.

    Raises:
        SystemExit: when no "section duration" elements are found. The
            original author treats this as the sign of a reCAPTCHA.
        ValueError: raised by pandas when the scraped columns do not all
            have the same length.
    """
    sections_list = _texts('//*[@class="section duration"]')
    section_a_list, section_b_list = _split_legs(sections_list)

    # if you run into a reCaptcha, the lists above are empty.
    # you could add a sleep here instead, to solve the captcha by hand and
    # continue scraping; exiting restarts everything from scratch.
    if not section_a_list:
        raise SystemExit('no flight results found (reCAPTCHA?) - stopping')

    # I'll use the letter A for the outbound flight and B for the inbound
    a_duration, a_section_names = _durations_and_routes(section_a_list)
    b_duration, b_section_names = _durations_and_routes(section_b_list)

    a_date_list, b_date_list = _split_legs(_texts('//div[@class="section date"]'))
    # Separating the weekday from the day
    a_day = [_token(value, 0) for value in a_date_list]
    a_weekday = [_token(value, 1) for value in a_date_list]
    b_day = [_token(value, 0) for value in b_date_list]
    b_weekday = [_token(value, 1) for value in b_date_list]

    # getting the prices; labels without a number (blank text) are skipped
    price_labels = _texts('//a[@class="booking-link"]/span[@class="price option-text"]')
    prices_list = [price for price in map(parse_price, price_labels)
                   if price is not None]

    # the stops are a big list with one leg on the even index and second leg on odd index
    # only the first character is kept; the "n" of "nonstop" becomes "0"
    stops_list = [text[:1].replace('n', '0')
                  for text in _texts('//div[@class="section stops"]/div[1]')]
    a_stop_list, b_stop_list = _split_legs(stops_list)

    stops_cities_list = _texts('//div[@class="section stops"]/div[2]')
    a_stop_name_list, b_stop_name_list = _split_legs(stops_cities_list)

    # this part gets me the airline company and the departure and arrival times, for both legs
    hours_list = []
    carrier_list = []
    for text in _texts('//div[@class="section times"]'):
        lines = text.split('\n')
        hours_list.append(lines[0])
        carrier_list.append(lines[1] if len(lines) > 1 else '')
    # split the hours and carriers, between a and b legs
    a_hours, b_hours = _split_legs(hours_list)
    a_carrier, b_carrier = _split_legs(carrier_list)

    cols = ['Out Day', 'Out Time', 'Out Weekday', 'Out Airline', 'Out Cities',
            'Out Duration', 'Out Stops', 'Out Stop Cities',
            'Return Day', 'Return Time', 'Return Weekday', 'Return Airline',
            'Return Cities', 'Return Duration', 'Return Stops',
            'Return Stop Cities',
            'Price']
    flights_df = pd.DataFrame({'Out Day': a_day,
                               'Out Weekday': a_weekday,
                               'Out Duration': a_duration,
                               'Out Cities': a_section_names,
                               'Return Day': b_day,
                               'Return Weekday': b_weekday,
                               'Return Duration': b_duration,
                               'Return Cities': b_section_names,
                               'Out Stops': a_stop_list,
                               'Out Stop Cities': a_stop_name_list,
                               'Return Stops': b_stop_list,
                               'Return Stop Cities': b_stop_name_list,
                               'Out Time': a_hours,
                               'Out Airline': a_carrier,
                               'Return Time': b_hours,
                               'Return Airline': b_carrier,
                               'Price': prices_list}, columns=cols)
    flights_df['timestamp'] = strftime("%Y%m%d-%H%M")  # so we can know when it was scraped
    return flights_df


def _scrape_current_view(ordinal, sort_label):
    """Scrape the results on screen, tag them with *sort_label* and pause."""
    print('loading more.....')
    # load_more()   # disabled in the original script as well
    print('starting {} scrape.....'.format(ordinal))
    frame = page_scrape()
    frame['sort'] = sort_label
    sleep(randint(60, 80))
    return frame


def start_kayak(city_from, city_to, date_start, date_end):
    """Run one flexible-date search and save the results to an Excel file.

    The results are scraped three times (sorted by best, cheapest and
    quickest), combined, and written to
    ``search_backups/<timestamp>_flights_<from>-<to>_from_<start>_to_<end>.xlsx``.
    Progress notes are printed along the way.

    Args:
        city_from: departure city code - it's the IATA code!
        city_to: destination city code - it's the IATA code!
        date_start: departure date, format YYYY-MM-DD.
        date_end: return date, format YYYY-MM-DD.

    Raises:
        SystemExit: propagated from page_scrape() when no results are found.
    """
    kayak = ('https://www.kayak.com/flights/' + city_from + '-' + city_to +
             '/' + date_start + '-flexible/' + date_end + '-flexible?sort=bestflight_a')
    driver.get(kayak)
    sleep(randint(8, 10))

    # sometimes a popup shows up, so we try to close it; when there is no
    # popup (or it cannot be clicked) we simply carry on
    xp_popup_close = '//button[contains(@id,"dialog-close") and contains(@class,"Button-No-Standard-Style close ")]'
    try:
        driver.find_elements(By.XPATH, xp_popup_close)[5].click()
    except (IndexError, WebDriverException):
        pass
    sleep(randint(60, 95))

    df_flights_best = _scrape_current_view('first', 'best')

    # Let's also get the lowest prices from the matrix on top
    matrix = driver.find_elements(By.XPATH, '//*[contains(@id,"FlexMatrixCell")]')
    matrix_prices = [price for price in (parse_price(cell.text) for cell in matrix)
                     if price is not None]
    if matrix_prices:  # the matrix may be missing or hold no readable price
        matrix_min = min(matrix_prices)
        matrix_avg = sum(matrix_prices) / len(matrix_prices)
        print('matrix prices: min {} / avg {:.0f}'.format(matrix_min, matrix_avg))

    print('switching to cheapest results.....')
    cheap_results = '//a[@data-code = "price"]'
    driver.find_element(By.XPATH, cheap_results).click()
    sleep(randint(60, 90))
    df_flights_cheap = _scrape_current_view('second', 'cheap')

    print('switching to quickest results.....')
    quick_results = '//a[@data-code = "duration"]'
    driver.find_element(By.XPATH, quick_results).click()
    sleep(randint(60, 90))
    df_flights_fast = _scrape_current_view('third', 'fast')

    # saving a new dataframe as an excel file. the name is custom made to your cities and dates
    final_df = pd.concat([df_flights_cheap, df_flights_best, df_flights_fast])
    os.makedirs(BACKUP_DIR, exist_ok=True)  # to_excel does not create folders
    file_name = '{}_flights_{}-{}_from_{}_to_{}.xlsx'.format(strftime("%Y%m%d-%H%M"),
                                                             city_from, city_to,
                                                             date_start, date_end)
    final_df.to_excel(os.path.join(BACKUP_DIR, file_name), index=False)
    print('saved df.....')

    # We can keep track of what they predict and how it actually turns out!
    xp_loading = '//div[contains(@id,"advice")]'
    loading = driver.find_element(By.XPATH, xp_loading).text
    xp_prediction = '//span[@class="info-text"]'
    prediction = driver.find_element(By.XPATH, xp_prediction).text
    # sometimes we get this string in the loading variable;
    # just change it to "Not sure" if it happens
    weird = '¯\\_(ツ)_/¯'
    if loading == weird:
        loading = 'Not sure'
    print(loading + '\n' + prediction)


def main():
    """Open Chrome, ask for the search details and repeat the search.

    The search runs SEARCH_ROUNDS times with ROUND_PAUSE_SECONDS of sleep
    after each run. The browser is always closed on the way out, including
    when a scrape aborts with SystemExit.
    """
    global driver
    driver = webdriver.Chrome()
    try:
        sleep(2)
        kayak = 'https://www.kayak.com/flights/LIS-SIN/2019-07-29-flexible/2019-08-15-flexible?sort=bestflight_a'
        driver.get(kayak)
        sleep(3)

        # For an unattended run replace the prompts with fixed values, e.g.
        # 'LIS', 'SIN', '2019-08-21', '2019-09-07'.
        city_from = input('From which city? ')
        city_to = input('Where to? ')
        date_start = input('Search around which departure date? Please use YYYY-MM-DD format only ')
        date_end = input('Return when? Please use YYYY-MM-DD format only ')

        for n in range(SEARCH_ROUNDS):
            start_kayak(city_from, city_to, date_start, date_end)
            print('iteration {} was complete @ {}'.format(n, strftime("%Y%m%d-%H%M")))
            # Wait 4 hours
            sleep(ROUND_PAUSE_SECONDS)
            print('sleep finished.....')

        driver.save_screenshot('pythonscraping.png')
    finally:
        driver.quit()


if __name__ == '__main__':
    main()