# 0. Luther – Preliminary

## Imports & Versions

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import sys

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [2]:
# not all of these work right now
# (the Beautiful Soup entry is disabled; presumably the class it points at
# has no __version__ attribute — TODO confirm)

list_of_imports = [('Numpy', np), 
                   ('Pandas', pd), 
#                   ('Beautiful Soup', BeautifulSoup), 
                   ('Selenium', webdriver)
                  ]

# report the version of each library, then of the interpreter itself
for label, module in list_of_imports:
    print(f"{label}: {module.__version__}")

print("Python:", sys.version)

Numpy: 1.12.1
Pandas: 0.20.1
Selenium: 3.6.0
Python: 3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


## Things For Later

In [3]:
# if you want to add some random delay between queries:

# delay = np.random.uniform(0.8,3.0)
# time.sleep(delay)

In [4]:
# Compute the timestamp exactly one day before now.
# NOTE: `previous` is a datetime object, not a YYYYMMDD string; it still has
# to be formatted, e.g. previous.strftime("%Y%m%d"), before it can be used
# as the date part of a search URL.
today = datetime.now()
one_day = timedelta(days=1)
previous = today - one_day
previous

datetime.datetime(2017, 10, 2, 10, 24, 46, 635160)

In [5]:
# change flight_num and route_num to args when ready for use

def flight_url_maker(date_str, flight_num='AAL321', route_num='0130Z',
                     origin='KLGA', destination='KORD'):
    """
    Builds the FlightAware history URL for one flight/route/day.
    ---
    IN: date_str — date string YYYYMMDD (str), or a date/datetime object
            (it is formatted to YYYYMMDD),
        flight_num (str), route_num (str),
        origin (str, airport code as it appears in the URL, e.g. 'KLGA'),
        destination (str, airport code as it appears in the URL, e.g. 'KORD')
    OUT: search URL (str)
    """

    # accept date/datetime objects as well as pre-formatted strings
    if hasattr(date_str, 'strftime'):
        date_str = date_str.strftime('%Y%m%d')

    # .../live/flight/<flight>/history/<date>/<route>/<origin>/<destination>
    search_url = (f'http://flightaware.com/live/flight/{flight_num}'
                  f'/history/{date_str}/{route_num}/{origin}/{destination}')

    return search_url

## Scraping, Take 1

### Goal 1: scrape every record of a particular flight number on the LGA-ORD route back to a certain date

All the airlines operating flights between LGA and ORD:
* American
* United
* Republic (for American Eagle, Delta Connection, United Express — FA labels as Delta, but all pictures are American livery)
* SkyWest (for AE, DC, **UE**)
* Spirit

What info do we want? (italics are bonus):
* Airline
* Flight number
* Date
* Scheduled arrival time 
* Actual arrival time
* *Aircraft type*
* *Average fare*

Key URLs:
* http://flightaware.com/live/airport/KORD — ORD main, live updates
* http://flightaware.com/live/airport/KORD/arrivals — ORD arrivals, static (?)
* http://flightaware.com/live/airport/KORD/arrivals?;offset=0;order=actualarrivaltime;sort=DESC — change the offset in increments of 40 to view arrivals history for one week back
* http://flightaware.com/live/findflight?origin=KLGA&destination=KORD — LGA-ORD route, no history option, JavaScript (?) sidebar to filter results
* http://flightaware.com/live/flight/AAL321 — live status on flight in progress
* http://flightaware.com/live/flight/AAL321/history/500 – complete (listed) history of a particular flight number (number at end can be adjusted to capture all posted records)
* http://flightaware.com/live/flight/AAL321/history/20170823/0130Z/KLGA/KORD — historic record of specific flight. 0130Z indicates the route number of that flight number. Can change out the route number and date as far back as the records exist (YES). **Scrape this one for info.**

First, check to see if flight was cancelled!

In [6]:
# could make a class for each route/airline

Here's the info we want to capture:

Airline, Flight #, Origin, Destination, Date, Arrival (Sched), Arrival (Actual), *Plane Model*, *Avg Price*

If canceled, input 'C' for Arrivals (or just in Actual).

### Testing Soup

The three main conditions one encounters with a flight page:
* arrived
* canceled
* no flight

In [11]:
# One example URL per page condition; all three use flight AAL321 on route
# 0130Z and differ only in the date segment.
# flight that arrived
arrived_url = 'http://flightaware.com/live/flight/AAL321/history/20170824/0130Z/KLGA/KORD'
# flight that was canceled
canceled_url = 'http://flightaware.com/live/flight/AAL321/history/20170823/0130Z/KLGA/KORD'
# date with no flight on record
noflight_url = 'http://flightaware.com/live/flight/AAL321/history/20160824/0130Z/KLGA/KORD'

In [8]:
# Fetch each example page with plain HTTP (no browser) and parse the HTML.
# An explicit timeout stops a stalled connection from hanging the notebook:
# requests waits indefinitely when no timeout is given.
REQUEST_TIMEOUT = 30  # seconds

resp_arr = requests.get(arrived_url, timeout=REQUEST_TIMEOUT)
soup_arr = BeautifulSoup(resp_arr.text, 'html5lib')

resp_c = requests.get(canceled_url, timeout=REQUEST_TIMEOUT)
soup_c = BeautifulSoup(resp_c.text, 'html5lib')

resp_nf = requests.get(noflight_url, timeout=REQUEST_TIMEOUT)
soup_nf = BeautifulSoup(resp_nf.text, 'html5lib')

In [9]:
print(soup_arr.prettify())

<!DOCTYPE html>
<html class="responsive-full" dir="LTR" lang="en">
 <!--
Looking at the source?  Great!
  FlightAware is looking for UI/UX, web, and backend developers, mathematicians and electrical engineers.

Get in touch with us:
https://flightaware.com/about/careers

-->
 <head>
  <!-- 	Attention! You must agree with FlightAware's Terms of Use at http://flightaware.com/about/termsofuse
  Data robots and collection agents should use http://flightaware.com/commercial/flightxml/
            Copyright 2017 FlightAware.  All Rights Reserved. -->
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="@FlightAware" name="twitter:site"/>
  <title>
   American Airlines (AA)  #321 ✈ 23-Aug-2017  ✈ KLGA - KORD ✈ FlightAware
  </title>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="/opensearch.xml" rel="search" title="FlightAware" type="application/

In [11]:
print(soup_nf.prettify())

<!DOCTYPE html>
<html class="responsive-full" dir="LTR" lang="en">
 <!--
Looking at the source?  Great!
  FlightAware is looking for UI/UX, web, and backend developers, mathematicians and electrical engineers.

Get in touch with us:
https://flightaware.com/about/careers

-->
 <head>
  <!-- 	Attention! You must agree with FlightAware's Terms of Use at http://flightaware.com/about/termsofuse
  Data robots and collection agents should use http://flightaware.com/commercial/flightxml/
            Copyright 2017 FlightAware.  All Rights Reserved. -->
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="@FlightAware" name="twitter:site"/>
  <title>
   ✈ 23-Aug-2016  ✈ KLGA - KORD ✈ FlightAware
  </title>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="/opensearch.xml" rel="search" title="FlightAware" type="application/opensearchdescription+xml"/>


In [12]:
print(soup_c.prettify())

<!DOCTYPE html>
<html class="responsive-full" dir="LTR" lang="en">
 <!--
Looking at the source?  Great!
  FlightAware is looking for UI/UX, web, and backend developers, mathematicians and electrical engineers.

Get in touch with us:
https://flightaware.com/about/careers

-->
 <head>
  <!-- 	Attention! You must agree with FlightAware's Terms of Use at http://flightaware.com/about/termsofuse
  Data robots and collection agents should use http://flightaware.com/commercial/flightxml/
            Copyright 2017 FlightAware.  All Rights Reserved. -->
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="@FlightAware" name="twitter:site"/>
  <title>
   American Airlines (AA)  #321 ✈ 22-Aug-2017  ✈ KLGA - KORD ✈ FlightAware
  </title>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="/opensearch.xml" rel="search" title="FlightAware" type="application/

No matter the date passed in the URL, it spits back a page. The following determines if that page represents an actual flight or not:

In [13]:
def is_flight(soup):
    """
    Tells whether a parsed page describes a real flight.
    ---
    IN: soup (parsed page exposing find_all)
    OUT: True if any <meta> tag has name="airline", otherwise False (bool)
    """
    return any(tag.get('name') == 'airline' for tag in soup.find_all('meta'))

In [14]:
# arrived
is_flight(soup_arr)

True

In [15]:
# canceled
is_flight(soup_c)

True

In [16]:
# no flight on that date
is_flight(soup_nf)

False

The part that shows if a flight was canceled resides within a `div` with `class="flightPageFriendlyIdent"`, and this class name is only used once. Unfortunately, searching for that class returns nothing:

In [17]:
ident = soup_c.find(class_="flightPageFriendlyIdent")
print(ident)

None


This seems to work though:

In [18]:
def is_canceled(soup, expected_count=32):
    """
    Heuristic check for a canceled flight based on the page's text.
    ---
    IN: soup (parsed page exposing .text),
        expected_count (int) — how many times the word 'cancelled' must
            occur in the page text for it to count as a canceled flight
    OUT: True if the count matches exactly, otherwise False (bool)

    The default of 32 is simply what the canceled example page produced.
    The match is exact, so any change to the page layout can silently break
    this check; adjust expected_count rather than editing the function.
    """
    return soup.text.count('cancelled') == expected_count

In [19]:
# canceled
is_canceled(soup_c)

True

In [20]:
# arrived
is_canceled(soup_arr)

False

Arrival times live within a `div` of `class="flightPageDetails"`, then in a `div` of `data-template="live/flight/details"`, and finally in the second `div` of `class="flightPageDataTable"`.

In [21]:
details = soup_arr.find(class_="flightPageDetails")
details.find(attrs={"data-template": "live/flight/details"})

<div data-template="live/flight/details"></div>

This one is empty, so we can't even get to the `'flightPageDataTable'` that we need. Soup is not enough...

#### Testing Selenium

In [24]:
driver = webdriver.Chrome(chromedriver)

In [91]:
driver.get(arrived_url)

Can't say I have high hopes here, but just in case...

In [92]:
soup1 = BeautifulSoup(driver.page_source, 'html.parser')

In [97]:
details1 = soup1.find(class_="flightPageDetails")
details_sub1 = details1.find(attrs={"data-template": "live/flight/details"})
details_sub1

<div data-template="live/flight/details"><!-- Flight details heading -->
<h3 class="flightPageHeading">
Flight Details
<span class="flightPageLastUpdateTimestamp"></span></h3>
<!-- Links -->
<div class="flightPageLinks flightPageLinksDetails">
<div class="halfButtonTable">
<a class="halfButton text-center" href="/live/flight/AAL321/history/20170824/0130Z/KLGA/KORD/tracklog"><span>View track log</span></a>
</div>
<div class="flightPageLinksDetailsBottom">
<a href="/live/findflight/KLGA/KORD">
<span>
All flights between LGA and ORD
</span>
</a>
</div>
</div>
<!-- Flight times -->
<!-- Departure times -->
<h3 class="flightPageDataTableHeading">Departure Times</h3>
<div class="flightPageDataTable">
<!-- Table header -->
<div class="flightPageHeaderRow">
<div class="flightPageHeaderColumn"> </div>
<div class="flightPageHeaderColumn">Gate Departure</div>
<div class="flightPageHeaderColumn">Taxiing</div>
<div class="flightPageHeaderColumn">Takeoff</div>
</div>
<!-- Actual / Estimated -->
<div

It works! Here are the target numbers:
* Landing (actual) – 10:56 PM
* Gate Arrival (actual) – 11:01 PM
* Landing (sched) – 10:25 PM
* Gate Arrival (sched) – 11:09 PM

In [98]:
for i, e in enumerate(details_sub1.find_all('span')):
    print(f"*** {i}.\n{e.text}")

*** 0.

*** 1.
View track log
*** 2.

All flights between LGA and ORD

*** 3.
09:25PM EDT
*** 4.
09:58PM EDT
*** 5.
09:30PM EDT
*** 6.
09:30PM EDT
*** 7.

10:56PM CDT


*** 8.

11:01PM CDT


*** 9.

10:25PM CDT



*** 10.

11:09PM CDT




In [61]:
# here we can get our arrival times

# details_sub1 is the rendered "live/flight/details" section of the 8-24 page
spans = list(details_sub1.find_all('span'))

# spans 7 through 10 hold the four arrival times on this page
arrival_times = [span.text.strip() for span in spans[7:11]]

# loop variable deliberately not called `time`, which would shadow the
# imported time module for the rest of the notebook
for arrival_time in arrival_times:
    print(arrival_time)

10:56PM CDT
11:01PM CDT
10:25PM CDT
11:09PM CDT


In [49]:
search_date = datetime.strptime("20170824", "%Y%m%d").date()
search_date

datetime.date(2017, 8, 24)

In [51]:
# then convert to datetime
# The zone abbreviation is dropped before parsing: strptime's %Z only accepts
# UTC, GMT and the local machine's own zone names, so parsing "CDT" with it
# works on some machines and raises ValueError on others.
arrival_times_conv = [datetime.strptime(t.split(' ')[0], "%I:%M%p").time()
                      for t in arrival_times]
arrival_times_conv2 = [datetime.combine(search_date, t) for t in arrival_times_conv]

# loop variable deliberately not called `time`, which would shadow the
# imported time module
for arrival_dt in arrival_times_conv2:
    print(arrival_dt)

2017-08-24 22:56:00
2017-08-24 23:01:00
2017-08-24 22:25:00
2017-08-24 23:09:00


Practice function:

In [70]:
def get_arrival_times(search_date_string, soup):
    """
    First-pass extractor: reads arrival times from fixed <span> positions.
    ---
    IN: search_date_string YYYYMMDD (str), soup (parsed flight page)
    OUT: list of four datetimes built from spans 7 through 10 of the
         "live/flight/details" section, each stamped with the search date

    Prints every span of that section as a debugging aid.
    """
    flight_day = datetime.strptime(search_date_string, "%Y%m%d").date()
    section = soup.find(class_="flightPageDetails").find(
        attrs={"data-template": "live/flight/details"})
    all_spans = list(section.find_all('span'))

    # debugging
    for pos, span in enumerate(all_spans):
        print(f"*** {pos}.\n{span.text}")

    # the first token of each span's text is the clock time; the zone
    # abbreviation after the space is discarded
    clock_strings = [all_spans[pos].text.strip().split(' ')[0]
                     for pos in range(7, 11)]

    stamped = []
    for clock in clock_strings:
        time_of_day = datetime.strptime(clock, "%I:%M%p").time()
        stamped.append(datetime.combine(flight_day, time_of_day))
    return stamped

Trying it for another flight on another day...

In [25]:
driver.get("http://flightaware.com/live/flight/AAL321/history/20170930/0130Z/KLGA/KORD")

In [26]:
soup2 = BeautifulSoup(driver.page_source, 'html.parser')

Target numbers here are:
* Landing (actual) – 10:32 PM
* Gate Arrival (actual) – 10:45 PM
* Landing (sched) – 10:59 PM
* Gate Arrival (sched) – 11:09 PM

In [79]:
for line in get_arrival_times("20170930", soup2):
    print(line)

*** 0.

*** 1.
View track log
*** 2.
Track inbound plane
*** 3.

All flights between LGA and ORD

*** 4.
09:21PM EDT
*** 5.
09:36PM EDT
*** 6.
09:30PM EDT
*** 7.
09:30PM EDT
*** 8.

10:32PM CDT


*** 9.


10:45PM CDT



*** 10.

10:59PM CDT



*** 11.

11:09PM CDT


2017-09-30 21:30:00
2017-09-30 22:32:00
2017-09-30 22:45:00
2017-09-30 22:59:00


**It's one off — searching `<span>` won't work!!!** There's an extra field (Track Inbound Plane) for recent entries that mucks it all up. Searching for this class instead. First the 8-24 flight:

In [83]:
for i, e in enumerate(details_sub1.find_all(class_="flightPageTimeData")):
    print(f"*** {i}.\n{e.text.strip()}")

*** 0.
09:25PM EDT
*** 1.
33 minutes
*** 2.
09:58PM EDT
*** 3.
09:30PM EDT
*** 4.

*** 5.
09:30PM EDT
*** 6.
Less than 10 minutes
*** 7.

*** 8.

*** 9.
10:56PM CDT
*** 10.
5 minutes
*** 11.
11:01PM CDT
*** 12.
10:25PM CDT
*** 13.

*** 14.
11:09PM CDT
*** 15.

*** 16.

*** 17.
Less than 10 minutes


And then the 9-30 flight:

In [87]:
details2 = soup2.find(class_="flightPageDetails")
details_sub2 = details2.find(attrs={"data-template": "live/flight/details"})

In [88]:
for i, e in enumerate(details_sub2.find_all(class_="flightPageTimeData")):
    print(f"*** {i}.\n{e.text.strip()}")

*** 0.
09:21PM EDT
*** 1.
15 minutes
*** 2.
09:36PM EDT
*** 3.
09:30PM EDT
*** 4.

*** 5.
09:30PM EDT
*** 6.
Less than 10 minutes
*** 7.

*** 8.

*** 9.
10:32PM CDT
*** 10.
13 minutes
*** 11.
10:45PM CDT
*** 12.
10:59PM CDT
*** 13.

*** 14.
11:09PM CDT
*** 15.

*** 16.

*** 17.
Less than 10 minutes


**These indices line up!!!** Take 2.

In [101]:
def get_arrival_times2(search_date_string, soup, verbose=True):
    """
    Pulls the four arrival timestamps out of a rendered flight-history page.
    ---
    IN: search_date_string YYYYMMDD (str),
        soup (parsed page, rendered so the details section is populated),
        verbose (bool) — print every "flightPageTimeData" field as a
            debugging aid (default True)
    OUT: list of four datetimes, in order: landing (actual),
         gate arrival (actual), landing (sched), gate arrival (sched)

    Raises IndexError if the page has fewer time fields than expected, and
    ValueError if one of the selected fields is not an HH:MMAM/PM time.

    NOTE(review): each time is stamped with the date taken from the search
    string; whether that equals the local calendar date of arrival has not
    been checked — confirm before computing delays across midnight.
    """
    # positions of the arrival fields among the "flightPageTimeData" elements
    arrival_field_indices = (9, 11, 12, 14)

    search_date = datetime.strptime(search_date_string, "%Y%m%d").date()
    details = soup.find(class_="flightPageDetails")
    details_sub = details.find(attrs={"data-template": "live/flight/details"})
    fpTimeData = details_sub.find_all(class_="flightPageTimeData")

    if verbose:
        # debugging
        for i, e in enumerate(fpTimeData):
            print(f"*** {i}.\n{e.text.strip()}")

    # collect the clock strings first, dropping the zone abbreviation
    # ("10:56PM CDT" -> "10:56PM")
    clock_strings = [fpTimeData[i].text.strip().split(' ')[0]
                     for i in arrival_field_indices]

    arrival_times = []
    for clock in clock_strings:
        time_of_day = datetime.strptime(clock, "%I:%M%p").time()
        arrival_times.append(datetime.combine(search_date, time_of_day))
    return arrival_times

Testing both flights:

In [103]:
for line in get_arrival_times2("20170824", soup1):
    print(line)

*** 0.
09:25PM EDT
*** 1.
33 minutes
*** 2.
09:58PM EDT
*** 3.
09:30PM EDT
*** 4.

*** 5.
09:30PM EDT
*** 6.
Less than 10 minutes
*** 7.

*** 8.

*** 9.
10:56PM CDT
*** 10.
5 minutes
*** 11.
11:01PM CDT
*** 12.
10:25PM CDT
*** 13.

*** 14.
11:09PM CDT
*** 15.

*** 16.

*** 17.
Less than 10 minutes
2017-08-24 22:56:00
2017-08-24 23:01:00
2017-08-24 22:25:00
2017-08-24 23:09:00


In [104]:
for line in get_arrival_times2("20170930", soup2):
    print(line)

*** 0.
09:21PM EDT
*** 1.
15 minutes
*** 2.
09:36PM EDT
*** 3.
09:30PM EDT
*** 4.

*** 5.
09:30PM EDT
*** 6.
Less than 10 minutes
*** 7.

*** 8.

*** 9.
10:32PM CDT
*** 10.
13 minutes
*** 11.
10:45PM CDT
*** 12.
10:59PM CDT
*** 13.

*** 14.
11:09PM CDT
*** 15.

*** 16.

*** 17.
Less than 10 minutes
2017-09-30 22:32:00
2017-09-30 22:45:00
2017-09-30 22:59:00
2017-09-30 23:09:00


**SUCCESS!!**

#### But first a better way to find out if a flight is canceled...

In [33]:
driver = webdriver.Chrome(chromedriver)

In [34]:
driver.get(canceled_url)

In [35]:
soup_can = BeautifulSoup(driver.page_source, 'html.parser')

In [36]:
'cancelled' in soup_can.find(class_="flightPageSummary").text.casefold()

True

Good... and then with a completed flight:

In [37]:
'cancelled' in soup2.find(class_="flightPageSummary").text.lower()

False

Success.

In [38]:
# run when ready to shut down the driver

# quit() ends the whole WebDriver session (every window plus the driver
# process); close() would only close the current browser window.
driver.quit()

#### Goal 2: scrape arrivals or route pages to find all flight numbers serving the route

*Not part of MVP*

#### Goal 3: build function that takes flight and route numbers and scrapes history for each

In [121]:
# All the LGA-ORD flights:

flight_urls = [
"http://flightaware.com/live/flight/AAL321/history/20171003/0130Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL371/history/20171003/0030Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL359/history/20171002/2330Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL361/history/20171002/2230Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL383/history/20171002/2200Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL345/history/20171002/2130Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL343/history/20171002/2030Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL358/history/20171002/1930Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL187/history/20171002/1830Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL332/history/20171002/1730Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL390/history/20171002/1630Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL337/history/20171002/1530Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL328/history/20171002/1430Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL373/history/20171002/1330Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL1619/history/20171002/1230Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL289/history/20171002/1130Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL304/history/20171002/1030Z/KLGA/KORD"
]

In [111]:
import re

In [126]:
# compile the patterns once, outside the loop
fn_p = re.compile(r'AAL\d{3,4}')   # flight number, e.g. AAL321 or AAL1619
rn_p = re.compile(r'\d{4}Z')       # route number, e.g. 0130Z

# collect one (flight number, route number) pair per URL
flights = []
for url in flight_urls:
    fn = fn_p.search(url).group()
    rn = rn_p.search(url).group()
    flights.append((fn,rn))

flights

[('AAL321', '0130Z'),
 ('AAL371', '0030Z'),
 ('AAL359', '2330Z'),
 ('AAL361', '2230Z'),
 ('AAL383', '2200Z'),
 ('AAL345', '2130Z'),
 ('AAL343', '2030Z'),
 ('AAL358', '1930Z'),
 ('AAL187', '1830Z'),
 ('AAL332', '1730Z'),
 ('AAL390', '1630Z'),
 ('AAL337', '1530Z'),
 ('AAL328', '1430Z'),
 ('AAL373', '1330Z'),
 ('AAL1619', '1230Z'),
 ('AAL289', '1130Z'),
 ('AAL304', '1030Z')]