# 1. Luther – Scraping

### Imports & Versions

In [153]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pickle
from datetime import datetime, timedelta
import time
import re
import sys

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
# Path to the chromedriver binary handed to Selenium. Export CHROMEDRIVER_PATH
# before starting the kernel to override it; the default is the original
# hard-coded location.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

In [4]:
# (label, module) pairs whose versions get reported below.
list_of_imports = [
    ('Numpy', np),
    ('Pandas', pd),
    # ('Beautiful Soup', BeautifulSoup),  # entry disabled in the original cell
    ('Selenium', webdriver),
]

# One "<label>: <version>" line per library, then the interpreter version.
for label, module in list_of_imports:
    print(f"{label}: {module.__version__}")

print("Python:", sys.version)

Numpy: 1.12.1
Pandas: 0.20.1
Selenium: 3.6.0
Python: 3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


### Necessary Functions and Variables

In [213]:
def make_flight_url(date_str, flight_num, route_num, origin='KLGA', dest='KORD'):
    """
    Takes date string, flight number, and route number, 
    returns URL to search for a particular flight/route/day. 
    ---
    IN: 
    date_str, YYYYMMDD (str)
    flight_num, e.g. 'AAL321' (str)
    route_num, e.g. '0130Z' (str)
    origin, origin airport code placed in the URL, defaults to 'KLGA' (str)
    dest, destination airport code placed in the URL, defaults to 'KORD' (str)
    
    OUT:
    search URL (str), of the form
    http://flightaware.com/live/flight/<flight_num>/history/<date_str>/<route_num>/<origin>/<dest>
    """

    # The airport pair used to be hard-coded; the defaults keep existing
    # three-argument calls producing exactly the same URL.
    return (f'http://flightaware.com/live/flight/{flight_num}'
            f'/history/{date_str}/{route_num}/{origin}/{dest}')

In [134]:
def scrape_flight_soup(soup, flight_num, search_date):
    """
    Scrapes pertinent information off a single flight page and reports
    what was found for that flight on that day.
    ---
    IN: 
    soup, BS4 object of webpage
    flight_num, flight number as searched on FlightAware (str)
    search_date, date to search (date or datetime obj)
    
    OUT: one of
    'No Flight', if the page has no <meta> tag named 'airline' (str)
    'Canceled', if the page summary text contains 'cancelled' (str)
    four datetimes built from the page's time cells (list); callers store
        them as land_act, arr_act, land_sch, arr_sch, in that order
    None, if the time cells could not be located or parsed (error is printed)

    NOTE(review): each time is combined with search_date as-is, so a time
    that falls after midnight would still carry the search date -- confirm
    this is acceptable for the routes being scraped.
    """

    date_str = datetime.strftime(search_date, "%Y%m%d")

    # is there a flight that day? (flight pages carry an 'airline' meta tag)
    if not any(meta.get('name') == 'airline' for meta in soup.find_all('meta')):
        return 'No Flight'

    # was the flight canceled?
    # Deliberately outside the try below: if the summary element is missing,
    # the AttributeError propagates to the caller rather than returning None.
    if 'cancelled' in soup.find(class_="flightPageSummary").text.lower():
        return 'Canceled'

    # if flight arrived
    try:
        details = soup.find(class_="flightPageDetails")
        details_sub = details.find(attrs={"data-template": "live/flight/details"})
        fptd_divs = details_sub.find_all(class_="flightPageTimeData")

        # pulls from the four relevant indices of fptd_divs
        arrival_times = []
        for i in (9, 11, 12, 14):
            # cell text looks like "10:35PM CDT"; keep only the clock part
            time_str = fptd_divs[i].text.strip().split(' ')[0]
            clock = datetime.strptime(time_str, "%I:%M%p").time()
            arrival_times.append(datetime.combine(search_date, clock))
        return arrival_times

    except Exception as e:
        # best effort: report the problem and let the caller skip this day
        print(f"*** {flight_num}, {date_str}: ERROR: {e}")
        return None

In [212]:
def scrape_fn(days, start_date_str, flight_num, route_num, df=None):
    """
    Goes through a series of steps to gather data for a given flight 
    number and route over a given length of time. Appends each record
    to a dataframe (provided or generated). Also saves df for each 
    round as a .pkl, and the URLs that failed as a .txt.

    Stops early after 7 consecutive days with no flight.

    TODO: make a version that takes either days/fn/rn or a list of URLs.
    ---
    IN:
    days, max number of days to scrape, counting back from the start date (int)
    start_date_str, date from which to search backwards, YYYYMMDD (str)
    flight_num, flight number as searched on FlightAware (str)
    route_num, route number as searched on FlightAware (str)
    df, pandas dataframe to append to (rows are added in place);
        a new one is created if None
    
    OUT: 
    pandas dataframe, list of URLs that raised while loading/scraping (list)
    """
    
    # makes df if none passed
    if df is None:
        df = pd.DataFrame(columns=['airline',
                                   'f_num',
                                   'origin',
                                   'dest',
                                   'date',
                                   'land_act',
                                   'arr_act',
                                   'land_sch',
                                   'arr_sch'])

    # parse the date before launching the browser so a bad string fails fast
    start_date = datetime.strptime(start_date_str, "%Y%m%d").date()
    no_flight_count = 0 
    miss_list = []

    # starts Selenium and sets timeout preferences
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.set_page_load_timeout(20)
        driver.set_script_timeout(20)

        # loop to search each date
        for d in range(days):
            # random pause between requests
            time.sleep(np.random.uniform(1.0,2.0))
            search_date = start_date - timedelta(days=d)
            date_str = datetime.strftime(search_date, "%Y%m%d")
            record_a = ['American', flight_num, 'LGA', 'ORD', search_date]
            flight_url = make_flight_url(date_str, flight_num, route_num)
            
            try:
                driver.get(flight_url)
                flight_soup = BeautifulSoup(driver.page_source, 'html.parser')
                record_b = scrape_flight_soup(flight_soup, flight_num, search_date)
            except Exception as e:
                # typically a page-load timeout: remember the URL for a retry
                print(f"*** {flight_num}, {date_str}: ERROR: {e}")
                miss_list.append(flight_url)
                record_b = None
            
            # NOTE: a page that loads but cannot be parsed also yields None
            # (from scrape_flight_soup) and is skipped WITHOUT being added
            # to miss_list.
            if record_b is None:
                continue 
            elif record_b == 'Canceled':
                no_flight_count = 0
                print(f"{flight_num}, {date_str}: canceled")        
            elif record_b == 'No Flight':
                no_flight_count += 1
                print(f"{flight_num}, {date_str}: no flight")
                if no_flight_count == 7:
                    print(f"{flight_num}: 7 consecutive days of no flights as of {date_str}!")
                    break
            else:
                no_flight_count = 0
                record = record_a + record_b
                print(f"{flight_num}, {date_str}: flight data recorded")
                df.loc[len(df)] = record    
    finally:
        # quit() rather than close(): ends the whole browser session, and
        # the finally guarantees that happens even if the loop raises
        driver.quit()
    
    # pickle the current round
    timestamp = datetime.strftime(datetime.now(), "%m%d_%H%M%S")
    picklepath = f'../data/{flight_num}_{timestamp}.pkl'
    with open(picklepath, 'wb') as picklefile:
        pickle.dump(df, picklefile)
        
    # save missed URLs as text file
    filepath = f'../data/{flight_num}_missed_{timestamp}.txt'
    with open(filepath, 'w') as f:
        for url in miss_list:
            f.write(url+'\n')
    
    return df, miss_list

In [168]:
def multiple_flights(days, start_date_str, flight_list):
    """
    Runs scrape_fn() for every (flight number, route number) pair in
    flight_list, accumulating all records in one shared dataframe and
    all missed URLs in one list.
    ---
    IN: 
    days, number of days to search (int)
    start_date_str, date from which to search BACKWARDS, YYYYMMDD (str)
    flight_list, list of flight number/route numbers (string tuples in list)
    
    OUT: 
    dataframe with all flight info (pandas df), list of missed URLs (list)
    """

    column_names = ['airline', 'f_num', 'origin', 'dest', 'date',
                    'land_act', 'arr_act', 'land_sch', 'arr_sch']
    combined_df = pd.DataFrame(columns=column_names)
    all_missed = []

    for flight_num, route_num in flight_list:
        print(f"\n\n<<<<< {flight_num} >>>>>")
        # the same frame is handed back in each time, so rows accumulate
        combined_df, missed = scrape_fn(days, start_date_str,
                                        flight_num, route_num,
                                        df=combined_df)
        all_missed.extend(missed)

    return combined_df, all_missed

In [211]:
def cleanup(url_list):
    """
    Takes a list of URLs that were missed the first time and returns
    successful queries as a df of the same format as the other
    functions, as well as a miss list. Also saves the df as a .pkl and
    the miss list as a .txt.
    
    THIS IS A QUICK PATCH!!! Too much copy-paste. Trying to modify
    scrape_fn() to accept either a list of URLS or a (days/fn/rn) 
    tuple instead, because they are mostly similar.
    ---
    IN: 
    url_list, list of url strings to search
    
    OUT: 
    pandas dataframe, list of URLs that failed again (list)
    """
    
    df = pd.DataFrame(columns=['airline',
                                   'f_num',
                                   'origin',
                                   'dest',
                                   'date',
                                   'land_act',
                                   'arr_act',
                                   'land_sch',
                                   'arr_sch'])

    # compiled once, outside the loop: captures the flight number and the
    # 8-digit date from a flight URL
    p = re.compile(r'(AAL\d{3,4}).*(\d{8})')
    miss_list = []

    # starts Selenium and sets timeout preferences
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(20)

        for flight_url in url_list:
            match = p.search(flight_url)
            if match is None:
                # a URL we cannot interpret: report it and hand it back
                # instead of crashing the whole run
                print(f"*** {flight_url}: ERROR: could not parse flight number/date")
                miss_list.append(flight_url)
                continue
            flight_num, date_str = match.group(1,2)
            search_date = datetime.strptime(date_str, "%Y%m%d").date()
            record_a = ['American', flight_num, 'LGA', 'ORD', search_date]
            
            try:
                driver.get(flight_url)
                flight_soup = BeautifulSoup(driver.page_source, 'html.parser')
                record_b = scrape_flight_soup(flight_soup, flight_num, search_date)
            except Exception as e:
                print(f"*** {flight_num}, {date_str}: ERROR: {e}")
                miss_list.append(flight_url)
                record_b = None
            
            # No "7 consecutive days of no flights" early exit here: the
            # URLs are unrelated flights/dates, so consecutive 'No Flight'
            # results say nothing about the rest of the list, and breaking
            # would silently drop the remaining URLs.
            if record_b is None:
                continue 
            elif record_b == 'Canceled':
                print(f"{flight_num}, {date_str}: canceled")        
            elif record_b == 'No Flight':
                print(f"{flight_num}, {date_str}: no flight")
            else:
                record = record_a + record_b
                print(f"{flight_num}, {date_str}: flight data recorded")
                df.loc[len(df)] = record    
    finally:
        # quit() rather than close(): ends the whole browser session, and
        # the finally guarantees that happens even if the loop raises
        driver.quit()
    
    # pickle the current round
    timestamp = datetime.strftime(datetime.now(), "%m%d_%H%M%S")
    picklepath = f'../data/cleanup_{timestamp}.pkl'
    with open(picklepath, 'wb') as picklefile:
        pickle.dump(df, picklefile)
    
    # save missed URLs as text file
    filepath = f'../data/cleanup_missed_{timestamp}.txt'
    with open(filepath, 'w') as f:
        for url in miss_list:
            f.write(url+'\n')    
    
    return df, miss_list

In [136]:
# All the LGA-ORD flights: one sample FlightAware history URL per flight.
# Each URL has the shape
#   .../live/flight/<flight number>/history/<YYYYMMDD>/<HHMM>Z/KLGA/KORD
# The flight number (AAL...) and the <HHMM>Z route token are pulled out of
# these strings by regex in a later cell.

flight_urls = [
"http://flightaware.com/live/flight/AAL321/history/20171003/0130Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL371/history/20171003/0030Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL359/history/20171002/2330Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL361/history/20171002/2230Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL383/history/20171002/2200Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL345/history/20171002/2130Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL343/history/20171002/2030Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL358/history/20171002/1930Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL187/history/20171002/1830Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL332/history/20171002/1730Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL390/history/20171002/1630Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL337/history/20171002/1530Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL328/history/20171002/1430Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL373/history/20171002/1330Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL1619/history/20171002/1230Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL289/history/20171002/1130Z/KLGA/KORD",
"http://flightaware.com/live/flight/AAL304/history/20171002/1030Z/KLGA/KORD"
]

In [None]:
# if I have time, I'll write a function to get this by scraping.
# but for now, the 'manual' way...

# Captures the flight number (group 1) and the route token (group 2) from
# a URL, e.g. ('AAL321', '0130Z'). Compiled once rather than per iteration.
route_pattern = re.compile(r'(AAL\d{3,4}).*(\d{4}Z)')

# (flight number, route number) tuples, in the same order as flight_urls
flights = []
for url in flight_urls:
    fn, rn = route_pattern.search(url).group(1,2)
    flights.append((fn,rn))

### Let the Scrapes Begin

In [173]:
# Most recent date to scrape, YYYYMMDD; the scrapes below work backwards from it.
start = '20171002'

In [132]:
# Round 1: flights[0:5], going back up to 365 days from `start`.
AAL_r1, missed_r1 = multiple_flights(365, start, flights[0:5])

*** AAL321 ***
AAL321, 20171002: flight data recorded
AAL321, 20171001: no flight
AAL321, 20170930: flight data recorded
*** AAL321, 20170929: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL321, 20170928: flight data recorded
AAL321, 20170927: flight data recorded
AAL321, 20170926: flight data recorded
AAL321, 20170925: flight data recorded
AAL321, 20170924: no flight
AAL321, 20170923: flight data recorded
AAL321, 20170922: flight data recorded
AAL321, 20170921: flight data recorded
AAL321, 20170920: flight data recorded
AAL321, 20170919: flight data recorded
AAL321, 20170918: canceled
AAL321, 20170917: no flight
AAL321, 20170916: flight data recorded
AAL321, 20170915: flight data recorded
AAL321, 20170914: flight data recorded
AAL321, 20170913: flight data recorded
AAL321, 20170912: flight data recorded
AAL321, 20170911: flight data recorded
AAL321,

AAL371, 20170530: flight data recorded
AAL371, 20170529: flight data recorded
AAL371, 20170528: no flight
AAL371, 20170527: flight data recorded
*** AAL371, 20170526: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL371, 20170525: flight data recorded
AAL371, 20170524: flight data recorded
*** AAL371, 20170523: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL371, 20170522: flight data recorded
AAL371, 20170521: flight data recorded
AAL371, 20170520: flight data recorded
AAL371, 20170519: flight data recorded
AAL371, 20170518: flight data recorded
AAL371, 20170517: flight data recorded
AAL371, 20170516: flight data recorded
AAL371, 20170515: flight data recorded
AAL371, 20170514: flight data recorded
*** AAL371, 201705

*** AAL359, 20170701: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL359, 20170630: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL359, 20170629: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL359, 20170628: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL359, 20170627: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS

AAL383, 20170922: flight data recorded
AAL383, 20170921: flight data recorded
AAL383, 20170920: flight data recorded
AAL383, 20170919: flight data recorded
AAL383, 20170918: flight data recorded
AAL383, 20170917: flight data recorded
AAL383, 20170916: no flight
*** AAL383, 20170915: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL383, 20170914: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL383, 20170913: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL383, 20170912: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 

In [161]:
# Round 2: flights[5:9], going back up to 365 days from `start`.
AAL_r2, missed_r2 = multiple_flights(365, start, flights[5:9])

*** AAL345 ***
AAL345, 20171002: flight data recorded
AAL345, 20171001: flight data recorded
AAL345, 20170930: no flight
AAL345, 20170929: flight data recorded
AAL345, 20170928: flight data recorded
AAL345, 20170927: flight data recorded
AAL345, 20170926: no flight
AAL345, 20170925: flight data recorded
AAL345, 20170924: flight data recorded
AAL345, 20170923: no flight
AAL345, 20170922: flight data recorded
AAL345, 20170921: flight data recorded
*** AAL345, 20170920: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL345, 20170919: no flight
AAL345, 20170918: flight data recorded
*** AAL345, 20170917: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL345, 20170916: no flight
AAL345, 20170915: flight data recorded
AAL345, 

AAL343, 20170919: flight data recorded
AAL343, 20170918: flight data recorded
*** AAL343, 20170917: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL343, 20170916: flight data recorded
AAL343, 20170915: flight data recorded
AAL343, 20170914: flight data recorded
AAL343, 20170913: flight data recorded
AAL343, 20170912: flight data recorded
AAL343, 20170911: flight data recorded
AAL343, 20170910: flight data recorded
AAL343, 20170909: flight data recorded
AAL343, 20170908: flight data recorded
AAL343, 20170907: flight data recorded
AAL343, 20170906: flight data recorded
AAL343, 20170905: canceled
AAL343, 20170904: flight data recorded
AAL343, 20170903: flight data recorded
AAL343, 20170902: flight data recorded
AAL343, 20170901: flight data recorded
*** AAL343, 20170831: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chrom

AAL343, 20170430: no flight
AAL343, 20170429: no flight
AAL343, 20170428: no flight
AAL343, 20170427: no flight
AAL343, 20170426: no flight
AAL343, 20170425: no flight
AAL343: 7 consecutive days of no flights as of 20170425!
*** AAL358 ***
AAL358, 20171002: flight data recorded
*** AAL358, 20171001: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL358, 20170930: no flight
AAL358, 20170929: flight data recorded
AAL358, 20170928: flight data recorded
*** AAL358, 20170927: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL358, 20170926: flight data recorded
AAL358, 20170925: flight data recorded
AAL358, 20170924: flight data recorded
AAL358, 20170923: no flight
AAL358, 20170922: flight data recorded
AAL358, 20170921: fligh

AAL358, 20170507: flight data recorded
AAL358, 20170506: no flight
AAL358, 20170505: flight data recorded
AAL358, 20170504: no flight
AAL358, 20170503: no flight
AAL358, 20170502: no flight
AAL358, 20170501: no flight
AAL358, 20170430: no flight
AAL358, 20170429: no flight
AAL358, 20170428: no flight
AAL358: 7 consecutive days of no flights as of 20170428!
*** AAL187 ***
*** AAL187, 20171002: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL187, 20171001: flight data recorded
AAL187, 20170930: flight data recorded
*** AAL187, 20170929: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL187, 20170928: flight data recorded
AAL187, 20170927: flight data recorded
AAL187, 20170926: flight data recorded
AAL187, 20170925: fligh

In [175]:
# Round 3: flights[9:14], going back up to 365 days from `start`.
AAL_r3, missed_r3 = multiple_flights(365, start, flights[9:14])



<<<<< AAL332 >>>>>
AAL332, 20171002: flight data recorded
AAL332, 20171001: flight data recorded
AAL332, 20170930: flight data recorded
AAL332, 20170929: flight data recorded
AAL332, 20170928: flight data recorded
AAL332, 20170927: flight data recorded
AAL332, 20170926: flight data recorded
AAL332, 20170925: flight data recorded
AAL332, 20170924: flight data recorded
AAL332, 20170923: flight data recorded
AAL332, 20170922: flight data recorded
AAL332, 20170921: flight data recorded
*** AAL332, 20170920: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL332, 20170919: flight data recorded
*** AAL332, 20170918: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL332, 20170917: flight data recorded
AAL332, 20170916: flight 

AAL390, 20170926: no flight
AAL390, 20170925: flight data recorded
AAL390, 20170924: flight data recorded
AAL390, 20170923: no flight
AAL390, 20170922: flight data recorded
AAL390, 20170921: flight data recorded
AAL390, 20170920: flight data recorded
*** AAL390, 20170919: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL390, 20170918: flight data recorded
AAL390, 20170917: flight data recorded
*** AAL390, 20170916: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL390, 20170915: flight data recorded
AAL390, 20170914: flight data recorded
AAL390, 20170913: flight data recorded
AAL390, 20170912: no flight
AAL390, 20170911: flight data recorded
AAL390, 20170910: flight data recorded
AAL390, 20170909: no flight
AAL390, 2017

AAL337, 20170919: flight data recorded
AAL337, 20170918: flight data recorded
AAL337, 20170917: flight data recorded
*** AAL337, 20170916: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL337, 20170915: flight data recorded
AAL337, 20170914: flight data recorded
AAL337, 20170913: flight data recorded
AAL337, 20170912: flight data recorded
AAL337, 20170911: flight data recorded
AAL337, 20170910: flight data recorded
AAL337, 20170909: flight data recorded
AAL337, 20170908: flight data recorded
AAL337, 20170907: flight data recorded
AAL337, 20170906: flight data recorded
AAL337, 20170905: flight data recorded
AAL337, 20170904: flight data recorded
AAL337, 20170903: flight data recorded
AAL337, 20170902: flight data recorded
AAL337, 20170901: flight data recorded
AAL337, 20170831: flight data recorded
*** AAL337, 20170830: ERROR: Message: timeout
  (Sessio

AAL328, 20170930: flight data recorded
*** AAL328, 20170929: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL328, 20170928: flight data recorded
*** AAL328, 20170927: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL328, 20170926: flight data recorded
AAL328, 20170925: flight data recorded
AAL328, 20170924: flight data recorded
AAL328, 20170923: flight data recorded
*** AAL328, 20170922: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL328, 20170921: flight data recorded
*** AAL328, 20170920: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2

AAL328, 20170408: no flight
AAL328, 20170407: flight data recorded
AAL328, 20170406: flight data recorded
AAL328, 20170405: flight data recorded
AAL328, 20170404: flight data recorded
AAL328, 20170403: flight data recorded
AAL328, 20170402: flight data recorded
AAL328, 20170401: no flight
AAL328, 20170331: flight data recorded
AAL328, 20170330: flight data recorded
AAL328, 20170329: flight data recorded
AAL328, 20170328: flight data recorded
AAL328, 20170327: flight data recorded
AAL328, 20170326: no flight
AAL328, 20170325: no flight
AAL328, 20170324: flight data recorded
AAL328, 20170323: flight data recorded
AAL328, 20170322: no flight
AAL328, 20170321: no flight
AAL328, 20170320: flight data recorded
AAL328, 20170319: flight data recorded
AAL328, 20170318: no flight
AAL328, 20170317: flight data recorded
AAL328, 20170316: flight data recorded
AAL328, 20170315: no flight
AAL328, 20170314: no flight
AAL328, 20170313: flight data recorded
AAL328, 20170312: flight data recorded
AAL328,

AAL373, 20170515: flight data recorded
AAL373, 20170514: flight data recorded
AAL373, 20170513: flight data recorded
AAL373, 20170512: flight data recorded
AAL373, 20170511: flight data recorded
AAL373, 20170510: flight data recorded
AAL373, 20170509: flight data recorded
AAL373, 20170508: flight data recorded
AAL373, 20170507: flight data recorded
AAL373, 20170506: flight data recorded
AAL373, 20170505: flight data recorded
AAL373, 20170504: no flight
AAL373, 20170503: no flight
AAL373, 20170502: flight data recorded
AAL373, 20170501: no flight
AAL373, 20170430: no flight
AAL373, 20170429: no flight
AAL373, 20170428: no flight
AAL373, 20170427: no flight
*** AAL373, 20170426: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL373, 20170425: no flight
AAL373, 20170424: flight data recorded
AAL373, 20170423: no flight
*** AAL373, 20170422: ERROR: Message:

In [184]:
# Round 4: the remaining flights (index 14 onward), same 365-day window.
AAL_r4, missed_r4 = multiple_flights(365, start, flights[14:])



<<<<< AAL1619 >>>>>
AAL1619, 20171002: flight data recorded
*** AAL1619, 20171001: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL1619, 20170930: flight data recorded
AAL1619, 20170929: flight data recorded
AAL1619, 20170928: flight data recorded
AAL1619, 20170927: flight data recorded
AAL1619, 20170926: flight data recorded
*** AAL1619, 20170925: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL1619, 20170924: flight data recorded
AAL1619, 20170923: flight data recorded
AAL1619, 20170922: flight data recorded
AAL1619, 20170921: flight data recorded
AAL1619, 20170920: flight data recorded
AAL1619, 20170919: flight data recorded
AAL1619, 20170918: flight data recorded
AAL1619, 20170917: flight data recorded
AAL1619,

AAL289, 20170925: flight data recorded
AAL289, 20170924: flight data recorded
*** AAL289, 20170923: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL289, 20170922: flight data recorded
AAL289, 20170921: flight data recorded
*** AAL289, 20170920: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL289, 20170919: flight data recorded
AAL289, 20170918: flight data recorded
AAL289, 20170917: flight data recorded
AAL289, 20170916: flight data recorded
AAL289, 20170915: flight data recorded
AAL289, 20170914: flight data recorded
AAL289, 20170913: flight data recorded
AAL289, 20170912: flight data recorded
AAL289, 20170911: flight data recorded
AAL289, 20170910: flight data recorded
AAL289, 20170909: flight data recorded
*** AAL

*** AAL304, 20170926: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

*** AAL304, 20170925: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL304, 20170924: no flight
AAL304, 20170923: no flight
AAL304, 20170922: flight data recorded
AAL304, 20170921: flight data recorded
AAL304, 20170920: flight data recorded
AAL304, 20170919: flight data recorded
*** AAL304, 20170918: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL304, 20170917: no flight
AAL304, 20170916: no flight
AAL304, 20170915: flight data recorded
AAL304, 20170914: flight data recorded
AAL304, 20170913: flight data recorded
AAL

In [185]:
# Stack the four rounds into one frame (fresh 0..n-1 index) and gather every
# URL that failed during any round so it can be retried.
AAL = pd.concat([AAL_r1, AAL_r2, AAL_r3, AAL_r4], ignore_index=True)
AAL_missed = missed_r1 + missed_r2 + missed_r3 + missed_r4

In [189]:
# Number of URLs that still need a retry after the first pass.
len(AAL_missed)

211

### Now That I Have It All...

In [190]:
# Checkpoint the first pass: the combined frame as a pickle, and the missed
# URLs as a plain text file with one URL per line.
picklepath = f'../data/AAL_pass1.pkl'
with open(picklepath, 'wb') as pkl_out:
    pickle.dump(AAL, pkl_out)

filepath = f'../data/AAL_missed1.txt'
with open(filepath, 'w') as missed_out:
    missed_out.writelines(url + '\n' for url in AAL_missed)

### And the Cleanup

In [194]:
# Retry pass 1 over every URL missed above; URLs that fail again come back
# in `missed_again`.
AAL_stragglers1, missed_again = cleanup(AAL_missed)

AAL321, 20170929: flight data recorded
AAL321, 20170907: flight data recorded
AAL321, 20170821: no flight
*** AAL371, 20170928: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL371, 20170820: no flight
AAL371, 20170724: flight data recorded
AAL371, 20170709: no flight
*** AAL371, 20170708: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL371, 20170625: no flight
AAL371, 20170605: flight data recorded
AAL371, 20170531: flight data recorded
AAL371, 20170526: flight data recorded
AAL371, 20170523: flight data recorded
AAL371, 20170513: flight data recorded
AAL371, 20170512: flight data recorded
*** AAL371, 20170506: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb

AAL328, 20170922: flight data recorded
AAL328, 20170920: flight data recorded
AAL328, 20170903: flight data recorded
AAL328, 20170827: flight data recorded
AAL328, 20170812: no flight
AAL328, 20170803: flight data recorded
AAL328, 20170628: flight data recorded
AAL328, 20170524: flight data recorded
AAL328, 20170409: flight data recorded
AAL373, 20170926: flight data recorded
AAL373, 20170919: flight data recorded
AAL373, 20170911: flight data recorded
AAL373, 20170824: flight data recorded
AAL373, 20170713: flight data recorded
AAL373, 20170706: flight data recorded
AAL373, 20170705: flight data recorded
AAL373, 20170630: flight data recorded
AAL373, 20170615: flight data recorded
AAL373, 20170601: flight data recorded
AAL373, 20170426: no flight
AAL373, 20170422: no flight
AAL373, 20170414: no flight
AAL373, 20170413: no flight
AAL1619, 20171001: flight data recorded
AAL1619, 20170925: flight data recorded
AAL1619, 20170910: flight data recorded
AAL1619, 20170725: flight data recorde

In [198]:
# Retry pass 2 over the URLs that failed again in pass 1.
AAL_stragglers2, missed_again2 = cleanup(missed_again)

AAL371, 20170928: flight data recorded
AAL371, 20170708: flight data recorded
AAL371, 20170506: flight data recorded
AAL359, 20170821: flight data recorded
*** AAL359, 20170717: ERROR: Message: timeout
  (Session info: chrome=61.0.3163.100)
  (Driver info: chromedriver=2.32.498537 (cb2f855cbc7b82e20387eaf9a43f6b99b6105061),platform=Mac OS X 10.12.6 x86_64)

AAL359, 20170709: flight data recorded
AAL359, 20170706: flight data recorded
AAL359, 20170702: flight data recorded
AAL359, 20170630: canceled
AAL359, 20170616: canceled
AAL345, 20170917: flight data recorded
AAL343, 20170810: flight data recorded
AAL337, 20170830: flight data recorded
AAL289, 20170620: flight data recorded


In [199]:
# Retry pass 3 over what is left. The returned miss list is bound to `foo`
# and is not used again in the cells shown here.
AAL_stragglers3, foo = cleanup(missed_again2)

AAL359, 20170717: flight data recorded


In [202]:
# Fold the three retry passes into the main frame.
# NOTE: AAL is rebuilt from itself, so re-running this cell appends the
# straggler rows a second time; run it once per kernel session.
AAL = pd.concat([AAL, AAL_stragglers1, AAL_stragglers2, AAL_stragglers3], ignore_index=True)

### Most Importantly... Pickled!

In [208]:
# Persist the complete dataset (first pass plus all retry passes).
picklepath = f'../data/AAL_complete.pkl'
with open(picklepath, mode='wb') as pkl_out:
    pickle.dump(AAL, pkl_out)

In [210]:
# Summary of the final frame; the `count` row is the number of recorded flights.
AAL.describe()

Unnamed: 0,airline,f_num,origin,dest,date,land_act,arr_act,land_sch,arr_sch
count,1928,1928,1928,1928,1928,1928,1928,1928,1928
unique,1,17,1,1,193,1928,1925,1927,1928
top,American,AAL328,LGA,ORD,2017-09-27,2017-09-28 11:56:00,2017-09-15 20:20:00,2017-08-15 18:16:00,2017-07-18 18:26:00
freq,1928,158,1928,1928,17,1,2,2,1
first,,,,,,2017-03-12 11:55:00,2017-03-12 12:08:00,2017-03-12 12:18:00,2017-03-12 12:28:00
last,,,,,,2017-10-02 22:35:00,2017-10-02 22:40:00,2017-10-02 22:52:00,2017-10-02 23:09:00
