# This notebook uses Selenium to scrape data from transtats.bts.gov and combines it into a single CSV file. All US domestic flights between January 2017 and July 2018 are collected.

### Note: if you want to use this, you will need to change the paths in the chromedriver cell as well as those in the move_flight_data() cell.

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import zipfile
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import time

In [2]:
# Set up the Chrome webdriver.

# Path to the chromedriver executable on the original author's machine;
# change it to match your own installation.
chromedriver = "/Users/dcotes/documents/chromedriver/chromedriver" 
# Export the same path under the "webdriver.chrome.driver" environment key.
# NOTE(review): nothing in this file reads this variable back -- confirm it
# is still needed.
os.environ["webdriver.chrome.driver"] = chromedriver
# Create the Chrome webdriver, passing the driver path positionally. `driver`
# is the module-level browser handle that scrape_flight_data() uses.
driver = webdriver.Chrome(chromedriver)


In [None]:
# Download all relevant flight data from the Transtats website

def scrape_flight_data():
    """Download one Transtats export per month, January 2017 through July 2018.

    Drives the module-level ``driver``: opens the Transtats download form,
    clicks the element for each wanted field, then for every (month, year)
    pair sets the year and month dropdowns, clicks "Download" and sleeps for
    60 seconds before moving on to the next pair.
    """
    # Open the Transtats download form
    driver.get("https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236")

    # Click the form element whose title attribute matches each wanted field
    wanted_fields = (
        'Year', 'DayOfWeek', 'FlightDate', 'Reporting_Airline', 'Tail_Number',
        'Flight_Number_Reporting_Airline', 'Origin', 'Dest', 'CRSDepTime',
        'DepTime', 'DepDelayMinutes', 'CRSArrTime', 'ArrTime',
        'ArrDelayMinutes',
    )
    for field in wanted_fields:
        field_element = driver.find_element_by_xpath(
            '//*[@title="{}"]'.format(field))
        field_element.click()

    # Build the (month, year) pairs: all twelve months of 2017, then
    # January through July of 2018
    month_names = ['January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November',
                   'December']
    periods = [(month, '2017') for month in month_names]
    periods += [(month, '2018') for month in month_names[:7]]

    for month, year in periods:
        year_dropdown = Select(driver.find_element_by_id("XYEAR"))
        year_dropdown.select_by_value(year)
        month_dropdown = Select(driver.find_element_by_id("FREQUENCY"))
        month_dropdown.select_by_visible_text(month)
        driver.find_element_by_name('Download').click()
        # Fixed pause between downloads. Per the original author's note it is
        # slow, but without it the same file gets downloaded many times.
        time.sleep(60)


# Run the scrape now. Each of the 19 month/year downloads is followed by a
# 60-second sleep, so this call takes roughly 19 minutes.
scrape_flight_data()

In [None]:
# Move downloaded data into project folder for further processing
def move_flight_data(src_dir='/Users/dcotes/downloads',
                     dest_dir="/Users/dcotes/Documents/metis/Projects/Luther/flights/"):
    """Move every ``.csv`` file from ``src_dir`` into ``dest_dir``.

    Parameters
    ----------
    src_dir : str
        Folder to scan. Only its direct entries are considered. Defaults to
        the original author's downloads folder.
    dest_dir : str
        Folder that receives the files. It is created if it does not exist.
        Defaults to the original author's project ``flights`` folder.

    Raises
    ------
    FileNotFoundError
        If ``src_dir`` does not exist.
    """
    # Local import: shutil is not among this file's top-level imports.
    import shutil

    os.makedirs(dest_dir, exist_ok=True)

    for name in os.listdir(src_dir):
        src_path = os.path.join(src_dir, name)
        # Require a real ".csv" extension (a bare 'csv' suffix would also
        # match names such as "notcsv") and skip anything that is not a file.
        if not name.endswith('.csv') or not os.path.isfile(src_path):
            continue
        # shutil.move falls back to copy-and-delete when the two folders are
        # on different filesystems, where os.rename would raise.
        shutil.move(src_path, os.path.join(dest_dir, name))

# Run the move now, using the paths hard-coded in move_flight_data().
move_flight_data()


In [None]:
# TODO: Programmatically extract zip files -> csv

# def extract_flight_data():
#     os.chdir('flights')
#     for file in os.listdir():
#         file_name = os.path.abspath(file) # get full path of files
#         zip_ref = zipfile.ZipFile(file_name) # create zipfile object
#         zip_ref.extractall(os.getcwd()) # extract file to dir
#         zip_ref.close() # close file
#         os.remove(file_name)
    

In [1]:
# Combine all data into one csv file for processing
def combine_data(data_dir='flights/', output_path="all_flights.csv"):
    """Concatenate every data file in ``data_dir`` into one CSV file.

    Parameters
    ----------
    data_dir : str
        Folder holding the per-month files. Hidden entries (names starting
        with ".", e.g. macOS ``.DS_Store``) and subdirectories are skipped.
    output_path : str
        Where the combined CSV is written, without the index column.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no readable data files.
    """
    # Sort the names so the row order of the output is reproducible;
    # os.listdir returns entries in arbitrary order.
    file_names = sorted(
        name for name in os.listdir(data_dir)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(data_dir, name))
    )
    if not file_names:
        raise ValueError("no data files found in {!r}".format(data_dir))

    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in file_names]
    combined_csv = pd.concat(frames, sort=True)
    combined_csv.to_csv(output_path, index=False)

    

In [5]:
# Build all_flights.csv from the files in flights/ (both paths are relative
# to the current working directory).
combine_data()