# Scrape flight information from [flightradar24.com](https://www.flightradar24.com)

Flight data was scraped for the airports near outbreak locations. However, this data was not incorporated into the model due to time constraints.

In [33]:
import pandas as pd
import numpy as np
import requests

import re
import time
from StringIO import StringIO

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

from csv_pkl_sql import save_it

In [34]:
# Load the airport table pickled by an earlier step and preview one row.
# NOTE(review): read_pickle executes arbitrary code on load; only use with trusted files.
airports = pd.read_pickle('../pkl/02_airport_information_fallingrain.pkl')
airports.head(1)

Unnamed: 0,city,FAA,IATA,ICAO,kind,latitude,longitude,max_runway,name,country,state
56,BAHIA BLANCA,,BHI,SAZB,Medium,-38.725,-62.169,8579.0,COMANDANTE ESPORA,Argentina,


## Country URLs
First, get the URL of each country's airport listing page.

In [35]:
# Each Chrome instance is routed through its own local SOCKS5 proxy. The
# proxies are SSH dynamic port forwards that must already be running, e.g.:
# ! ssh -ND 8081 server1
# ! ssh -ND 8082 server2
# ! ssh -ND 8083 server3
# ! ssh -ND 8084 server4

port_list = [8081, 8082, 8083, 8084]

# Location of the chromedriver binary (machine specific).
chromedriver_path = '/Volumes/Files/homebrew/bin/chromedriver'


def _proxy_options(port):
    """Return Chrome options that send traffic through the SOCKS5 proxy on `port`.

    `Options.add_argument` mutates the options object and returns None, so the
    object has to be created first and returned explicitly; chaining the call
    onto `Options()` would hand `None` to the driver and silently drop the proxy.
    """
    options = Options()
    options.add_argument("--proxy-server=socks5://127.0.0.1:" + str(port))
    return options


chrome_options1, chrome_options2, chrome_options3, chrome_options4 = [
    _proxy_options(port) for port in port_list
]

driver1 = webdriver.Chrome(chromedriver_path, chrome_options=chrome_options1)
driver2 = webdriver.Chrome(chromedriver_path, chrome_options=chrome_options2)
driver3 = webdriver.Chrome(chromedriver_path, chrome_options=chrome_options3)
driver4 = webdriver.Chrome(chromedriver_path, chrome_options=chrome_options4)

driver_list = [driver1, driver2, driver3, driver4]

In [36]:
def get_table_rows(url, text=True, driver=driver1):
    """Load `url` and return the links found in the page's first <tbody>.

    Parameters
    ----------
    url : str
        Page to load with the selenium driver.
    text : bool
        If True, label each link with its stripped anchor text; otherwise
        label it with the anchor's `title` attribute.
    driver : selenium webdriver
        Browser used to fetch the page.

    Returns
    -------
    list of (label, href) tuples, one per anchor that has an `href` and whose
    title is not 'Scroll to top'.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    links = []
    for anchor in soup.find('tbody').find_all('a'):
        # Skip anchors without a target and the page's "Scroll to top" helper link.
        if not anchor.has_attr('href'):
            continue
        if anchor.get('title') == 'Scroll to top':
            continue

        if text:
            label = anchor.text.strip()
        else:
            label = anchor.get('title')
        links.append((label, anchor.get('href')))

    return links

In [37]:
# Scrape the airport index page; text=False labels each link with its `title`
# attribute, which is stored in the `country` column. Duplicate (country, url)
# pairs are dropped.
countries = get_table_rows('https://www.flightradar24.com/data/airports', False)
countries_df = pd.DataFrame(countries, columns=['country','url']).drop_duplicates()
countries_df.head(1)

Unnamed: 0,country,url
0,Afghanistan,https://www.flightradar24.com/data/airports/af...


## Airport URLs
Get the URL of each airport within the countries that are present in the airport data.

In [38]:
# Sanity check: number of distinct countries in `airports` vs. number of scraped
# country rows whose name appears in `airports`.
airports.country.nunique(), countries_df.country.isin(airports.country.unique()).sum()

(12, 12)

In [39]:
# Keep only the scraped countries that also occur in `airports`.
# NOTE: this overwrites `countries_df`, so the unfiltered frame is only
# recoverable by re-running the scraping cell.
mask = countries_df.country.isin(airports.country.unique())
countries_df = countries_df.loc[mask]

In [40]:
# Scrape the airport listing of every remaining country; one frame per country.
airport_flight_list = []
for _, country_row in countries_df.iterrows():
    country_airports = pd.DataFrame(get_table_rows(country_row.url),
                                    columns=['airport','url'])
    country_airports['country'] = country_row.country
    airport_flight_list.append(country_airports)
    # Pause between requests.
    time.sleep(1)

In [41]:
# Stack the per-country frames into one table with a fresh 0..n-1 index.
airport_flight_df = pd.concat(airport_flight_list).reset_index(drop=True)

In [42]:
# Split labels of the form "<name> (<IATA>/<ICAO>)" into the two codes and the
# bare name. Labels whose codes are not purely upper-case letters get NaN codes.
# NOTE: the second statement strips the codes from `airport`, so re-running
# this cell on its own output yields NaN codes.
airport_flight_df[['IATA','ICAO']] = airport_flight_df.airport.str.extract(r"""\(([A-Z]+)\/([A-Z]+)\)""",expand=True)
# re.sub is used instead of Series.str.replace because the default of the
# latter's `regex` argument differs between pandas versions; with a literal
# match the parenthesised codes would silently stay in the name.
airport_flight_df['airport'] = airport_flight_df.airport.map(
    lambda label: re.sub(r""" \(.+\)""", '', label)).str.strip()

In [43]:
# Preview the scraped airport list with the extracted codes.
airport_flight_df.head(1)

Unnamed: 0,airport,url,country,IATA,ICAO
0,Bahia Blanca Comandante Espora Airport,https://www.flightradar24.com/data/airports/bhi,Argentina,BHI,SAZB


In [44]:
# Attach name/kind from `airports` to each scraped airport URL.
# how='right' keeps every scraped airport, including those with no match in
# `airports` (their `name` and `kind` are then missing).
airport_flight_url = pd.merge(airports[['country','name','kind','IATA','ICAO']],
         airport_flight_df[['country','IATA','ICAO','url']],
         on=['country','IATA','ICAO'],
         how='right')

In [45]:
# Row count and number of rows without a URL.
# NOTE(review): `url` comes from the right-hand frame of a right join, so this
# count only reflects nulls already present in the scraped data; checking
# `name` would show how many scraped airports failed to match — confirm intent.
airport_flight_url.shape[0], airport_flight_url.url.isnull().sum()

(840, 0)

In [46]:
# Preview the merged airport/URL table.
airport_flight_url.head(1)

Unnamed: 0,country,name,kind,IATA,ICAO,url
0,Argentina,COMANDANTE ESPORA,Medium,BHI,SAZB,https://www.flightradar24.com/data/airports/bhi


In [47]:
# Size of the merged table before filtering.
airport_flight_url.shape

(840, 6)

In [48]:
# Keep scraped airports whose IATA or ICAO code occurs anywhere in `airports`.
# The two codes are tested independently and country is not part of the test.
# NOTE(review): `isin` may treat a missing code as matching another missing
# code — confirm that rows without codes are not kept unintentionally.
# NOTE: this reuses `mask` and overwrites `airport_flight_url`.
mask = (airport_flight_url.IATA.isin(airports.IATA) | airport_flight_url.ICAO.isin(airports.ICAO))
airport_flight_url = airport_flight_url.loc[mask]
airport_flight_url.shape

(648, 6)

## Flight data for airports

Now get flight schedules for each of the airports.

In [58]:
def get_arrivals(url, country, driver_list=driver_list):
    """Scrape the arrivals table of one flightradar24 airport page.

    Parameters
    ----------
    url : str
        Airport page URL. '#arrivals' is appended before loading, and the last
        '/'-separated segment, upper-cased, is stored as `code_dest`.
    country : object
        Value stored in the `country_dest` column of every returned row.
    driver_list : list of selenium webdrivers
        One driver is picked at random for each call.

    Returns
    -------
    pandas.DataFrame
        Columns: date, city_dept, flight, time_dept, time_dest, aircraft,
        code_dept, country_dest, code_dest. If anything fails while loading or
        parsing the page, a single all-NaN row with the same columns is
        returned instead of raising.
    """
    scraped_columns = ['date', 'city_dept', 'flight',
                       'time_dept', 'time_dest', 'aircraft']
    all_columns = scraped_columns + ['code_dept', 'country_dest', 'code_dest']
    code_pattern = r"""\(([A-Z]+)\)"""

    driver = np.random.choice(driver_list)

    try:
        driver.get(url+'#arrivals')
        time.sleep(1.5)
        # Click the button in the second header row of the arrivals table,
        # then wait for the page to update.
        driver.find_element_by_xpath('//*[@id="tblFlightsArrivalsAll"]/thead/tr[2]/td/button').click()
        time.sleep(1.5)

        # Read the page from the driver that actually loaded it.
        soup = BeautifulSoup(driver.page_source, 'lxml')

        attr_list = list()
        date = None  # text of the most recent date separator row

        for table_row in soup.find_all('tr'):
            row_classes = table_row.attrs.get('class')
            if not row_classes:
                continue
            row_class = row_classes[0]

            if row_class == 'row-date-separator':
                date = table_row.text

            elif row_class == 'ng-scope':
                cells = table_row.find_all('td')
                # Flight rows seen before the first date separator are skipped,
                # as are rows too short to hold cell index 6.
                if date is None or len(cells) < 7:
                    continue
                picked = [cells[2], cells[1], cells[0], cells[6], cells[4]]
                values = [re.sub(r"""[-\t\n]*""", '', cell.text) for cell in picked]
                attr_list.append([date] + values)

        flight_df = pd.DataFrame(attr_list, columns=scraped_columns)

        flight_df['code_dept'] = flight_df.city_dept.str.extract(code_pattern, expand=False)
        # re.sub instead of Series.str.replace: the default of the latter's
        # `regex` argument differs between pandas versions.
        flight_df['city_dept'] = flight_df.city_dept.map(
            lambda city: re.sub(code_pattern, '', city))
        flight_df['country_dest'] = country
        flight_df['code_dest'] = url.split('/')[-1].upper()
    except Exception:
        # Best effort: one failing airport must not abort the whole scrape.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        flight_df = pd.DataFrame([[np.nan] * len(all_columns)], columns=all_columns)

    return flight_df

In [59]:
# Scrape arrivals for every remaining airport; one frame per airport.
# NOTE: this reuses the name `airport_flight_list` from the airport-URL step.
airport_flight_list = []

for _, airport in airport_flight_url.iterrows():
    arrivals = get_arrivals(airport.url, airport.country)
    airport_flight_list.append(arrivals)

In [75]:
# NOTE(review): the execution counts show this cell ran after the following
# cell (In [75] vs In [76]), which is where the flight table is built. On a
# fresh top-to-bottom run `airport_flight_df` still holds the airport-URL
# table at this point, so the displayed columns will differ.
airport_flight_df.head(1)

Unnamed: 0,date,flight,aircraft,time_dept,city_dept,IATA_dept,time_dest,country_dest,IATA_dest
0,"Sunday, Jul 31",AZ680,B772(EIDBL),6:40 AM,Rome,FCO,Landed7:07 AM,Argentina,EZE


In [76]:
# Stack the per-airport frames, fix the column order, and rename the code
# columns. NOTE: this reuses the name `airport_flight_df`; the index is not
# reset, so index labels repeat across airports.
airport_flight_df = (
    pd.concat(airport_flight_list)[['date', 'flight', 'aircraft',
                                    'time_dept', 'city_dept', 'code_dept',
                                    'time_dest', 'country_dest', 'code_dest']]
    .rename(columns={'code_dept':'IATA_dept', 'code_dest':'IATA_dest'})
)

In [83]:
# The scraped day labels carry no year, so each label seen during the scrape
# is mapped to a full date by hand.
date_mapper = {'Saturday, Jul 30' : '07/30/2016',
               'Sunday, Jul 31'   : '07/31/2016',  
               'Monday, Aug 01'   : '08/01/2016'}

# Series.map leaves labels that are missing from `date_mapper` (including NaN)
# as NaN instead of raising KeyError, so those rows end up as NaT below.
# astype(object) keeps the string concatenation valid even if no label matched.
airport_flight_df['date_time'] = (airport_flight_df.date.map(date_mapper).astype(object) + ' ' + 
                                  airport_flight_df.time_dept)

airport_flight_df['date_time'] = pd.to_datetime(airport_flight_df.date_time)

In [86]:
# Preview the flight table with the parsed departure timestamp.
airport_flight_df.head(1)

Unnamed: 0,date,flight,aircraft,time_dept,city_dept,IATA_dept,time_dest,country_dest,IATA_dest,date_time
0,"Sunday, Jul 31",AZ680,B772(EIDBL),6:40 AM,Rome,FCO,Landed7:07 AM,Argentina,EZE,2016-07-31 06:40:00


In [94]:
# Persist the flight table via the project helper imported from csv_pkl_sql.
# NOTE(review): presumably writes the frame under the base name
# '08_flight_data' — confirm formats and location in csv_pkl_sql.
save_it(airport_flight_df, '08_flight_data')