# Getting the weather data of the airports

Using the table at http://www.flugzeuginfo.net/table_airportcodes_country-location_en.php
a csv with the IATA and ICAO codes of the airports can be created.

Using the ICAO codes the weather data can be retrieved from this website: https://www.wunderground.com/history/daily/de/frankfurt/EDDF/date/2015-3-18

Given a list with the destination airport codes and the schedules arrival time, a list of the corresponding weather conditions (wind speed, visibility) can be web scraped.

This list of weater conditions can then be incorporated into the dataframe with the delays and be subsequently used in a predictive model.

In [282]:
from datetime import datetime

import pandas as pd
import numpy as np

In [264]:
# The airport and date and time of departure
df_dep = pd.read_csv("data/sanitized_Train_data.csv", usecols=[3, 5], parse_dates=[1])
df_dep.columns = ['IATA', 'DATE_TIME']

In [265]:
df_dep.head()

Unnamed: 0,IATA,DATE_TIME
0,CMN,2016-01-03 10:30:00
1,MXP,2016-01-13 15:05:00
2,TUN,2016-01-16 04:10:00
3,DJE,2016-01-17 14:10:00
4,TUN,2016-01-17 14:30:00


In [266]:
# The airport and date and time of departure
df_dest = pd.read_csv("data/sanitized_Train_data.csv", usecols=[4, 6], parse_dates=[1])
df_dest.columns = ['IATA', 'DATE_TIME']

In [267]:
df_dest.head()

Unnamed: 0,IATA,DATE_TIME
0,TUN,2016-01-03 12:55:00
1,TUN,2016-01-13 16:55:00
2,IST,2016-01-16 06:45:00
3,NTE,2016-01-17 17:00:00
4,ALG,2016-01-17 15:50:00


In [268]:
df = pd.DataFrame()
df = pd.concat([df_dep, df_dest], axis=0)
df

Unnamed: 0,IATA,DATE_TIME
0,CMN,2016-01-03 10:30:00
1,MXP,2016-01-13 15:05:00
2,TUN,2016-01-16 04:10:00
3,DJE,2016-01-17 14:10:00
4,TUN,2016-01-17 14:30:00
...,...,...
107828,TUN,2018-07-06 02:00:00
107829,TUN,2018-01-13 09:00:00
107830,TUN,2018-11-07 12:50:00
107831,DJE,2018-01-23 18:45:00


In [269]:
df_airport_codes = pd.read_csv('data/list_IATA_ICAO_codes.csv', usecols=[0, 1])

In [270]:
code = df_airport_codes[df_airport_codes['IATA'] == 'CMN']['ICAO']
code = code.iloc[0]
type(code)

str

In [271]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215666 entries, 0 to 107832
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   IATA       215666 non-null  object        
 1   DATE_TIME  215666 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 4.9+ MB


In [272]:
df = df.join(df_airport_codes.set_index('IATA'), on='IATA', how='left', lsuffix='_left', rsuffix='_right')

In [273]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215666 entries, 0 to 107832
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   IATA       215666 non-null  object        
 1   DATE_TIME  215666 non-null  datetime64[ns]
 2   ICAO       215666 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 6.6+ MB


In [275]:
df[df['ICAO'].isna()]# ['IATA'].unique()

Unnamed: 0,IATA,DATE_TIME,ICAO


In [276]:
df["ICAO"].unique()

array(['GMMN', 'LIMC', 'DTTA', 'DTTJ', 'LFBO', 'LTBA', 'LFPO', 'DTMB',
       'EBBR', 'DIAP', 'LIPZ', 'EHAM', 'EDDF', 'LEBL', 'OEJN', 'DAAG',
       'LPPT', 'LFST', 'LFLL', 'DFFD', 'EGKK', 'OLBA', 'LFMN', 'LPPR',
       'LFML', 'EDDL', 'DTTX', 'LIRF', 'LFPG', 'GQNN', 'LFRS', 'LSZH',
       'LSGG', 'GMFO', 'EDDM', 'EDDB', 'EDDH', 'GMMW', 'DTNH', 'HECA',
       'LYBE', 'LOWW', 'DAOO', 'LEMD', 'DTTZ', 'GABS', 'GOOY', 'HRYR',
       'LIPE', 'LMML', 'GMTA', 'EGLL', 'LFBD', 'LKPR', 'LJLJ', 'UUEE',
       'OEMA', 'LHBP', 'ESSA', 'EKCH', 'EBCI', 'EBOS', 'GMTT', 'DTTF',
       'LIRN', 'LKTB', 'LKMT', 'CYUL', 'DRRN', 'DAAT', 'HFFF', 'EDDK',
       'ORER', 'DTTG', 'DAAE', 'LTAI', 'GMMX', 'DXXX', 'LFQQ', 'LICJ',
       'FZQA', 'DTKA', 'LIRP', 'HSSS', 'LFLS', 'DABC', 'FOON', 'EYVI',
       'LTAC', 'FOOL', 'GUCY', 'ULLI', 'LZKZ', 'LZIB', 'OJAI', 'LROP',
       'LIMF', 'UKKK', 'HEBA', 'LIEE', 'UKBB', 'LGAV', 'LGTS', 'LTBJ',
       'GOBD', 'OTBD', 'DBBB', 'ELLX', 'FZAA', 'LPBJ', 'BIKF', 'DTTR',
      

In [277]:
df['DATE'] = df['DATE_TIME'].dt.date

In [278]:
#df.columns = ['IATA', 'DATE_TIME', 'ICAO', '', '', '', '', '']
# Adding columns for weather data
df['TIME_OBSERVATION'] = np.nan
df['TEMP'] = np.nan
df['WIND_SPEED'] = np.nan
df['PRECIP'] = np.nan
df['CONDITION'] = np.nan

In [279]:
df

Unnamed: 0,IATA,DATE_TIME,ICAO,DATE,TIME_OBSERVATION,TEMP,WIND_SPEED,PRECIP,CONDITION
0,CMN,2016-01-03 10:30:00,GMMN,2016-01-03,,,,,
1,MXP,2016-01-13 15:05:00,LIMC,2016-01-13,,,,,
2,TUN,2016-01-16 04:10:00,DTTA,2016-01-16,,,,,
3,DJE,2016-01-17 14:10:00,DTTJ,2016-01-17,,,,,
4,TUN,2016-01-17 14:30:00,DTTA,2016-01-17,,,,,
...,...,...,...,...,...,...,...,...,...
107828,TUN,2018-07-06 02:00:00,DTTA,2018-07-06,,,,,
107829,TUN,2018-01-13 09:00:00,DTTA,2018-01-13,,,,,
107830,TUN,2018-11-07 12:50:00,DTTA,2018-11-07,,,,,
107831,DJE,2018-01-23 18:45:00,DTTJ,2018-01-23,,,,,


### Now the weather data have to be retrieved from the website using some web scraper

This will be done as follows:
1. Start with the first row in ```df```
1. Retrieve the weather data (unless they are already downloaded) and save them in weather_df
1. Find the weather observation closest to the given time
1. Add the weather observation to df
1. Continue with the next line in step 2. End when all lines have been processed.

In [280]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [283]:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
weather_obs = pd.DataFrame(columns=['DATE_TIME', 'ICAO', 'TEMP', 'WIND_SPEED', 'PRECIP', 'CONDITION'])

count = 0

driver = webdriver.Chrome(executable_path='/home/fklein/zindi/Flight_Delay_Prediction_Challenge/chromedriver')  # Optional argument, if not specified will search path.


# search_box = driver.find_element_by_name('')
# <table _ngcontent-app-root-c202="" mat-table="" aria-labelledby="History observation" matsort="" aria-label="table of contents" class="mat-table cdk-table mat-sort ng-star-inserted" role="grid">
# <tr _ngcontent-app-root-c202="" role="row" mat-row="" class="mat-row cdk-row ng-star-inserted">

for index, row in df.iterrows():
    count += 1
    print(f"Count: {count}")
    if count > 2:
        break
    date_str = datetime.strftime(row['DATE_TIME'], '%Y-%m-%d')
    icao_code = row['ICAO']
    print(f"Date: {date_str}, ICAO: {icao_code}")

    # check if the data are already downloaded
    if weather_obs[(weather_obs['ICAO']==icao_code) & (pd.to_datetime(weather_obs['DATE_TIME']).dt.date==datetime.strptime(date_str, '%Y-%m-%d').date())].shape[0] > 1:
        continue

    driver.get('https://www.wunderground.com/history/daily/de/frankfurt/'+icao_code+'/date/'+date_str);

    time.sleep(1) # Let the user actually see something!

    # try:
    if True:
        # weather_table = driver.find_elements_by_xpath(
        weather_table = driver.find_elements(by=BY.XPATH, value=
                '//table[@class="mat-table cdk-table mat-sort ng-star-inserted"]/tbody/tr/td')

        table_html = list()
        table_html = list(weather_table)
        for ii in range(len(table_html)//10):
            dd = dict(
                zip(list(weather_obs.columns),[
                    [pd.to_datetime(datetime.strptime(date_str+'-'+table_html[0+ii*10].text, '%Y-%m-%d-%I:%M %p'))],
                    [icao_code],
                    [table_html[1+ii*10].text],
                    [table_html[5+ii*10].text],
                    [table_html[8+ii*10].text],
                    [table_html[9+ii*10].text]
            ]))
            df_temp = pd.DataFrame(data=dd)
            weather_obs = pd.concat([weather_obs, df_temp])
            # print(f"Time: {table_html[0+ii*10].text}, Windspeed: {table_html[5+ii*10].text}, Conditions: {table_html[9+ii*10].text}")
    # except Exception as e:
    #     print(e)

driver.quit()

  driver = webdriver.Chrome(executable_path='/home/fklein/zindi/Flight_Delay_Prediction_Challenge/chromedriver')  # Optional argument, if not specified will search path.


Count: 1
Date: 2016-01-03, ICAO: GMMN


NameError: name 'BY' is not defined

In [243]:
weather_obs.info()
weather_obs['ICAO'].unique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4089 entries, 0 to 0
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   DATE_TIME   4089 non-null   object
 1   ICAO        4089 non-null   object
 2   TEMP        4089 non-null   object
 3   WIND_SPEED  4089 non-null   object
 4   PRECIP      4089 non-null   object
 5   CONDITION   4089 non-null   object
dtypes: object(6)
memory usage: 223.6+ KB


array(['GMMN', 'LIMC', 'DTTA', 'DTTJ', 'LFBO', 'LTBA', 'LFPO', 'DTMB',
       'EBBR', 'LIPZ', 'EHAM', 'EDDF', 'LEBL', 'DAAG', 'LPPT', 'LFST',
       'LFLL', 'EGKK', 'OLBA', 'OEJN', 'LFMN', 'LPPR', 'LFML', 'EDDL',
       'LIRF', 'LFPG', 'LFRS', 'DIAP', 'LSZH', 'LSGG', 'GMFO', 'EDDM',
       'EDDB', 'EDDH'], dtype=object)

array(['NDR', 'NBE', 'GAE', 'VNO', 'DSS', 'BYJ', 'SKX'], dtype=object)

In [198]:
pd.DataFrame(data=dd)

Unnamed: 0,DATE_TIME,ICAO,TEMP,WIND_SPEED,PRECIP,CONDITION
0,2016-01-13 00:50:00,LIMC,34 °F,9 mph,0.0 in,Fair


In [206]:
pd.to_datetime(weather_obs['DATE_TIME'].iloc[0]).date


<function Timestamp.date>

In [219]:
pd.to_datetime(weather_obs['DATE_TIME']).dt.date.iloc[0]

datetime.date(2016, 1, 3)

In [237]:
weather_obs[(weather_obs['ICAO']=="LIMC") & (pd.to_datetime(weather_obs['DATE_TIME']).dt.date==datetime.strptime('2016-01-13', '%Y-%m-%d').date())].shape[0]

46