In [1]:
# imports functions from the other notebooks
import sys, os
sys.path.append(os.path.join(os.path.dirname(''), '..'))
#sys.path.append(os.path.join(os.path.dirname(''), '../..'))
from proj3_gans_scooters.src.scraping import scrape_wiki_cities, scrape_weather, icao_airport_codes, city_airport_distance
from proj3_gans_scooters.src.utils import PrivateKeysHandler, MyMySQLConnection, load_or_execute_df

In [2]:
import datetime as dt
from pytz import timezone

In [3]:
import pandas as pd
import requests

In [6]:
relative_path_to_file = '.env_aws'
keys = PrivateKeysHandler(relative_path_to_file)
api_key_dict = keys.load_keys('APIs')

Here we set the date, so we get the flights for the next day

In [4]:
today = dt.datetime.now().astimezone(timezone('Europe/Berlin')).date()
tomorrow = (today + dt.timedelta(days=1)).strftime('%Y-%m-%d') #format YYYY-MM-DD is used in the url for the aerodatabox
tomorrow

'2022-06-15'

Since for the API we can only check flights for 12 hours, we define a list of two times.
This way we call our function twice in order to cover the whole day.

In [5]:
times = [['00:00', '11:59'],['12:00', '23:59']]

here comes the function. Input is a list of icao's. the function returns a dataframe of all the flights of the next day.
one can see that the url is requested twice for every airport_icao.
We use the .get function for dictionaries in order to get the information. 
This prevents unwanted errors: if for example a departure_icao doesn't exist, the a 'None'-value is added to the list.

At the end of the for loop, the arrival_time_local is split into the actual local time and the timezone.
after reordering the columns, the dataframe is returned.

In [17]:
def flights_scraping_tomorrow(icao_list):
    list_for_flights_df = []
    times = [['00:00', '11:59'],['12:00', '23:59']]
    querystring = {"withLeg":"true","direction":"Arrival"}
    headers = {
        "X-RapidAPI-Key": api_key_dict['aerodatabox_key'],
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }
    for icao in icao_list:
        for time in times:
            url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao}/{tomorrow}T{time[0]}/{tomorrow}T{time[1]}"
            response = requests.request("GET", url, headers=headers, params=querystring)
            flights_json = response.json()
            for flight in flights_json['arrivals']:
                flights_dict = {}
                flights_dict['arrival_icao'] = icao
                flights_dict['arrival_time_local'] = flight['arrival'].get('scheduledTimeLocal', None)
                flights_dict['departure_city'] = flight['departure']['airport'].get('name', None)
                flights_dict['departure_icao'] = flight['departure']['airport'].get('icao', None)
                flights_dict['departure_time_local'] = flight['departure'].get('scheduledTimeLocal', None)
                flights_dict['airline'] = flight['airline'].get('name', None)
                flights_dict['flight_number'] = flight.get('number', None)
                flights_dict['data_retrieved_on'] = dt.datetime.now().astimezone(timezone('Europe/Berlin')).date()
                list_for_flights_df.append(flights_dict)
    flights_df = pd.DataFrame(list_for_flights_df)
    flights_df['timezone'] = 'UTC+' + flights_df['arrival_time_local'].str.split('+')[0][1].split(':')[0]
    flights_df['arrival_time_local'] = flights_df['arrival_time_local'].str.split('+')[0][0]
    cols = ['arrival_icao', 'arrival_time_local', 'timezone', 'departure_icao', 'departure_city', 'airline', 'flight_number', 'data_retrieved_on']
    flights_df = flights_df[cols]
    return flights_df

In [11]:
icao_list=['EDDB','EDDT']

Error because I had no requests left for the API

In [37]:
flights_df = flights_scraping_tomorrow(icao_list)

KeyError: 'arrivals'

# Rest of the file is just me playing around with splitting columns in the dataframe and can be ignored.

In [22]:
flights_df

Unnamed: 0,arrival_icao,arrival_time_local,departure_city,departure_icao,departure_time_local,airline,flight_number,data_retrieved_on
0,EDDB,2022-06-15 09:45+02:00,Bazel,,,easyJet,U2 5630,2022-06-14
1,EDDB,2022-06-15 10:30+02:00,Amsterdam,,,easyJet,U2 4562,2022-06-14
2,EDDB,2022-06-15 10:50+02:00,Bolzano,,,FlexFlight,W2 1250,2022-06-14
3,EDDB,2022-06-15 11:25+02:00,Reykjavik,BIKF,2022-06-15 05:45+00:00,Ghodawat Enterprises,OG 700,2022-06-14
4,EDDB,2022-06-15 09:45+02:00,Brussels,EBBR,2022-06-15 08:15+02:00,Ryanair,FR 163,2022-06-14
...,...,...,...,...,...,...,...,...
224,EDDB,2022-06-15 12:10+02:00,Istanbul,LTFJ,2022-06-15 10:10+03:00,Pegasus,PC 979,2022-06-14
225,EDDB,2022-06-15 14:05+02:00,Istanbul,LTFM,2022-06-15 12:15+03:00,Turkish,TK 1725,2022-06-14
226,EDDB,2022-06-15 18:00+02:00,Istanbul,LTFM,2022-06-15 16:10+03:00,Turkish,TK 1723,2022-06-14
227,EDDB,2022-06-15 19:55+02:00,Belgrade,LYBE,2022-06-15 18:05+02:00,Air Serbia,JU 354,2022-06-14


In [66]:
flights_df['timezone'] = 'UTC+' + flights_df['arrival_time_local'].str.split('+')[0][1].split(':')[0]
flights_df['arrival_time_local'] = flights_df['arrival_time_local'].str.split('+')[0][0]
cols = ['arrival_icao', 'arrival_time_local', 'timezone', 'departure_icao', 'departure_city', 'airline', 'flight_number', 'data_retrieved_on']
flights_df = flights_df[cols]
flights_df

Unnamed: 0,arrival_icao,arrival_time_local,timezone,departure_icao,departure_city,airline,flight_number,data_retrieved_on
0,EDDB,2022-06-15 09:45,UTC+02,,Bazel,easyJet,U2 5630,2022-06-14
1,EDDB,2022-06-15 09:45,UTC+02,,Amsterdam,easyJet,U2 4562,2022-06-14
2,EDDB,2022-06-15 09:45,UTC+02,,Bolzano,FlexFlight,W2 1250,2022-06-14
3,EDDB,2022-06-15 09:45,UTC+02,BIKF,Reykjavik,Ghodawat Enterprises,OG 700,2022-06-14
4,EDDB,2022-06-15 09:45,UTC+02,EBBR,Brussels,Ryanair,FR 163,2022-06-14
...,...,...,...,...,...,...,...,...
224,EDDB,2022-06-15 09:45,UTC+02,LTFJ,Istanbul,Pegasus,PC 979,2022-06-14
225,EDDB,2022-06-15 09:45,UTC+02,LTFM,Istanbul,Turkish,TK 1725,2022-06-14
226,EDDB,2022-06-15 09:45,UTC+02,LTFM,Istanbul,Turkish,TK 1723,2022-06-14
227,EDDB,2022-06-15 09:45,UTC+02,LYBE,Belgrade,Air Serbia,JU 354,2022-06-14


In [None]:
# import datetime as dt
# now = dt.datetime.now()
# current_date = now.strftime('%Y-%m-%d')
# url1 = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao}/{current_date}T00:00/{current_date}T11:59"
#    # url2 = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao}/{current_date}T12:00/{current_date}T23:59"
    
# querystring = {"withLeg":"true","direction":"Arrival"}

# headers = {
#         "X-RapidAPI-Key": api_key_dict['aerodatabox_key'],
#         "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
#     }
# ICAOcode = requests.request("GET", url1, headers=headers, params=querystring)

In [None]:
#ICAOcode.json()['arrivals']

In [44]:
flights_df['arrival_time_local'].str.split('+')[0]

['2022-06-15 09:45', '02:00']

In [None]:
#flight = ICAOcode.pd.normalize

In [None]:
df = pd.DataFrame(data = [[1,2,3],[2,3,4],[3,4,5]], columns = ['a', 'b', 'c'])
df = df[['a','c']]
#df = df.drop(columns= ['b'])
df