# Datapipeline Gans
## Loading API keys

In [1]:
def load_keys(relative_path) -> dict:
    import configparser
    import os

    configParser = configparser.RawConfigParser()   
    configFilePath = os.path.join(os.path.dirname(''), relative_path) 
    configParser.read(configFilePath)
    
    return {'openweather_key' : configParser.get('APIs', 'openweather_key'),
            'aerodatabox_key' : configParser.get('APIs', 'aerodatabox_key')}

In [None]:
import utils

In [None]:
keys = PrivateKeysHandler(relative_path_to_file)
api_key_dict = keys.load_keys('APIs')

In [2]:
d = load_keys('.env')
openweather_key = d['openweather_key']
aerodatabox_key = d['aerodatabox_key']

# Importing libraries

In [3]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Getting the cities_df

To collect information about European cities Wikipedia was used. We extracted the city names and population table found in the url below.

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits"
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [5]:
#  parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
# check that the html code looks like it should
#soup.prettify

In [6]:
# collecting city and population of the first 50 entries of the table
cities = []
population = []
for i in range(2, 52):
        cities.append(soup.select(f'.wikitable > tbody > tr:nth-child({i}) > td:nth-child(2)')[0].get_text())
        population.append(soup.select(f'.wikitable > tbody > tr:nth-child({i}) > td:nth-child(4) > span')[0].get_text())

We also want the geographical information (latitude, longitude) of the cities. To get those we opened for each city it's corresponding Wikipedia page (city_url). Fortunately lat and lon were at the same spot each time. So a single for loop was enough.

In [7]:
latitude = []
longitude = []
for city in cities:
    city_url = 'https://en.wikipedia.org/wiki/' + city
    city_response = requests.get(city_url)
    soup = BeautifulSoup(city_response.content, "html.parser")
    latitude.append(soup.select(".latitude")[0].get_text())
    longitude.append(soup.select(".longitude")[0].get_text())

In [8]:
#latitude

In [9]:
assert len(cities) == len(population) == len(latitude) == len(longitude)

In [10]:
# making a df with the columns we gathered
cities_df = pd.DataFrame( data = {'city': cities,
                         'lat':latitude,
                         'lon': longitude,
                         'population':population}
                        )
cities_df

Unnamed: 0,city,lat,lon,population
0,Berlin,52°31′12″N,13°24′18″E,3664088
1,Madrid,40°25′00″N,03°42′09″W,3305408
2,Rome,41°53′36″N,12°28′58″E,2770226
3,Bucharest,44°25′57″N,26°6′14″E,2161347
4,Paris,48°51′24″N,2°21′08″E,2139907
5,Vienna,48°12′N,16°22′E,1951354
6,Hamburg,53°33′55″N,10°00′05″E,1852478
7,Warsaw,52°13′48″N,21°00′40″E,1792718
8,Budapest,47°29′33″N,19°03′05″E,1723836
9,Barcelona,41°23′N,2°11′E,1636732


For the population to be an integer, we have to remove the commas:

In [11]:
cities_df['population'] = cities_df.population.str.replace(',', '')
cities_df['population'] = pd.to_numeric(cities_df['population'])
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   city        50 non-null     object
 1   lat         50 non-null     object
 2   lon         50 non-null     object
 3   population  50 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ KB


Since we want to use the longitude and latitude for the aerodata API, we have to change the format of these entries. We want to have float numbers instead of sth like 12°34´56´´N.
Note that a latitude with an S (south) and a longitude with a W (west) correspond to negative numbers in this case.

In [12]:
# since all latitudes gathered are on the north half and not too close to the equator,
# we can just remove all non-digits and pot a floating point after the second digit.
# If we would look at cities all over the world, we should proceed as we do for the longitude
cities_df['lat'] = cities_df.lat.str.replace('\D','', regex=True)
cities_df['lat'] = cities_df.lat.str[:2] + '.' + cities_df.lat.str[2:]

In [13]:
cities_df['lon'] = cities_df.lon.str.replace('°','.')
cities_df.loc[cities_df.lon.str[-1] == 'W','lon'] = '-' + cities_df.loc[cities_df.lon.str[-1] == 'W','lon']

In [14]:
cities_df['lon'] = cities_df['lon'].replace('[EW]', '', regex=True)

In [15]:
cities_df['lon'] = cities_df['lon'].replace('″', '', regex=True)
cities_df['lon'] = cities_df['lon'].replace('′', '', regex=True)

In [16]:
cities_df['lat'] = pd.to_numeric(cities_df['lat'])
cities_df['lon'] = pd.to_numeric(cities_df['lon'])

In [17]:
cities_df

Unnamed: 0,city,lat,lon,population
0,Berlin,52.3112,13.2418,3664088
1,Madrid,40.25,-3.4209,3305408
2,Rome,41.5336,12.2858,2770226
3,Bucharest,44.2557,26.614,2161347
4,Paris,48.5124,2.2108,2139907
5,Vienna,48.12,16.22,1951354
6,Hamburg,53.3355,10.0005,1852478
7,Warsaw,52.1348,21.004,1792718
8,Budapest,47.2933,19.0305,1723836
9,Barcelona,41.23,2.11,1636732


Now we can save the file as a csv-file.

In [18]:
cities_df.to_csv('cities.csv')

## Getting weather information using openweather API

Here we use the API keys that we loaded in the beginning of the notebook.

In [19]:
weather_url = f'http://api.openweathermap.org/data/2.5/forecast?q=Berlin&appid={openweather_key}&units=metric'
response = requests.get(weather_url)
#response.json()
response.json()

{'cod': '200',
 'message': 0,
 'cnt': 40,
 'list': [{'dt': 1654776000,
   'main': {'temp': 20.19,
    'feels_like': 20.07,
    'temp_min': 20.19,
    'temp_max': 21.56,
    'pressure': 999,
    'sea_level': 999,
    'grnd_level': 1006,
    'humidity': 69,
    'temp_kf': -1.37},
   'weather': [{'id': 802,
     'main': 'Clouds',
     'description': 'scattered clouds',
     'icon': '03d'}],
   'clouds': {'all': 40},
   'wind': {'speed': 3.88, 'deg': 285, 'gust': 5.19},
   'visibility': 10000,
   'pop': 0.06,
   'sys': {'pod': 'd'},
   'dt_txt': '2022-06-09 12:00:00'},
  {'dt': 1654786800,
   'main': {'temp': 21.25,
    'feels_like': 20.97,
    'temp_min': 21.25,
    'temp_max': 23.36,
    'pressure': 1003,
    'sea_level': 1003,
    'grnd_level': 1006,
    'humidity': 59,
    'temp_kf': -2.11},
   'weather': [{'id': 803,
     'main': 'Clouds',
     'description': 'broken clouds',
     'icon': '04d'}],
   'clouds': {'all': 60},
   'wind': {'speed': 3.47, 'deg': 290, 'gust': 4.63},
   'visi

In [20]:
def openweather(city):
    weather_url = f'http://api.openweathermap.org/data/2.5/forecast?q={city}&limit=5&appid={openweather_key}&units=metric'
    response = requests.get(weather_url)
    
    city_weather = []
    for i in range(len(response.json()['list'])):
        dt_txt = response.json()['list'][i]['dt_txt']
        temp = response.json()['list'][i]['main']['temp']
        humidity = response.json()['list'][i]['main']['humidity']
        weather = response.json()['list'][i]['weather'][0]['main']
        weather_desc = response.json()['list'][i]['weather'][0]['description']
        clouds = response.json()['list'][i]['clouds']['all']
        pop = response.json()['list'][i]['pop']
        city_weather.append([city, dt_txt, temp, humidity, weather, weather_desc, clouds, pop])
    return(city_weather)

In [21]:
#openweather('Berlin')

In [22]:
cit = []
dt_txt = []
temp = []
humidity = []
weather = []
weather_desc = []
clouds = []
pop = []

for city in cities:
    city_weather = openweather(city)
    for i in range(len(city_weather)):
        cit.append(city_weather[i][0])
        dt_txt.append(city_weather[i][1])
        temp.append(city_weather[i][2])
        humidity.append(city_weather[i][3])
        weather.append(city_weather[i][4])
        weather_desc.append(city_weather[i][5])
        clouds.append(city_weather[i][6])
        pop.append(city_weather[i][7])

In [23]:
city_weather_df = pd.DataFrame(data = {'city':cit, 
                                       'dt_txt':dt_txt, 
                                       'temp':temp, 
                                      'humidity': humidity, 
                                      'weather': weather, 
                                      'weather_desc': weather_desc, 
                                      'clouds': clouds, 
                                      'pop' : pop})

In [24]:
city_weather_df

Unnamed: 0,city,dt_txt,temp,humidity,weather,weather_desc,clouds,pop
0,Berlin,2022-06-09 12:00:00,20.09,69,Clouds,overcast clouds,98,0.06
1,Berlin,2022-06-09 15:00:00,21.18,59,Clouds,overcast clouds,99,0.13
2,Berlin,2022-06-09 18:00:00,20.56,57,Clouds,overcast clouds,99,0.03
3,Berlin,2022-06-09 21:00:00,18.11,71,Rain,light rain,99,0.28
4,Berlin,2022-06-10 00:00:00,14.79,81,Rain,light rain,79,0.27
...,...,...,...,...,...,...,...,...
1995,Antwerp,2022-06-13 21:00:00,13.02,68,Clouds,few clouds,12,0.00
1996,Antwerp,2022-06-14 00:00:00,11.85,73,Clouds,scattered clouds,31,0.00
1997,Antwerp,2022-06-14 03:00:00,10.80,76,Clouds,few clouds,11,0.00
1998,Antwerp,2022-06-14 06:00:00,13.72,65,Clouds,scattered clouds,33,0.00


In [28]:
city_weather_df.to_csv('city_weather.csv')

In [44]:
cities_df

Unnamed: 0,city,lat,lon,population
0,Berlin,52.3112,13.2418,3664088
1,Madrid,40.25,-3.4209,3305408
2,Rome,41.5336,12.2858,2770226
3,Bucharest,44.2557,26.614,2161347
4,Paris,48.5124,2.2108,2139907
5,Vienna,48.12,16.22,1951354
6,Hamburg,53.3355,10.0005,1852478
7,Warsaw,52.1348,21.004,1792718
8,Budapest,47.2933,19.0305,1723836
9,Barcelona,41.23,2.11,1636732


# Airports

we will now gather all the airports which are close to the cities from our cities_df. 
Since we also want to measure the distance from airport to city, we install geopy.
This allows us to get the distance from latitude and longitude alone.

In [59]:
pip install geopy

Collecting geopyNote: you may need to restart the kernel to use updated packages.
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)

Collecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


In [60]:
from geopy import distance

In [48]:
airports = []

querystring = {"withFlightInfoOnly":"true"}

headers = {
    "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com",
    "X-RapidAPI-Key": d['aerodatabox_key']
}

for i in range(len(cities_df)):
    lat = cities_df.iloc[i]['lat']
    lon = cities_df.iloc[i]['lon']
    airport_url = f"https://aerodatabox.p.rapidapi.com/airports/search/location/{lat}/{lon}/km/50/16"
    
    response = requests.request("GET", airport_url, headers=headers, params=querystring)
    
    for j in range(len(response.json()['items'])):
        country_code = response.json()['items'][j]['countryCode']
        iata = response.json()['items'][j]['iata']
        icao = response.json()['items'][j]['icao']
        name = response.json()['items'][j]['name']
        #short_name = response.json()['items'][j]['shortName']
        airport_lat = response.json()['items'][j]['location']['lat']
        airport_lon = response.json()['items'][j]['location']['lon']
        airports.append([cities_df.iloc[i]['city'], country_code, iata, icao, name, airport_lat, airport_lon])

In [51]:
airports_df = pd.DataFrame( data = airports, columns = ['city',
                                                        'country_code',
                                                        'iata',
                                                        'icao',
                                                        'airport_name',
                                                        'airport_lat', 
                                                        'airport_lon'])

In [1]:
for i in len(airports_df):
    city_lat = cities_df.loc[cities_df['city'].str == airports_df.iloc[i]['city'].str].lat
    city_lon = cities_df.loc[cities_df['city'].str == airports_df.iloc[i]['city'].str].lon
    city_geo = (city_lat, city_lon)
    airport_geo = (airports_df.iloc[i].airport_lat, airports_df.iloc[i].airport_lon)
    airport = 

NameError: name 'airports_df' is not defined

In [47]:
response.json()

{'items': [{'icao': 'ETUR',
   'iata': 'BGN',
   'localCode': 'БЯГ',
   'name': 'Brüggen Air Base',
   'location': {'lat': 51.1997, 'lon': 6.13208},
   'countryCode': 'DE'},
  {'icao': 'EDDL',
   'iata': 'DUS',
   'name': 'Duesseldorf, Düsseldorf',
   'shortName': 'Düsseldorf',
   'municipalityName': 'Duesseldorf',
   'location': {'lat': 51.2895, 'lon': 6.766779},
   'countryCode': 'DE'}]}

In [61]:
print(distance.distance((51.1997, 6.13208), (51.2895, 6.766779)).km)

45.43164214949632


In [39]:
type(response)

requests.models.Response

In [62]:
pd.json_normalize(response.json()['items'])

Unnamed: 0,icao,iata,name,shortName,municipalityName,countryCode,location.lat,location.lon
0,EBAW,ANR,"Antwerp, Antwerp (Deurne)",(Deurne),Antwerp,BE,51.1894,4.460279
1,EBBR,BRU,Brussels,Brussels,Brussels,BE,50.9014,4.484439
2,EHWO,WOE,"Bergen op Zoom, Woensdrecht Air Base",Woensdrecht Air Base,Bergen op Zoom,NL,51.4491,4.34203
