In [2]:
import altair as alt
import pandas as pd
import numpy as np
import re
from vega_datasets import data

In [3]:
# Read the raw airports table. The file is decoded as Latin-1 rather than
# pandas' default UTF-8.
airport_url = 'csv/airports.csv'
airports_df = pd.read_csv(
    filepath_or_buffer=airport_url,
    encoding='latin-1',
)


In [4]:
# Preview the first five rows to check the columns that were loaded.
airports_df.head(n=5)


Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,TimeZone,DST,Tz Database Time Zone,Type,Source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789002,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


### Derive country continents from new dataset

In [5]:
# Load the continent lookup table used to assign a continent to each airport.
# The 'Code' and 'Year' columns are not used by the lookup, so they are
# dropped straight away.
continents = (
    pd.read_csv('csv/continents.csv')
    .drop(columns=['Code', 'Year'])
)


# aggregate all airports in each continent

Certain country names are spelled differently in the two datasets, so they must be normalized. To find them, we create a copy of the original dataframe and replace every country that does have a match in the continents dataset with its continent. We end up with a mix of continent names (matched countries) and country names (unmatched countries) in one column.

In [6]:

# Replace every airport country that can be found in the continents table with
# its continent. Countries that cannot be found keep their original name, so
# the result is a mix of continent names and unmatched country names.
temp_df = airports_df.copy()

# (lower-cased entity name, continent) pairs in table order. Column 0 of
# `continents` holds the entity name and column 1 the continent.
lowered_entities = [(row[0].lower(), row[1]) for row in continents.values]

# Resolve each distinct country once instead of once per airport row; the
# first entity whose name contains the country name (case-insensitive) wins.
continent_for_country = {}
for country in temp_df['Country'].unique():
    needle = country.lower()
    for entity, continent in lowered_entities:
        if needle in entity:
            continent_for_country[country] = continent
            break

# Single pass over the column: matched countries become their continent,
# everything else is left untouched.
temp_df['Country'] = temp_df['Country'].map(
    lambda name: continent_for_country.get(name, name)
)
mixed_continents_and_countries = temp_df['Country'].unique()


In [7]:
# Display the distinct values left in the column: continent names for the
# countries that were matched, original country names for those that were not.
mixed_continents_and_countries

array(['Oceania', 'North America', 'Europe', 'Africa', 'South America',
       'Faroe Islands', 'Congo (Brazzaville)', 'Congo (Kinshasa)',
       'Swaziland', 'Asia', 'Czech Republic', 'Antarctica', 'West Bank',
       'Midway Islands', 'Macau', 'Burma', 'East Timor', 'Johnston Atoll',
       'Cocos (Keeling) Islands', 'Wake Island'], dtype=object)

Next we must extract the entries that are not continent names (i.e. the countries that found no match) into their own list

In [8]:
# Continent names, lower-cased once so the membership test is case-insensitive.
known_continents = {name.lower() for name in continents['Continent'].to_list()}

# Any value that is not a continent name is a country that found no match.
unmatched_countries = [
    label
    for label in mixed_continents_and_countries
    if label.lower() not in known_continents
]

unmatched_countries

['Faroe Islands',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Swaziland',
 'Czech Republic',
 'West Bank',
 'Midway Islands',
 'Macau',
 'Burma',
 'East Timor',
 'Johnston Atoll',
 'Cocos (Keeling) Islands',
 'Wake Island']

Finally, we create a copy of the original dataframe and replace the unmatched country names with the spellings used in the continents dataset

In [9]:
# Rename the airport countries that have no counterpart in the continents
# table to the spelling used there, so the continent lookup can find them.
#
# A value-less `Series.replace(name)` does not rename anything: pandas treats
# it as a pad-fill (each hit takes the previous row's country) and newer
# releases deprecate that call form. An explicit old -> new mapping is used
# instead.
#
# NOTE(review): the right-hand spellings follow Our World in Data entity
# naming, which the continents table appears to use -- confirm against the
# first column of csv/continents.csv. The checks below fail loudly if any
# spelling is not present there.
COUNTRY_NAME_FIXES = {
    'Faroe Islands': 'Faeroe Islands',
    'Congo (Brazzaville)': 'Congo',
    'Congo (Kinshasa)': 'Democratic Republic of Congo',
    'Swaziland': 'Eswatini',
    'Czech Republic': 'Czechia',
    'West Bank': 'Palestine',
    'Midway Islands': 'United States Minor Outlying Islands',
    'Macau': 'Macao',
    'Burma': 'Myanmar',
    'East Timor': 'Timor',
    'Johnston Atoll': 'United States Minor Outlying Islands',
    'Cocos (Keeling) Islands': 'Cocos Islands',
    'Wake Island': 'United States Minor Outlying Islands',
}

new_country_names = airports_df.copy()

# Every unmatched country needs a replacement; otherwise it would still be
# unmatched when continents are assigned.
names_without_fix = [
    name for name in unmatched_countries if name not in COUNTRY_NAME_FIXES
]
if names_without_fix:
    raise ValueError(
        f"No replacement name defined for unmatched countries: {names_without_fix}"
    )

# Only rename the countries that were actually found to be unmatched.
applied_fixes = {name: COUNTRY_NAME_FIXES[name] for name in unmatched_countries}

# Check that each replacement name really occurs in the continents table
# (case-insensitive substring, the same rule the continent lookup uses).
entity_names = [str(entity).lower() for entity in continents.iloc[:, 0]]
unknown_targets = sorted(
    {
        target
        for target in applied_fixes.values()
        if not any(target.lower() in entity for entity in entity_names)
    }
)
if unknown_targets:
    raise ValueError(
        f"Replacement names not found in the continents table: {unknown_targets}"
    )

new_country_names['Country'] = new_country_names['Country'].replace(applied_fixes)

Now we can finally add continent information to the dataset

In [10]:
# Attach a 'Continent' column to the normalized airports table and export it.

# (lower-cased entity name, continent) pairs in table order. Column 0 of
# `continents` holds the entity name and column 1 the continent.
entity_rows = [(row[0].lower(), row[1]) for row in continents.values]

# Exact-name index; if an entity name occurs more than once the first row wins.
continent_by_entity = {}
for entity, continent in entity_rows:
    continent_by_entity.setdefault(entity, continent)


def find_continent(country):
    """Return the continent for `country`, or None when nothing matches.

    An exact (case-insensitive) entity name is preferred. Only when there is
    none does the lookup fall back to the first entity whose name contains
    the country name. Checking the exact name first stops a short name from
    being captured by a longer entity that merely contains it and is listed
    earlier (e.g. "India" inside "British Indian Ocean Territory").
    """
    needle = country.lower()
    if needle in continent_by_entity:
        return continent_by_entity[needle]
    for entity, continent in entity_rows:
        if needle in entity:
            return continent
    return None


# Resolve each distinct country once instead of once per airport row.
continent_by_country = {
    country: find_continent(country)
    for country in new_country_names['Country'].unique()
}

# A country without a continent would otherwise be skipped silently and only
# surface as a length mismatch on assignment; report the names explicitly.
still_unmatched = sorted(
    country
    for country, continent in continent_by_country.items()
    if continent is None
)
if still_unmatched:
    raise ValueError(
        f"No continent found for these countries: {still_unmatched}"
    )

new_country_names['Continent'] = new_country_names['Country'].map(continent_by_country)
airports_df = new_country_names
# export to new csv
airports_df.to_csv('csv/airports_continents.csv', index=False)