In [120]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [93]:
# Parse configurations - Done
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [None]:
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(worldtempe_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=';').csv(citydemo_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(airport_data_source)
# df = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')

In [121]:
airport_dataset = './airport-codes_csv.csv'
airport_df = pd.read_csv(airport_dataset,sep=",")
pd.set_option('display.max_columns', 50)
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [122]:
airport_df['iso_region']

0         US-PA
1         US-KS
2         US-AK
3         US-AL
4         US-AR
          ...  
55070     CN-21
55071     CN-21
55072    GB-ENG
55073    TF-U-A
55074     JP-46
Name: iso_region, Length: 55075, dtype: object

### Check for primary key of Airport.

Check the uniqueness of `ident`

In [34]:
airport_df['ident'].nunique()

55075

Check the uniqueness of `local_code`

In [35]:
airport_df['local_code'].nunique()

27436

Check the uniqueness of combination `ident` and `local_code`

In [36]:
key_list = ['ident', 'local_code']
airport_df[key_list]

Unnamed: 0,ident,local_code
0,00A,00A
1,00AA,00AA
2,00AK,00AK
3,00AL,00AL
4,00AR,
...,...,...
55070,ZYYK,
55071,ZYYY,
55072,ZZ-0001,
55073,ZZ-0002,


Check NaN values on columns `ident` and `local_code`

In [37]:
airport_df[key_list].isna().groupby('ident')['ident'].count()

ident
False    55075
Name: ident, dtype: int64

In [16]:
airport_df[key_list].isna().groupby('local_code')['local_code'].count()

local_code
False    28686
True     26389
Name: local_code, dtype: int64

We will use `ident` as primarykey

### Check airport distribution by country

Count column `iso_country` for missing values

In [44]:
airport_df.groupby('iso_country')['iso_country'].count()

iso_country
AD      2
AE     57
AF     64
AG      3
AI      1
     ... 
YT      1
ZA    489
ZM    103
ZW    138
ZZ      7
Name: iso_country, Length: 243, dtype: int64

Count `iata_code` group by `iso_country` for missing values

In [38]:
airport_df[airport_df['iso_country'].isna()].groupby('iata_code')['iata_code'].count()

iata_code
ADI    1
AIW    1
BQI    1
ERS    1
GFY    1
GOG    1
HAL    1
KAS    1
KMP    1
LHU    1
LUD    1
MJO    1
MPA    1
MQG    1
NDU    1
NNI    1
OHI    1
OKF    1
OKU    1
OMD    1
OMG    1
OND    1
OPW    1
OTJ    1
RHN    1
SWP    1
SZM    1
TCY    1
TSB    1
WDH    1
WVB    1
Name: iata_code, dtype: int64

Count `continent` group by `iso_country` for missing values

In [39]:
airport_df[airport_df['iso_country'].isna()].groupby('continent')['continent'].count()

continent
AF    247
Name: continent, dtype: int64

Missing `continent` values on airport belong to `iso_country = AF`. This mean don't worry about US.

### Filter out US airport

Count amount of US airport

In [95]:
iso_country_us = ['US']
airport_df = airport_df[airport_df['iso_country'].str.strip().isin(iso_country_us)].copy()
airport_df.groupby('iso_country')['iso_country'].count()

iso_country
US    22757
Name: iso_country, dtype: int64

Uppercase values in `iso_country` and filter out US airport

In [97]:
airport_df = airport_df[airport_df['iso_country'].fillna('').str.upper().str.contains('US')].copy()

In [98]:
airport_df.groupby('iso_country').count()

Unnamed: 0_level_0,ident,type,name,elevation_ft,continent,iso_region,municipality,gps_code,iata_code,local_code,coordinates
iso_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
US,22757,22757,22757,22518,1,22757,22655,20984,2019,21236,22757


In [99]:
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [100]:
airport_df.shape

(22757, 12)

### Filter out US airport type with immigration allow

List out airport type

In [101]:
airport_df.groupby('type')['type'].count()

type
balloonport          18
closed             1326
heliport           6265
large_airport       170
medium_airport      692
seaplane_base       566
small_airport     13720
Name: type, dtype: int64

The airport with immigration allowed is not includes some kind of types: closed, balloonport, heliport, seaplan_base.

We can filter out records of these airports

In [102]:
not_allow_immi_porttype = ['balloonport', 'closed', 'heliport', 'seaplane_base']
airport_df = airport_df[~airport_df['type'].str.strip().isin(not_allow_immi_porttype)].copy()
airport_df.groupby('type')['type'].count()

type
large_airport       170
medium_airport      692
small_airport     13720
Name: type, dtype: int64

In [103]:
airport_df.shape

(14582, 12)

### Mapping airport to city

Check the unique of column `municipality`

In [104]:
airport_df[airport_df.municipality.isna()]
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,Alex,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,Cordes,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
...,...,...,...,...,...,...,...,...,...,...,...,...
54574,Z91,small_airport,Birch Creek Airport,450.0,,US,US-AK,Birch Creek,Z91,KBC,Z91,"-145.824005127, 66.2740020752"
54575,Z92,small_airport,Harsens Island Airport,578.0,,US,US-MI,Harsens Island,Z92,,Z92,"-82.57640075683594, 42.589698791503906"
54576,Z93,small_airport,Copper Center 2 Airport,1150.0,,US,US-AK,Copper Center,Z93,CZC,Z93,"-145.294006348, 61.9412002563"
54577,Z95,small_airport,Cibecue Airport,5037.0,,US,US-AZ,Cibecue,Z95,,Z95,"-110.44400024414062, 34.003299713134766"


Remove missing values from column `municipality` and then upercase values in this column to merge later

In [105]:
airport_df = airport_df[~airport_df['municipality'].isna()].copy()
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,Alex,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,Cordes,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
...,...,...,...,...,...,...,...,...,...,...,...,...
54574,Z91,small_airport,Birch Creek Airport,450.0,,US,US-AK,Birch Creek,Z91,KBC,Z91,"-145.824005127, 66.2740020752"
54575,Z92,small_airport,Harsens Island Airport,578.0,,US,US-MI,Harsens Island,Z92,,Z92,"-82.57640075683594, 42.589698791503906"
54576,Z93,small_airport,Copper Center 2 Airport,1150.0,,US,US-AK,Copper Center,Z93,CZC,Z93,"-145.294006348, 61.9412002563"
54577,Z95,small_airport,Cibecue Airport,5037.0,,US,US-AZ,Cibecue,Z95,,Z95,"-110.44400024414062, 34.003299713134766"


In [106]:
airport_df.municipality = airport_df.municipality.str.upper()
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
...,...,...,...,...,...,...,...,...,...,...,...,...
54574,Z91,small_airport,Birch Creek Airport,450.0,,US,US-AK,BIRCH CREEK,Z91,KBC,Z91,"-145.824005127, 66.2740020752"
54575,Z92,small_airport,Harsens Island Airport,578.0,,US,US-MI,HARSENS ISLAND,Z92,,Z92,"-82.57640075683594, 42.589698791503906"
54576,Z93,small_airport,Copper Center 2 Airport,1150.0,,US,US-AK,COPPER CENTER,Z93,CZC,Z93,"-145.294006348, 61.9412002563"
54577,Z95,small_airport,Cibecue Airport,5037.0,,US,US-AZ,CIBECUE,Z95,,Z95,"-110.44400024414062, 34.003299713134766"


In [107]:
airport_df.shape

(14532, 12)

### Mapping airport to region

In [108]:
airport_df[airport_df.iso_region.isna()]
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
...,...,...,...,...,...,...,...,...,...,...,...,...
54574,Z91,small_airport,Birch Creek Airport,450.0,,US,US-AK,BIRCH CREEK,Z91,KBC,Z91,"-145.824005127, 66.2740020752"
54575,Z92,small_airport,Harsens Island Airport,578.0,,US,US-MI,HARSENS ISLAND,Z92,,Z92,"-82.57640075683594, 42.589698791503906"
54576,Z93,small_airport,Copper Center 2 Airport,1150.0,,US,US-AK,COPPER CENTER,Z93,CZC,Z93,"-145.294006348, 61.9412002563"
54577,Z95,small_airport,Cibecue Airport,5037.0,,US,US-AZ,CIBECUE,Z95,,Z95,"-110.44400024414062, 34.003299713134766"


In [148]:
airport_df.groupby('iso_region')['iso_region'].count()
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,len,iso_region_len
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",5,5
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",5,5
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",5,5
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",5,5
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087",5,5


In [149]:
airport_df['iso_region']

0         US-PA
1         US-KS
2         US-AK
3         US-AL
4         US-AR
          ...  
55070     CN-21
55071     CN-21
55072    GB-ENG
55073    TF-U-A
55074     JP-46
Name: iso_region, Length: 55075, dtype: object

In [146]:
airport_df['iso_region_len'] = airport_df["iso_region"].apply(len)

In [147]:
check_iso_region_length = ['iso_region','iso_region_len']
airport_df[check_iso_region_length]

Unnamed: 0,iso_region,iso_region_len
0,US-PA,5
1,US-KS,5
2,US-AK,5
3,US-AL,5
4,US-AR,5
...,...,...
55070,CN-21,5
55071,CN-21,5
55072,GB-ENG,6
55073,TF-U-A,6


In [150]:
df_airports = airport_df[airport_df['iso_region_len']==5].copy()

In [153]:
airport_df['iso_region_len']

0        5
1        5
2        5
3        5
4        5
        ..
55070    5
55071    5
55072    6
55073    6
55074    5
Name: iso_region_len, Length: 55075, dtype: int64

In [161]:
df_airports['state'] = df_airports['iso_region'].str.strip().str.split("-", n = 1, expand = True)[1]
df_airports

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,len,iso_region_len,state
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",5,5,PA
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",5,5,KS
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",5,5,AK
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",5,5,AL
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087",5,5,AR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55068,ZYTX,large_airport,Taoxian Airport,198.0,AS,CN,CN-21,Shenyang,ZYTX,SHE,,"123.48300170898438, 41.639801025390625",5,5,21
55069,ZYYJ,medium_airport,Yanji Chaoyangchuan Airport,624.0,AS,CN,CN-22,Yanji,ZYYJ,YNJ,,"129.451004028, 42.8828010559",5,5,22
55070,ZYYK,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,CN-21,Yingkou,ZYYK,YKH,,"122.3586, 40.542524",5,5,21
55071,ZYYY,medium_airport,Shenyang Dongta Airport,,AS,CN,CN-21,Shenyang,ZYYY,,,"123.49600219726562, 41.784400939941406",5,5,21


### Verify cleaned dataframe and save out to file .csv

In [159]:
airport_df.shape

(55075, 14)

Look like Airport data cleaned.

In [164]:
# Saving to CSV to staging
airport_df.to_csv("airports_df_clean.csv", index=False)

==========================================================================================================

==========================================================================================================