In [39]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [40]:
# Parse configurations - Done
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

#### Path 
input_data_source = '.'

output_processed_data = './storage'

#

i94immi_data_source = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'

worldtempe_data_source = '../../data2/GlobalLandTemperaturesByCity.csv'

citydemo_data_source = './us-cities-demographics.csv'

airport_data_source = './airport-codes_csv.csv'

#

i94_immi_splited_dir = './storage/.sas7bdat'

world_tempe_splited_dir = './storage/.csv'

In [114]:
# Create spark session
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [None]:
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(worldtempe_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=';').csv(citydemo_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(airport_data_source)
# df = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')

- Cleaning steps
    - Drop column: gps_code, iata_code, local_code, coordinates.
    - Drop records with NaN or NULL.
    - Convert datatype

In [41]:
airport_dataset = './airport-codes_csv.csv'
airport_df = pd.read_csv(airport_dataset,sep=",")
pd.set_option('display.max_columns', 50)
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [44]:
# Check airport distribution by country
airport_df.groupby('iso_country')['iso_country'].count()

iso_country
AD      2
AE     57
AF     64
AG      3
AI      1
     ... 
YT      1
ZA    489
ZM    103
ZW    138
ZZ      7
Name: iso_country, Length: 243, dtype: int64

Check column 'iata_code' for missing values

In [46]:
airport_df[airport_df['iso_country'].isna()].groupby('iata_code')['iata_code'].count()

iata_code
ADI    1
AIW    1
BQI    1
ERS    1
GFY    1
GOG    1
HAL    1
KAS    1
KMP    1
LHU    1
LUD    1
MJO    1
MPA    1
MQG    1
NDU    1
NNI    1
OHI    1
OKF    1
OKU    1
OMD    1
OMG    1
OND    1
OPW    1
OTJ    1
RHN    1
SWP    1
SZM    1
TCY    1
TSB    1
WDH    1
WVB    1
Name: iata_code, dtype: int64

Check column 'continent' for missing values

In [49]:
airport_df[airport_df['iso_country'].isna()].groupby('continent')['continent'].count()

continent
AF    247
Name: continent, dtype: int64

Missing 'continent' values on airport belong to 'iso_country = AF'. This mean don't worry about US.

We filter dataframe by 'iso_country = US' to a new dataframe and then upercase the new one to merge later.

In [50]:
airport_clean_continent_df = airport_df[airport_df['iso_country'].fillna('').str.upper().str.contains('US')].copy()

We do airport type inventory on new dataframe with name *airport_clean_df*

In [51]:
airport_clean_continent_df.groupby('type')['type'].count()

type
balloonport          18
closed             1326
heliport           6265
large_airport       170
medium_airport      692
seaplane_base       566
small_airport     13720
Name: type, dtype: int64

The airport with immigration allowed is not includes some kind of types: closed, balloonport, heliport, seaplan_base.

We can filter out records of these airports

In [52]:
not_allow_immi_porttype = ['balloonport', 'closed', 'heliport', 'seaplane_base']
airport_clean_airporttype_df = airport_clean_continent_df[~airport_clean_continent_df['type'].str.strip().isin(not_allow_immi_porttype)].copy()
airport_clean_airporttype_df.groupby('type')['type'].count()

type
large_airport       170
medium_airport      692
small_airport     13720
Name: type, dtype: int64

Check the unique of column 'municipality'

In [53]:
airport_clean_airporttype_df[airport_clean_airporttype_df.municipality.isna()]
airport_clean_airporttype_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,Alex,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,Cordes,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
...,...,...,...,...,...,...,...,...,...,...,...,...
54574,Z91,small_airport,Birch Creek Airport,450.0,,US,US-AK,Birch Creek,Z91,KBC,Z91,"-145.824005127, 66.2740020752"
54575,Z92,small_airport,Harsens Island Airport,578.0,,US,US-MI,Harsens Island,Z92,,Z92,"-82.57640075683594, 42.589698791503906"
54576,Z93,small_airport,Copper Center 2 Airport,1150.0,,US,US-AK,Copper Center,Z93,CZC,Z93,"-145.294006348, 61.9412002563"
54577,Z95,small_airport,Cibecue Airport,5037.0,,US,US-AZ,Cibecue,Z95,,Z95,"-110.44400024414062, 34.003299713134766"


In [None]:
airport_clean_airporttype_df

Remove missing values from column 'municipality' and then upercase values in this column to merge later

In [54]:
airport_clean_municipality_df = airport_clean_airporttype_df[~airport_clean_airporttype_df['municipality'].isna()].copy()
airport_clean_municipality_df.municipality = airport_clean_municipality_df.municipality.str.upper()
airport_clean_municipality_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
...,...,...,...,...,...,...,...,...,...,...,...,...
54574,Z91,small_airport,Birch Creek Airport,450.0,,US,US-AK,BIRCH CREEK,Z91,KBC,Z91,"-145.824005127, 66.2740020752"
54575,Z92,small_airport,Harsens Island Airport,578.0,,US,US-MI,HARSENS ISLAND,Z92,,Z92,"-82.57640075683594, 42.589698791503906"
54576,Z93,small_airport,Copper Center 2 Airport,1150.0,,US,US-AK,COPPER CENTER,Z93,CZC,Z93,"-145.294006348, 61.9412002563"
54577,Z95,small_airport,Cibecue Airport,5037.0,,US,US-AZ,CIBECUE,Z95,,Z95,"-110.44400024414062, 34.003299713134766"


In [58]:
airport_clean_iso_region_df = airport_clean_municipality_df.groupby('iso_region')['iso_region'].count()
airport_clean_iso_region_df.head(50)

iso_region
US-AK      586
US-AL      197
US-AR      291
US-AZ      214
US-CA      551
US-CO      288
US-CT       56
US-DC        2
US-DE       36
US-FL      522
US-GA      365
US-HI       35
US-IA      232
US-ID      238
US-IL      579
US-IN      486
US-KS      372
US-KY      164
US-LA      281
US-MA       79
US-MD      157
US-ME      122
US-MI      379
US-MN      361
US-MO      411
US-MS      211
US-MT      255
US-NC      349
US-ND      297
US-NE      259
US-NH       54
US-NJ      116
US-NM      149
US-NV      113
US-NY      402
US-OH      492
US-OK      372
US-OR      357
US-PA      486
US-RI       10
US-SC      173
US-SD      162
US-TN      228
US-TX     1546
US-U-A       3
US-UT      103
US-VA      311
US-VT       66
US-WA      379
US-WI      457
Name: iso_region, dtype: int64

In [59]:
airport_clean_iso_region_df = airport_clean_municipality_df.groupby('iso_region')['iso_region'].count()
airport_clean_iso_region_df.tail(50)

iso_region
US-AR      291
US-AZ      214
US-CA      551
US-CO      288
US-CT       56
US-DC        2
US-DE       36
US-FL      522
US-GA      365
US-HI       35
US-IA      232
US-ID      238
US-IL      579
US-IN      486
US-KS      372
US-KY      164
US-LA      281
US-MA       79
US-MD      157
US-ME      122
US-MI      379
US-MN      361
US-MO      411
US-MS      211
US-MT      255
US-NC      349
US-ND      297
US-NE      259
US-NH       54
US-NJ      116
US-NM      149
US-NV      113
US-NY      402
US-OH      492
US-OK      372
US-OR      357
US-PA      486
US-RI       10
US-SC      173
US-SD      162
US-TN      228
US-TX     1546
US-U-A       3
US-UT      103
US-VA      311
US-VT       66
US-WA      379
US-WI      457
US-WV       83
US-WY       95
Name: iso_region, dtype: int64

In [60]:
airport_clean_iso_country_df = airport_clean_municipality_df.groupby('iso_country')['iso_country'].count()
airport_clean_iso_country_df.head(50)

iso_country
US    14532
Name: iso_country, dtype: int64

In [61]:
airport_clean_iso_country_df = airport_clean_municipality_df.groupby('iso_country')['iso_country'].count()
airport_clean_iso_country_df.tail(50)

iso_country
US    14532
Name: iso_country, dtype: int64

Look like Airport data cleaned.

In [None]:
# Saving to CSV to staging
airport_clean_iso_country_df.to_csv("airports_df_clean.csv", index=False)

==========================================================================================================

==========================================================================================================

In [7]:
# Check airport distribution by country
airport_df[airport_df['iso_country'].isna()].count()

ident           247
type            247
name            247
elevation_ft    231
continent       247
iso_country       0
iso_region      247
municipality    145
gps_code         74
iata_code        31
local_code        0
coordinates     247
dtype: int64

In [8]:
# Check airport distribution by country
airport_df[airport_df['iso_country'].isna()].shape

(247, 12)

In [4]:
# Check country unique
airport_df['iso_country'].nunique()

243

In [5]:
airport_df = airport_df[airport_df['iso_country'] == 'US']
airport_df['iso_country'].nunique()

1

In [6]:
airport_df.isnull().sum()

ident               0
type                0
name                0
elevation_ft      239
continent       22756
iso_country         0
iso_region          0
municipality      102
gps_code         1773
iata_code       20738
local_code       1521
coordinates         0
dtype: int64

In [7]:
airport_df.duplicated().sum()

0

In [8]:
airport_df = airport_df.drop(['elevation_ft', 'continent', 'gps_code', 'local_code', 'coordinates'], axis=1)

In [9]:
airport_df.head()

Unnamed: 0,ident,type,name,iso_country,iso_region,municipality,iata_code
0,00A,heliport,Total Rf Heliport,US,US-PA,Bensalem,
1,00AA,small_airport,Aero B Ranch Airport,US,US-KS,Leoti,
2,00AK,small_airport,Lowell Field,US,US-AK,Anchor Point,
3,00AL,small_airport,Epps Airpark,US,US-AL,Harvest,
4,00AR,closed,Newport Hospital & Clinic Heliport,US,US-AR,Newport,


In [19]:
airport_df = airport_df.dropna(subset=["iata_code"])

In [24]:
airport_df.shape

(2019, 7)

In [25]:
def convert_column_names(df):
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_") #strip beginning spaces, makes lowercase, add underscpre
        column_name_changed.append(new_column)

    df.columns = column_name_changed

In [27]:
convert_column_names(airport_df)
airport_df.columns

Index(['ident', 'type', 'name', 'iso_country', 'iso_region', 'municipality',
       'iata_code'],
      dtype='object')

In [None]:
# Saving to CSV to staging
convert_column_names.to_csv("airports_df_clean.csv", index=False)