In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
from pathlib import Path

In [2]:
# Parse configurations - Done
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [3]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

In [4]:
airport_dataset

'./airport-codes_csv.csv'

In [5]:
# Run on production version
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [6]:
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(worldtempe_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=';').csv(citydemo_data_source)
# df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv(airport_data_source)
# df = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')

In [10]:
def rmdir(directory):
    '''
    This procedure perform pure recursive a directory.
    
    Parameters
    ----------
    directory : string_of_path_to_dir
        The input directory is a path to target dir. This dir and all its belong child objects wil be deleted.
        Syntax note: rmdir(Path("target_path_to_dir"))
            with Path("target_path_to_dir") returns path to dir format as 'directory' input
    
    Returns
    -------
    None
    '''
    directory = Path(directory)
    for item in directory.iterdir():
        if item.is_dir():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

In [60]:
airport_dataset = './airport-codes_csv.csv'
airport_df = pd.read_csv(airport_dataset,sep=",")
pd.set_option('display.max_columns', 50)
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [61]:
airport_df.shape

(55075, 12)

In [62]:
airport_df['iso_region']

0         US-PA
1         US-KS
2         US-AK
3         US-AL
4         US-AR
5         US-OK
6         US-AZ
7         US-CA
8         US-CA
9         US-CA
10        US-CO
11        US-FL
12        US-FL
13        US-FL
14        US-GA
15        US-GA
16        US-HI
17        US-ID
18        US-KS
19        US-IN
20        US-IL
21        US-IN
22        US-IL
23        US-KS
24        US-KY
25        US-LA
26        US-IL
27        US-LA
28        US-MD
29        US-MI
          ...  
55045     CN-21
55046     CN-22
55047     CN-21
55048     CN-21
55049     CN-23
55050     CN-21
55051     CN-23
55052     CN-23
55053     CN-23
55054     CN-22
55055     CN-23
55056     CN-23
55057     CN-23
55058     CN-21
55059     CN-23
55060     CN-63
55061     CN-23
55062     CN-23
55063     CN-23
55064     CN-22
55065     CN-23
55066     CN-21
55067     CN-22
55068     CN-21
55069     CN-22
55070     CN-21
55071     CN-21
55072    GB-ENG
55073    TF-U-A
55074     JP-46
Name: iso_region, Length

### Check for primary key of Airport.

Check the uniqueness of `ident`

In [63]:
airport_df['ident'].nunique()

55075

Check the uniqueness of `local_code`

In [64]:
airport_df['local_code'].nunique()

27436

Check the uniqueness of combination `ident` and `local_code`

In [65]:
key_list = ['ident', 'local_code']
airport_df[key_list]

Unnamed: 0,ident,local_code
0,00A,00A
1,00AA,00AA
2,00AK,00AK
3,00AL,00AL
4,00AR,
5,00AS,00AS
6,00AZ,00AZ
7,00CA,00CA
8,00CL,00CL
9,00CN,00CN


Check NaN values on columns `ident` and `local_code`

In [66]:
airport_df[key_list].isna().groupby('ident')['ident'].count()

ident
False    55075
Name: ident, dtype: int64

In [67]:
airport_df[key_list].isna().groupby('local_code')['local_code'].count()

local_code
False    28686
True     26389
Name: local_code, dtype: int64

We will use `ident` as primarykey

### Check airport distribution by country

Count column `iso_country` for missing values

In [68]:
airport_df.groupby('iso_country')['iso_country'].count()

iso_country
AD        2
AE       57
AF       64
AG        3
AI        1
AL       13
AM       13
AO      104
AQ       27
AR      848
AS        4
AT      145
AU     1963
AW        1
AZ       35
BA       15
BB        6
BD       16
BE      146
BF       51
BG      134
BH        4
BI        7
BJ       10
BL        1
BM        3
BN        2
BO      197
BQ        3
BR     4334
      ...  
TM       21
TN       15
TO        6
TR      124
TT        3
TV        3
TW       65
TZ      207
UA      191
UG       38
UM        6
US    22757
UY       54
UZ      176
VA        1
VC        6
VE      592
VG        3
VI        9
VN       50
VU       32
WF        2
WS        4
XK        6
YE       25
YT        1
ZA      489
ZM      103
ZW      138
ZZ        7
Name: iso_country, Length: 243, dtype: int64

Count `iata_code` group by `iso_country` for missing values

In [69]:
airport_df[airport_df['iso_country'].isna()].groupby('iata_code')['iata_code'].count()

iata_code
ADI    1
AIW    1
BQI    1
ERS    1
GFY    1
GOG    1
HAL    1
KAS    1
KMP    1
LHU    1
LUD    1
MJO    1
MPA    1
MQG    1
NDU    1
NNI    1
OHI    1
OKF    1
OKU    1
OMD    1
OMG    1
OND    1
OPW    1
OTJ    1
RHN    1
SWP    1
SZM    1
TCY    1
TSB    1
WDH    1
WVB    1
Name: iata_code, dtype: int64

Count `continent` group by `iso_country` for missing values

In [70]:
airport_df[airport_df['iso_country'].isna()].groupby('continent')['continent'].count()

continent
AF    247
Name: continent, dtype: int64

Missing `continent` values on airport belong to `iso_country = AF`. This mean don't worry about US.

### Filter out US airport

Count amount of US airport

In [71]:
iso_country_us = ['US']
airport_df = airport_df[airport_df['iso_country'].str.strip().isin(iso_country_us)].copy()
airport_df.groupby('iso_country')['iso_country'].count()

iso_country
US    22757
Name: iso_country, dtype: int64

Uppercase values in `iso_country` and filter out US airport

In [73]:
airport_df = airport_df[airport_df['iso_country'].fillna('').str.upper().str.contains('US')].copy()

In [74]:
airport_df.groupby('iso_country').count()

Unnamed: 0_level_0,ident,type,name,elevation_ft,continent,iso_region,municipality,gps_code,iata_code,local_code,coordinates
iso_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
US,22757,22757,22757,22518,1,22757,22655,20984,2019,21236,22757


In [75]:
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [27]:
airport_df.shape

(22757, 12)

### Filter out US airport type with immigration allow

List out airport type

In [76]:
airport_df.groupby('type')['type'].count()

type
balloonport          18
closed             1326
heliport           6265
large_airport       170
medium_airport      692
seaplane_base       566
small_airport     13720
Name: type, dtype: int64

The airport with immigration allowed is not includes some kind of types: closed, balloonport, heliport, seaplan_base.

We can filter out records of these airports

In [77]:
not_allow_immi_porttype = ['balloonport', 'closed', 'heliport', 'seaplane_base']
airport_df = airport_df[~airport_df['type'].str.strip().isin(not_allow_immi_porttype)].copy()
airport_df.groupby('type')['type'].count()

type
large_airport       170
medium_airport      692
small_airport     13720
Name: type, dtype: int64

In [78]:
airport_df.shape

(14582, 12)

### Mapping airport to city

Check the unique of column `municipality`

In [79]:
airport_df[airport_df.municipality.isna()]
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,Alex,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,Cordes,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
7,00CA,small_airport,Goldstone /Gts/ Airport,3038.0,,US,US-CA,Barstow,00CA,,00CA,"-116.888000488, 35.350498199499995"
8,00CL,small_airport,Williams Ag Airport,87.0,,US,US-CA,Biggs,00CL,,00CL,"-121.763427, 39.427188"
11,00FA,small_airport,Grass Patch Airport,53.0,,US,US-FL,Bushnell,00FA,,00FA,"-82.21900177001953, 28.64550018310547"
13,00FL,small_airport,River Oak Airport,35.0,,US,US-FL,Okeechobee,00FL,,00FL,"-80.96920013427734, 27.230899810791016"
14,00GA,small_airport,Lt World Airport,700.0,,US,US-GA,Lithonia,00GA,,00GA,"-84.06829833984375, 33.76750183105469"


Remove missing values from column `municipality` and then upercase values in this column to merge later

In [80]:
airport_df = airport_df[~airport_df['municipality'].isna()].copy()
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,Alex,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,Cordes,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
7,00CA,small_airport,Goldstone /Gts/ Airport,3038.0,,US,US-CA,Barstow,00CA,,00CA,"-116.888000488, 35.350498199499995"
8,00CL,small_airport,Williams Ag Airport,87.0,,US,US-CA,Biggs,00CL,,00CL,"-121.763427, 39.427188"
11,00FA,small_airport,Grass Patch Airport,53.0,,US,US-FL,Bushnell,00FA,,00FA,"-82.21900177001953, 28.64550018310547"
13,00FL,small_airport,River Oak Airport,35.0,,US,US-FL,Okeechobee,00FL,,00FL,"-80.96920013427734, 27.230899810791016"
14,00GA,small_airport,Lt World Airport,700.0,,US,US-GA,Lithonia,00GA,,00GA,"-84.06829833984375, 33.76750183105469"


In [81]:
airport_df.municipality = airport_df.municipality.str.upper()
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
7,00CA,small_airport,Goldstone /Gts/ Airport,3038.0,,US,US-CA,BARSTOW,00CA,,00CA,"-116.888000488, 35.350498199499995"
8,00CL,small_airport,Williams Ag Airport,87.0,,US,US-CA,BIGGS,00CL,,00CL,"-121.763427, 39.427188"
11,00FA,small_airport,Grass Patch Airport,53.0,,US,US-FL,BUSHNELL,00FA,,00FA,"-82.21900177001953, 28.64550018310547"
13,00FL,small_airport,River Oak Airport,35.0,,US,US-FL,OKEECHOBEE,00FL,,00FL,"-80.96920013427734, 27.230899810791016"
14,00GA,small_airport,Lt World Airport,700.0,,US,US-GA,LITHONIA,00GA,,00GA,"-84.06829833984375, 33.76750183105469"


In [82]:
airport_df.shape

(14532, 12)

### Mapping airport to region

In [83]:
airport_df[airport_df.iso_region.isna()]
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"
7,00CA,small_airport,Goldstone /Gts/ Airport,3038.0,,US,US-CA,BARSTOW,00CA,,00CA,"-116.888000488, 35.350498199499995"
8,00CL,small_airport,Williams Ag Airport,87.0,,US,US-CA,BIGGS,00CL,,00CL,"-121.763427, 39.427188"
11,00FA,small_airport,Grass Patch Airport,53.0,,US,US-FL,BUSHNELL,00FA,,00FA,"-82.21900177001953, 28.64550018310547"
13,00FL,small_airport,River Oak Airport,35.0,,US,US-FL,OKEECHOBEE,00FL,,00FL,"-80.96920013427734, 27.230899810791016"
14,00GA,small_airport,Lt World Airport,700.0,,US,US-GA,LITHONIA,00GA,,00GA,"-84.06829833984375, 33.76750183105469"


In [85]:
airport_df.groupby('iso_region')['iso_region'].count()

iso_region
US-AK      586
US-AL      197
US-AR      291
US-AZ      214
US-CA      551
US-CO      288
US-CT       56
US-DC        2
US-DE       36
US-FL      522
US-GA      365
US-HI       35
US-IA      232
US-ID      238
US-IL      579
US-IN      486
US-KS      372
US-KY      164
US-LA      281
US-MA       79
US-MD      157
US-ME      122
US-MI      379
US-MN      361
US-MO      411
US-MS      211
US-MT      255
US-NC      349
US-ND      297
US-NE      259
US-NH       54
US-NJ      116
US-NM      149
US-NV      113
US-NY      402
US-OH      492
US-OK      372
US-OR      357
US-PA      486
US-RI       10
US-SC      173
US-SD      162
US-TN      228
US-TX     1546
US-U-A       3
US-UT      103
US-VA      311
US-VT       66
US-WA      379
US-WI      457
US-WV       83
US-WY       95
Name: iso_region, dtype: int64

In [86]:
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028"
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484"


In [87]:
airport_df['iso_region']

1        US-KS
2        US-AK
3        US-AL
5        US-OK
6        US-AZ
7        US-CA
8        US-CA
11       US-FL
13       US-FL
14       US-GA
17       US-ID
18       US-KS
20       US-IL
22       US-IL
23       US-KS
24       US-KY
27       US-LA
28       US-MD
30       US-MN
31       US-MO
33       US-NJ
34       US-NC
37       US-NY
38       US-OH
40       US-OK
43       US-PA
45       US-OR
46       US-SC
47       US-SD
50       US-TN
         ...  
52740    US-MN
52741    US-MN
52742    US-ND
52743    US-MI
52744    US-IA
52745    US-WI
52746    US-MI
52747    US-MI
52748    US-MI
52749    US-MI
52750    US-MI
52751    US-ND
54550    US-AK
54551    US-AK
54552    US-AK
54557    US-AK
54559    US-AK
54560    US-AK
54562    US-AK
54563    US-AK
54564    US-AK
54565    US-AK
54570    US-AK
54571    US-AK
54573    US-AK
54574    US-AK
54575    US-MI
54576    US-AK
54577    US-AZ
54896    US-AK
Name: iso_region, Length: 14532, dtype: object

In [90]:
airport_df['iso_region_len'] = airport_df["iso_region"].apply(len)

In [91]:
check_iso_region_length = ['iso_region','iso_region_len']
airport_df[check_iso_region_length]

Unnamed: 0,iso_region,iso_region_len
1,US-KS,5
2,US-AK,5
3,US-AL,5
5,US-OK,5
6,US-AZ,5
7,US-CA,5
8,US-CA,5
11,US-FL,5
13,US-FL,5
14,US-GA,5


In [92]:
airport_df = airport_df[airport_df['iso_region_len']==5].copy()

In [93]:
airport_df['iso_region_len']

1        5
2        5
3        5
5        5
6        5
7        5
8        5
11       5
13       5
14       5
17       5
18       5
20       5
22       5
23       5
24       5
27       5
28       5
30       5
31       5
33       5
34       5
37       5
38       5
40       5
43       5
45       5
46       5
47       5
50       5
        ..
52740    5
52741    5
52742    5
52743    5
52744    5
52745    5
52746    5
52747    5
52748    5
52749    5
52750    5
52751    5
54550    5
54551    5
54552    5
54557    5
54559    5
54560    5
54562    5
54563    5
54564    5
54565    5
54570    5
54571    5
54573    5
54574    5
54575    5
54576    5
54577    5
54896    5
Name: iso_region_len, Length: 14529, dtype: int64

In [94]:
airport_df['state'] = airport_df['iso_region'].str.strip().str.split("-", n = 1, expand = True)[1]
airport_df

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,iso_region_len,state
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022",5,KS
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968",5,AK
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172",5,AL
5,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028",5,OK
6,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484",5,AZ
7,00CA,small_airport,Goldstone /Gts/ Airport,3038.0,,US,US-CA,BARSTOW,00CA,,00CA,"-116.888000488, 35.350498199499995",5,CA
8,00CL,small_airport,Williams Ag Airport,87.0,,US,US-CA,BIGGS,00CL,,00CL,"-121.763427, 39.427188",5,CA
11,00FA,small_airport,Grass Patch Airport,53.0,,US,US-FL,BUSHNELL,00FA,,00FA,"-82.21900177001953, 28.64550018310547",5,FL
13,00FL,small_airport,River Oak Airport,35.0,,US,US-FL,OKEECHOBEE,00FL,,00FL,"-80.96920013427734, 27.230899810791016",5,FL
14,00GA,small_airport,Lt World Airport,700.0,,US,US-GA,LITHONIA,00GA,,00GA,"-84.06829833984375, 33.76750183105469",5,GA


### Verify cleaned dataframe and save out to file .csv

In [95]:
airport_df.shape

(14529, 14)

Staging to CSV use pandas

In [98]:
airport_df.to_csv("airports_df_clean.csv", index=False)

==========================================================================================================

==========================================================================================================

In [99]:
# Verify staging CSV
airport_dataset = './airports_df_clean.csv'
airport_df = pd.read_csv(airport_dataset,sep=",")
pd.set_option('display.max_columns', 50)

In [47]:
airport_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,iso_region_len
0,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,LEOTI,00AA,,00AA,"-101.473911, 38.704022",5
1,00AK,small_airport,Lowell Field,450.0,,US,US-AK,ANCHOR POINT,00AK,,00AK,"-151.695999146, 59.94919968",5
2,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,HARVEST,00AL,,00AL,"-86.77030181884766, 34.86479949951172",5
3,00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,ALEX,00AS,,00AS,"-97.8180194, 34.9428028",5
4,00AZ,small_airport,Cordes Airport,3810.0,,US,US-AZ,CORDES,00AZ,,00AZ,"-112.16500091552734, 34.305599212646484",5


In [100]:
# Read out from CSV file to spark dataframe
airport_df = spark.read.csv("airports_df_clean.csv")

In [102]:
# Write dataframe to CSV partitions use Spark

#rmdir(Path("airport_df_clean"))
# airport_df.write.options(header='True', delimiter=',').csv("airport_df_clean")
airport_df.write.mode('overwrite').csv("airport_df_clean")

### Staging cleaned `AIRPORT` from saved csv partitions

In [103]:
# Read out from csv partitions to staging dataframe
airport_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("airport_df_clean")

In [104]:
# Verify loaded dataframe
airport_df.show()

+-----+-------------+--------------------+------------+---------+-----------+----------+------------+--------+---------+----------+--------------------+--------------+-----+
|ident|         type|                name|elevation_ft|continent|iso_country|iso_region|municipality|gps_code|iata_code|local_code|         coordinates|iso_region_len|state|
+-----+-------------+--------------------+------------+---------+-----------+----------+------------+--------+---------+----------+--------------------+--------------+-----+
| 00AA|small_airport|Aero B Ranch Airport|      3435.0|     null|         US|     US-KS|       LEOTI|    00AA|     null|      00AA|-101.473911, 38.7...|             5|   KS|
| 00AK|small_airport|        Lowell Field|       450.0|     null|         US|     US-AK|ANCHOR POINT|    00AK|     null|      00AK|-151.695999146, 5...|             5|   AK|
| 00AL|small_airport|        Epps Airpark|       820.0|     null|         US|     US-AL|     HARVEST|    00AL|     null|      00AL

In [105]:
# Create table from dataframe
airport_df.createOrReplaceTempView('airport_table')

In [106]:
# Verify created table will be using for staging
spark.sql("""
    SELECT COUNT(*) as amount_airport_rows
    FROM airport_table
""").show()

+-------------------+
|amount_airport_rows|
+-------------------+
|              14529|
+-------------------+



==========================================================================================================

==========================================================================================================