In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [2]:
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [4]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

In [5]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [7]:
def convert_column_names(df):
    '''
    This procedure standardizing column names to snake case format. Format ex: customer_name, billing_address, total_price.
    
    Parameters
    ----------
    dataframe : string_of_dataframe
        The input dataframe with column names might have elements of messy columns names, including accents, different delimiters, casing and multiple white spaces.
        Snake case style replaces the white spaces and symbol delimiters with underscore and converts all characters to lower case
    
    Returns
    -------
    Dataframe with column names has been changed to snake case format.
    '''
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_")
        column_name_changed.append(new_column)

    df.columns = column_name_changed

### Review `US.CITY DEMOGRAPHICS` dataset

In [8]:
citydemo_dataset

'./us-cities-demographics.csv'

In [9]:
citydemo_dataset = './us-cities-demographics.csv'
citydemo_df = pd.read_csv(citydemo_dataset,sep=";")
citydemo_df.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Count'],
      dtype='object')

In [10]:
citydemo_df.describe()

Unnamed: 0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
count,2891.0,2888.0,2888.0,2891.0,2878.0,2878.0,2875.0,2891.0
mean,35.494881,97328.43,101769.6,198966.8,9367.832523,40653.6,2.742543,48963.77
std,4.401617,216299.9,231564.6,447555.9,13211.219924,155749.1,0.433291,144385.6
min,22.9,29281.0,27348.0,63215.0,416.0,861.0,2.0,98.0
25%,32.8,39289.0,41227.0,80429.0,3739.0,9224.0,2.43,3435.0
50%,35.3,52341.0,53809.0,106782.0,5397.0,18822.0,2.65,13780.0
75%,38.0,86641.75,89604.0,175232.0,9368.0,33971.75,2.95,54447.0
max,70.5,4081698.0,4468707.0,8550405.0,156961.0,3212500.0,4.98,3835726.0


In [11]:
pd.set_option('display.max_columns', 50)
citydemo_df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [12]:
citydemo_df.shape

(2891, 12)

### Checking for unique, missing values

In [16]:
citydemo_df.isna().sum()

City                      0
State                     0
Median Age                0
Male Population           0
Female Population         0
Total Population          0
Number of Veterans        0
Foreign-born              0
Average Household Size    0
State Code                0
Race                      0
Count                     0
dtype: int64

There are 5 columns have missing values: `Male Population` `Female Population` `Number of Veterans` `Foreign-born` `Average Household Size`

Perform NaN inventory on columns `Male Population` `Female Population` `Number of Veterans` `Foreign-born` `Average Household Size`

In [17]:
def nan_percentage_calc(df):
    nan_demographics_df = pd.DataFrame(data=df.isnull().sum(), columns=['NaN'])
    nan_demographics_df.drop(nan_demographics_df[nan_demographics_df['NaN'] == 0].index, inplace = True)
    nan_demographics_df['% of NaN'] = (nan_demographics_df['NaN']/df.count())*100
    return nan_demographics_df

In [18]:
nan_percentage_calc(citydemo_df)

Unnamed: 0,NaN,% of NaN
City,,
State,,
Median Age,,
Male Population,,
Female Population,,
Total Population,,
Number of Veterans,,
Foreign-born,,
Average Household Size,,
State Code,,


Remove records with missing value `Male Population` `Female Population` `Number of Veterans` `Foreign-born` `Average Household Size`

In [20]:
citydemo_df = citydemo_df.dropna(subset=["Male Population","Female Population","Number of Veterans","Foreign-born","Average Household Size"],how="any")

In [21]:
# Check unique
citydemo_df.nunique()

City                       559
State                       48
Median Age                 176
Male Population            586
Female Population          587
Total Population           586
Number of Veterans         576
Foreign-born               586
Average Household Size     161
State Code                  48
Race                         5
Count                     2770
dtype: int64

Unique values of column `Race` just be 5. We check duplicate of this column

In [22]:
race_unique = citydemo_df['Race'].unique()
race_unique

array(['Hispanic or Latino', 'White', 'Asian', 'Black or African-American',
       'American Indian and Alaska Native'], dtype=object)

Sample records with `race` column values is `American Indian and Alaska Native`

In [None]:
citydemo_df[citydemo_df['Race'] == 'American Indian and Alaska Native']

### Checking for uniqueness of columns combination

Check duplication of combination of key `City` and `Race`

In [24]:
citydemo_df[citydemo_df[['City','Race']].duplicated()].count()

City                      143
State                     143
Median Age                143
Male Population           143
Female Population         143
Total Population          143
Number of Veterans        143
Foreign-born              143
Average Household Size    143
State Code                143
Race                      143
Count                     143
dtype: int64

A lot duplicate. Try the combination of `City`, `Race` and `State`

In [25]:
citydemo_df[citydemo_df[['City','Race','State']].duplicated()].count()

City                      0
State                     0
Median Age                0
Male Population           0
Female Population         0
Total Population          0
Number of Veterans        0
Foreign-born              0
Average Household Size    0
State Code                0
Race                      0
Count                     0
dtype: int64

The key of this table will be combination of `City`, `Race`, `State`

### Standalizing column names format

Next, we convert column names to lowercase without space

In [26]:
convert_column_names(citydemo_df)
citydemo_df.columns

Index(['city', 'state', 'median_age', 'male_population', 'female_population',
       'total_population', 'number_of_veterans', 'foreign_born',
       'average_household_size', 'state_code', 'race', 'count'],
      dtype='object')

Remove leading and trailing space characters at  and upcase values on column `city` to merge later

In [None]:
citydemo_df.city = citydemo_df.city.str.strip().str.upper()
citydemo_df

Verify duplicate again

In [32]:
citydemo_df.duplicated().sum()

0

There is no duplicate anymore :)

### Take a look of data statistical

In [33]:
citydemo_df.describe()

Unnamed: 0,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,count
count,2875.0,2875.0,2875.0,2875.0,2875.0,2875.0,2875.0,2875.0
mean,35.434678,97445.02,101846.9,199291.9,9361.714435,40691.81,2.742543,48863.79
std,4.250501,216757.2,232051.3,448714.4,13216.754474,155825.9,0.433291,144631.5
min,22.9,29281.0,27348.0,63215.0,416.0,861.0,2.0,98.0
25%,32.8,39314.0,41227.0,80438.0,3737.5,9224.0,2.43,3454.0
50%,35.3,52336.0,53809.0,106782.0,5397.0,18830.0,2.65,13780.0
75%,38.0,86687.5,89589.0,175308.0,9368.0,34003.0,2.95,54146.5
max,48.8,4081698.0,4468707.0,8550405.0,156961.0,3212500.0,4.98,3835726.0


In [34]:
citydemo_df.head()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,SILVER SPRING,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,QUINCY,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,HOOVER,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,RANCHO CUCAMONGA,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,NEWARK,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


### Save cleaned dataframe

Save csv use pandas

In [35]:
citydemo_df.to_csv('citydemo_df_clean.csv', index=False)

In [36]:
# Verify staging CSV
citydemo_dataset = './citydemo_df_clean.csv'
citydemo_df = pd.read_csv(citydemo_dataset,sep=",")
pd.set_option('display.max_columns', 50)

In [37]:
citydemo_df.head()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,SILVER SPRING,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,QUINCY,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,HOOVER,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,RANCHO CUCAMONGA,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,NEWARK,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


Save csv to Spark partitions

In [38]:
# Read out from CSV file to spark dataframe
citydemo_df = spark.read.csv("citydemo_df_clean.csv")

In [39]:
# Write dataframe to CSV partitions use Spark

#rmdir(Path("citydemo_df_clean"))
# citydemo_df.write.options(header='True', delimiter=',').csv("citydemo_df_clean")
citydemo_df.write.mode('overwrite').csv("citydemo_df_clean")

### Staging cleaned `US. CITY DEMOGRAPHICS` from saved csv partitions

In [40]:
# Read out from csv partitions to staging dataframe
citydemo_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("citydemo_df_clean")

In [41]:
# Verify loaded dataframe
citydemo_df.show()

+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|            city|         state|median_age|male_population|female_population|total_population|number_of_veterans|foreign_born|average_household_size|state_code|                race| count|
+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|   SILVER SPRING|      Maryland|      33.8|        40601.0|          41862.0|           82463|            1562.0|     30908.0|                   2.6|        MD|  Hispanic or Latino| 25924|
|          QUINCY| Massachusetts|      41.0|        44129.0|          49500.0|           93629|            4147.0|     32935.0|                  2.39|        MA|               White| 58723|
|          HOOVER|       Alabama|      38.5|      

In [45]:
citydemo_df.head()

Row(city='SILVER SPRING', state='Maryland', median_age=33.8, male_population=40601.0, female_population=41862.0, total_population=82463, number_of_veterans=1562.0, foreign_born=30908.0, average_household_size=2.6, state_code='MD', race='Hispanic or Latino', count=25924)

In [42]:
# Create table from dataframe
citydemo_df.createOrReplaceTempView('citydemo_table')

In [43]:
# Verify created table will be using for staging
spark.sql("""
    SELECT COUNT(*) as amount_citydemo_rows
    FROM citydemo_table
""").show()

+--------------------+
|amount_citydemo_rows|
+--------------------+
|                2875|
+--------------------+

