In [13]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [14]:
config = configparser.ConfigParser()
config.read('etl.cfg')

input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')

In [15]:
# Create Spark session - Using for droduction only
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [16]:
citydemo_dataset = './us-cities-demographics.csv'
citydemo_df = pd.read_csv(citydemo_dataset,sep=";")
citydemo_df.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Count'],
      dtype='object')

In [17]:
citydemo_df.describe()

Unnamed: 0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
count,2891.0,2888.0,2888.0,2891.0,2878.0,2878.0,2875.0,2891.0
mean,35.494881,97328.43,101769.6,198966.8,9367.832523,40653.6,2.742543,48963.77
std,4.401617,216299.9,231564.6,447555.9,13211.219924,155749.1,0.433291,144385.6
min,22.9,29281.0,27348.0,63215.0,416.0,861.0,2.0,98.0
25%,32.8,39289.0,41227.0,80429.0,3739.0,9224.0,2.43,3435.0
50%,35.3,52341.0,53809.0,106782.0,5397.0,18822.0,2.65,13780.0
75%,38.0,86641.75,89604.0,175232.0,9368.0,33971.75,2.95,54447.0
max,70.5,4081698.0,4468707.0,8550405.0,156961.0,3212500.0,4.98,3835726.0


In [18]:
pd.set_option('display.max_columns', 50)
citydemo_df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [19]:
citydemo_df.shape

(2891, 12)

In [20]:
# Check unique
citydemo_df.nunique()

City                       567
State                       49
Median Age                 180
Male Population            593
Female Population          594
Total Population           594
Number of Veterans         577
Foreign-born               587
Average Household Size     161
State Code                  49
Race                         5
Count                     2785
dtype: int64

In [21]:
citydemo_df.isna().sum()

City                       0
State                      0
Median Age                 0
Male Population            3
Female Population          3
Total Population           0
Number of Veterans        13
Foreign-born              13
Average Household Size    16
State Code                 0
Race                       0
Count                      0
dtype: int64

Remove leading and trailing space characters at  and upcase values on column 'City' to merge later

In [22]:
citydemo_df.City = citydemo_df.City.str.strip().str.upper()
citydemo_df

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,SILVER SPRING,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.60,MD,Hispanic or Latino,25924
1,QUINCY,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,HOOVER,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,RANCHO CUCAMONGA,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,NEWARK,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402
...,...,...,...,...,...,...,...,...,...,...,...,...
2886,STOCKTON,California,32.5,150976.0,154674.0,305650,12822.0,79583.0,3.16,CA,American Indian and Alaska Native,19834
2887,SOUTHFIELD,Michigan,41.6,31369.0,41808.0,73177,4035.0,4011.0,2.27,MI,American Indian and Alaska Native,983
2888,INDIANAPOLIS,Indiana,34.1,410615.0,437808.0,848423,42186.0,72456.0,2.53,IN,White,553665
2889,SOMERVILLE,Massachusetts,31.0,41028.0,39306.0,80334,2103.0,22292.0,2.43,MA,American Indian and Alaska Native,374


Unique values of column 'Race' just be 5. We check duplicate of this column

In [23]:
race_unique = citydemo_df['Race'].unique()
race_unique

array(['Hispanic or Latino', 'White', 'Asian',
       'Black or African-American', 'American Indian and Alaska Native'],
      dtype=object)

Sampling of values 'American Indian and Alaska Native'

In [24]:
citydemo_df[citydemo_df['Race'] == 'American Indian and Alaska Native']

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
5,PEORIA,Illinois,33.1,56229.0,62432.0,118661,6634.0,7517.0,2.40,IL,American Indian and Alaska Native,1343
11,FOLSOM,California,40.9,41051.0,35317.0,76368,4187.0,13234.0,2.62,CA,American Indian and Alaska Native,998
14,WICHITA,Kansas,34.6,192354.0,197601.0,389955,23978.0,40270.0,2.56,KS,American Indian and Alaska Native,8791
17,LAREDO,Texas,28.8,124305.0,131484.0,255789,4921.0,68427.0,3.66,TX,American Indian and Alaska Native,1253
22,BOLINGBROOK,Illinois,33.7,36295.0,35801.0,72096,2951.0,15212.0,3.42,IL,American Indian and Alaska Native,323
...,...,...,...,...,...,...,...,...,...,...,...,...
2878,CHARLOTTE,North Carolina,34.3,396646.0,430475.0,827121,36046.0,128897.0,2.52,NC,American Indian and Alaska Native,8746
2882,DAVIS,California,26.3,33493.0,34163.0,67656,2176.0,13997.0,2.69,CA,American Indian and Alaska Native,779
2886,STOCKTON,California,32.5,150976.0,154674.0,305650,12822.0,79583.0,3.16,CA,American Indian and Alaska Native,19834
2887,SOUTHFIELD,Michigan,41.6,31369.0,41808.0,73177,4035.0,4011.0,2.27,MI,American Indian and Alaska Native,983


Check duplication of combination of key 'City' and 'Race'

In [25]:
citydemo_df[citydemo_df[['City','Race']].duplicated()].count()

City                      143
State                     143
Median Age                143
Male Population           143
Female Population         143
Total Population          143
Number of Veterans        143
Foreign-born              143
Average Household Size    143
State Code                143
Race                      143
Count                     143
dtype: int64

A lot duplicate

Try the combination of 'City', 'Race' and 'State'

In [26]:
citydemo_df[citydemo_df[['City','Race','State']].duplicated()].count()

City                      0
State                     0
Median Age                0
Male Population           0
Female Population         0
Total Population          0
Number of Veterans        0
Foreign-born              0
Average Household Size    0
State Code                0
Race                      0
Count                     0
dtype: int64

The key of this table will be combination of **['City','Race','State']**

Next, we convert column names to lowercase without space

In [27]:
def convert_column_names(df):
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_") #strip beginning spaces, makes lowercase, add underscpre
        column_name_changed.append(new_column)

    df.columns = column_name_changed

In [29]:
convert_column_names(citydemo_df)
citydemo_df.columns

Index(['city', 'state', 'median_age', 'male_population', 'female_population',
       'total_population', 'number_of_veterans', 'foreign_born',
       'average_household_size', 'state_code', 'race', 'count'],
      dtype='object')

Perform NaN inventory on columns **['male_population', 'female_population', 'number_of_veterans', 'foreign_born', 'average_household_size']**

In [30]:
def nan_percentage_calc(df):
    nan_demographics_df = pd.DataFrame(data=df.isnull().sum(), columns=['NaN'])
    nan_demographics_df.drop(nan_demographics_df[nan_demographics_df['NaN'] == 0].index, inplace = True)
    nan_demographics_df['% of NaN'] = (nan_demographics_df['NaN']/df.count())*100
    return nan_demographics_df

In [31]:
nan_percentage_calc(citydemo_df)

Unnamed: 0,NaN,% of NaN
male_population,3,0.103878
female_population,3,0.103878
number_of_veterans,13,0.451703
foreign_born,13,0.451703
average_household_size,16,0.556522


Remove records with missing value **['male_population', 'female_population', 'number_of_veterans', 'foreign_born', 'average_household_size']**

In [32]:
citydemo_df = citydemo_df.dropna(subset=["male_population","female_population","number_of_veterans","foreign_born","average_household_size"],how="any")

Verify duplicate again

In [34]:
citydemo_df.duplicated().sum()

0

There is no duplicate anymore :)

Let's do data statistical

In [35]:
citydemo_df.describe()

Unnamed: 0,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,count
count,2875.0,2875.0,2875.0,2875.0,2875.0,2875.0,2875.0,2875.0
mean,35.434678,97445.02,101846.9,199291.9,9361.714435,40691.81,2.742543,48863.79
std,4.250501,216757.2,232051.3,448714.4,13216.754474,155825.9,0.433291,144631.5
min,22.9,29281.0,27348.0,63215.0,416.0,861.0,2.0,98.0
25%,32.8,39314.0,41227.0,80438.0,3737.5,9224.0,2.43,3454.0
50%,35.3,52336.0,53809.0,106782.0,5397.0,18830.0,2.65,13780.0
75%,38.0,86687.5,89589.0,175308.0,9368.0,34003.0,2.95,54146.5
max,48.8,4081698.0,4468707.0,8550405.0,156961.0,3212500.0,4.98,3835726.0


In [37]:
citydemo_df.head()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,SILVER SPRING,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,QUINCY,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,HOOVER,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,RANCHO CUCAMONGA,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,NEWARK,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


Saving the clean df to a new csv for staging

In [36]:
citydemo_df.to_csv('citydemo_df_clean.csv', index=False)