In [22]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [23]:
citydemo_dataset = './us-cities-demographics.csv'
citydemo_df = pd.read_csv(citydemo_dataset,sep=";")
citydemo_df.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Count'],
      dtype='object')

In [28]:
citydemo_df.head()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [29]:
citydemo_df

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.60,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402
...,...,...,...,...,...,...,...,...,...,...,...,...
2886,Stockton,California,32.5,150976.0,154674.0,305650,12822.0,79583.0,3.16,CA,American Indian and Alaska Native,19834
2887,Southfield,Michigan,41.6,31369.0,41808.0,73177,4035.0,4011.0,2.27,MI,American Indian and Alaska Native,983
2888,Indianapolis,Indiana,34.1,410615.0,437808.0,848423,42186.0,72456.0,2.53,IN,White,553665
2889,Somerville,Massachusetts,31.0,41028.0,39306.0,80334,2103.0,22292.0,2.43,MA,American Indian and Alaska Native,374


In [30]:
def clean_dataframe_column_names(df):
    cols = df.columns
    new_column_names = []

    for col in cols:
        new_col = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_") #strip beginning spaces, makes lowercase, add underscpre
        new_column_names.append(new_col)

    df.columns = new_column_names

In [31]:
clean_dataframe_column_names(citydemo_df)

In [33]:
# Show dataset sample records
citydemo_df.head()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [34]:
citydemo_df.shape

(2891, 12)

In [35]:
# Verify record with missing value NA or NULL
citydemo_df.isna().sum().sort_values()

city                       0
state                      0
median_age                 0
total_population           0
state_code                 0
race                       0
count                      0
male_population            3
female_population          3
number_of_veterans        13
foreign_born              13
average_household_size    16
dtype: int64

In [37]:
# Remove records with missing value on column 'depdate'
citydemo_df = citydemo_df.dropna(subset=["male_population","female_population","number_of_veterans","foreign_born","average_household_size"],how="any")

In [13]:
# Verify record with missing value NA or NULL
citydemo_df.isna().sum().sort_values()

city                      0
state                     0
median_age                0
male_population           0
female_population         0
total_population          0
number_of_veterans        0
foreign-born              0
average_household_size    0
state_code                0
race                      0
count                     0
dtype: int64

In [39]:
clean_columns = ['city', 'state', 'median_age', 'male_population', 'female_population',
       'total_population', 'number_of_veterans', 'foreign_born',
       'average_household_size', 'state_code', 'count']

In [40]:
citydemo_df = citydemo_df[clean_columns]
citydemo_df

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.60,MD,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,76402
...,...,...,...,...,...,...,...,...,...,...,...
2886,Stockton,California,32.5,150976.0,154674.0,305650,12822.0,79583.0,3.16,CA,19834
2887,Southfield,Michigan,41.6,31369.0,41808.0,73177,4035.0,4011.0,2.27,MI,983
2888,Indianapolis,Indiana,34.1,410615.0,437808.0,848423,42186.0,72456.0,2.53,IN,553665
2889,Somerville,Massachusetts,31.0,41028.0,39306.0,80334,2103.0,22292.0,2.43,MA,374


In [41]:
citydemo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2875 entries, 0 to 2890
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    2875 non-null   object 
 1   state                   2875 non-null   object 
 2   median_age              2875 non-null   float64
 3   male_population         2875 non-null   float64
 4   female_population       2875 non-null   float64
 5   total_population        2875 non-null   int64  
 6   number_of_veterans      2875 non-null   float64
 7   foreign_born            2875 non-null   float64
 8   average_household_size  2875 non-null   float64
 9   state_code              2875 non-null   object 
 10  count                   2875 non-null   int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 269.5+ KB


In [44]:
# Convert data type
citydemo_df['state'] = citydemo_df['state'].astype(str)
citydemo_df['median_age'] = citydemo_df['median_age'].astype(int)
citydemo_df['male_population'] = citydemo_df['male_population'].astype(int)
citydemo_df['female_population'] = citydemo_df['female_population'].astype(int)
citydemo_df['total_population'] = citydemo_df['total_population'].astype(int)
citydemo_df['number_of_veterans'] = citydemo_df['number_of_veterans'].astype(int)
citydemo_df['foreign_born'] = citydemo_df['foreign_born'].astype(int)

citydemo_df.head()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,count
0,Silver Spring,Maryland,33,40601,41862,82463,1562,30908,2.6,MD,25924
1,Quincy,Massachusetts,41,44129,49500,93629,4147,32935,2.39,MA,58723
2,Hoover,Alabama,38,38040,46799,84839,4819,8229,2.58,AL,4759
3,Rancho Cucamonga,California,34,88127,87105,175232,5821,33878,3.18,CA,24437
4,Newark,New Jersey,34,138040,143873,281913,5829,86253,2.73,NJ,76402


In [22]:
# Show dataset sample records
citydemo_df.shape

(951, 9)

In [23]:
citydemo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 951 entries, 0 to 999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cicid     951 non-null    object
 1   i94yr     951 non-null    int32 
 2   i94mon    951 non-null    int32 
 3   i94cit    951 non-null    int32 
 4   arrdate   951 non-null    int32 
 5   i94mode   951 non-null    int32 
 6   depdate   951 non-null    int32 
 7   i94visa   951 non-null    int32 
 8   visatype  951 non-null    object
dtypes: int32(7), object(2)
memory usage: 48.3+ KB


In [31]:
# Check unique
citydemo_df[citydemo_columns].value_counts()

cicid      i94yr  i94mon  i94cit  arrdate  i94mode  depdate  i94visa  visatype
1000074.0  2016   4       129     20550    1        20564    2        WT          1
480428.0   2016   4       148     20547    1        20559    2        WT          1
4718122.0  2016   4       209     20569    1        20573    2        WT          1
4718538.0  2016   4       209     20569    1        20574    2        WT          1
4729596.0  2016   4       245     20569    1        20584    2        B2          1
                                                                                 ..
2863583.0  2016   4       689     20559    1        20569    2        B2          1
2865787.0  2016   4       691     20559    1        20566    1        B1          1
2865828.0  2016   4       691     20559    1        20600    2        B2          1
2867437.0  2016   4       691     20559    1        20590    2        B2          1
999282.0   2016   4       129     20550    1        20553    1        WB         

In [None]:
# Check unique
citydemo_df[citydemo_columns].nunique()

cicid       951
i94yr         1
i94mon        1
i94cit       87
arrdate      30
i94mode       3
depdate     109
i94visa       3
visatype      8
dtype: int64

In [33]:
citydemo_df[citydemo_columns].sort_values('arrdate',na_position="last")

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
215,25478.0,2016,4,131,20545,1,20633,2,WT
770,67523.0,2016,4,245,20545,1,20560,1,B1
244,86265.0,2016,4,368,20545,1,20553,2,B2
867,18310.0,2016,4,123,20545,1,20548,2,WT
665,32582.0,2016,4,135,20545,1,20552,2,WT
...,...,...,...,...,...,...,...,...,...
115,5883463.0,2016,4,687,20574,1,20589,2,B2
109,5756066.0,2016,4,260,20574,3,20576,2,B2
256,5899181.0,2016,4,696,20574,1,20583,1,B1
371,6057910.0,2016,4,252,20574,1,20578,2,GMT


In [34]:
citydemo_df.sort_values(by=['visatype','cicid'], ascending=True)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype
892,1215382.0,2016,4,245,20551,1,20569,1,B1
421,1330915.0,2016,4,687,20551,1,20558,1,B1
126,1346007.0,2016,4,746,20551,1,20566,1,B1
627,1346274.0,2016,4,746,20551,1,20554,1,B1
123,1643294.0,2016,4,273,20553,1,20558,1,B1
...,...,...,...,...,...,...,...,...,...
420,982263.0,2016,4,103,20550,1,20592,2,WT
848,982461.0,2016,4,103,20550,1,20551,2,WT
4,985523.0,2016,4,111,20550,3,20553,2,WT
897,991350.0,2016,4,111,20550,1,20555,2,WT


In [None]:
# Check unique
citydemo_df[citydemo_columns].value_counts()

cicid      i94yr  i94mon  i94cit  arrdate  i94mode  depdate  i94visa  visatype
1000074.0  2016   4       129     20550    1        20564    2        WT          1
480428.0   2016   4       148     20547    1        20559    2        WT          1
4718122.0  2016   4       209     20569    1        20573    2        WT          1
4718538.0  2016   4       209     20569    1        20574    2        WT          1
4729596.0  2016   4       245     20569    1        20584    2        B2          1
                                                                                 ..
2863583.0  2016   4       689     20559    1        20569    2        B2          1
2865787.0  2016   4       691     20559    1        20566    1        B1          1
2865828.0  2016   4       691     20559    1        20600    2        B2          1
2867437.0  2016   4       691     20559    1        20590    2        B2          1
999282.0   2016   4       129     20550    1        20553    1        WB         

In [35]:
citydemo_df['visatype'].value_counts(normalize=True)*100

WT     45.215563
B2     34.700315
WB      9.463722
B1      6.414301
GMT     2.523659
F1      0.841220
CP      0.525762
E2      0.315457
Name: visatype, dtype: float64

In [44]:
citydemo_df["Visa Ranking"] = citydemo_df["visatype"].rank(ascending = True).astype("int")
citydemo_df.sort_values(by=['visatype','Visa Ranking'], ascending=True)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,arrdate,i94mode,depdate,i94visa,visatype,Visa Ranking
30,5692439.0,2016,4,133,20574,1,20580,1,B1,31
34,4805034.0,2016,4,582,20569,1,20573,1,B1,31
41,692716.0,2016,4,245,20548,1,20651,1,B1,31
68,3293058.0,2016,4,691,20561,1,20565,1,B1,31
94,95870.0,2016,4,528,20545,1,20549,1,B1,31
...,...,...,...,...,...,...,...,...,...,...
989,1360834.0,2016,4,117,20552,1,20556,2,WT,736
992,3874218.0,2016,4,148,20565,1,20582,2,WT,736
994,5081809.0,2016,4,254,20571,1,20582,2,WT,736
995,4288772.0,2016,4,135,20567,1,20572,2,WT,736


In [45]:
citydemo_df['visatype'].value_counts()

WT     430
B2     330
WB      90
B1      61
GMT     24
F1       8
CP       5
E2       3
Name: visatype, dtype: int64

In [None]:
# Write to parquet partitioned by arrdate - Run on production
citydemo_df.write.partitionBy("arrdate").parquet(os.path.join(output_data, table), mode="overwrite")