In [12]:
#Import dependencies
import pandas as pd
import numpy as np
import os

In [49]:
# Import CSV and convert to dataframes
IncomeZip_df = pd.read_csv(os.path.join("Resources", "income_zip.csv"))
zip_df = pd.read_csv(os.path.join("Resources", "zip.csv"))

In [50]:
# Clean Income dataframe

# Select desired columns for data frame
IncomeZip_df = IncomeZip_df[['GEO.display-label', 'HC01_EST_VC13']]

#Rename columns 
IncomeZip_df = IncomeZip_df.rename( index=str, columns = {"GEO.display-label": "ZIP","HC01_EST_VC13": "MED_INCOME"})

# Verify no missing data
IncomeZip_df.isnull().sum()

# Find duplicate rows
IncomeZip_df[IncomeZip_df.duplicated(['ZIP'])]

#Verify correct data stypes
IncomeZip_df.dtypes

# Inspect data 
IncomeZip_df.MED_INCOME.unique()

IncomeZip_df = IncomeZip_df.replace("N", None)

# Replace missing values and invalid values with None
IncomeZip_df = IncomeZip_df.replace(["-", "250,000+"], None)

# Remove extra characters from zip code
temp_df = IncomeZip_df["ZIP"].str.split(" ", n = 1, expand = True)
IncomeZip_df["ZIP"] = temp_df[1]

# Drop the first row
IncomeZip_df = IncomeZip_df.drop(IncomeZip_df.index[0])

# Change income column to interger value
IncomeZip_df["MED_INCOME"] = IncomeZip_df["MED_INCOME"].astype('int64')
IncomeZip_df["ZIP"] = IncomeZip_df["ZIP"].astype('int64')

# Verify correct data types
IncomeZip_df.dtypes

ZIP           int64
MED_INCOME    int64
dtype: object

In [54]:
# Clean Zip code dataframe

# Get zip code column from income dataframe

zip_nj = IncomeZip_df["ZIP"]
zip_nj

1       7001
2       7002
3       7003
4       7004
5       7005
6       7006
7       7008
8       7009
9       7010
10      7011
11      7012
12      7013
13      7014
14      7016
15      7017
16      7018
17      7020
18      7021
19      7022
20      7023
21      7024
22      7026
23      7027
24      7028
25      7029
26      7030
27      7031
28      7032
29      7033
30      7034
       ...  
571     8852
572     8853
573     8854
574     8857
575     8858
576     8859
577     8861
578     8863
579     8865
580     8867
581     8869
582     8872
583     8873
584     8876
585     8879
586     8880
587     8882
588     8884
589     8886
590     8887
591     8889
592     8890
593     8901
594     8902
595     8904
596    10969
597    10983
598    10990
599    18042
600    18343
Name: ZIP, Length: 600, dtype: int64

In [52]:
zip_df.dtypes

ZIP      int64
LAT    float64
LNG    float64
dtype: object

In [56]:
zip_df.join(IncomeZip_df, on="ZIP")

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [None]:
IncomeZip_df.dtypes

In [25]:
# Change AVG_Income column to interger value
IncomeZip_df.AVG_INCOME.unique()

array(['68426', '56701', '74961', '107417', '109888', '114096', '73892',
       '115926', '67799', '56285', '86472', '93005', '96339', '122848',
       '41322', '42335', '103505', '198750', '56057', '132292', '76623',
       '54037', '79063', '195714', '61084', '127523', '74481', '63222',
       '108281', '82626', '92075', '66166', '153381', '125036', '104786',
       '97920', '191449', '110745', '162798', '175556', '57269', '38506',
       '93954', '90355', '34958', '62648', '114229', '154429', '57959',
       '67664', '72440', '81912', '69932', '100000', '103205', '110208',
       '122434', '93787', '73218', '89419', '77400', '59700', '102283',
       '117765', '76389', '250,000+', '121637', '91250', '110547',
       '116615', '81359', '84841', '43407', '50299', '159923', '129405',
       '51509', '99883', '80196', '24353', '27859', '36547', '45227',
       '37421', '30779', '25192', '67782', '94128', '39734', '35765',
       '26326', '44655', '44867', '52480', '67300', '65717', '434

In [26]:
IncomeZip_df = IncomeZip_df.replace("250,000+", None)

In [27]:
IncomeZip_df.AVG_INCOME.unique()

array(['68426', '56701', '74961', '107417', '109888', '114096', '73892',
       '115926', '67799', '56285', '86472', '93005', '96339', '122848',
       '41322', '42335', '103505', '198750', '56057', '132292', '76623',
       '54037', '79063', '195714', '61084', '127523', '74481', '63222',
       '108281', '82626', '92075', '66166', '153381', '125036', '104786',
       '97920', '191449', '110745', '162798', '175556', '57269', '38506',
       '93954', '90355', '34958', '62648', '114229', '154429', '57959',
       '67664', '72440', '81912', '69932', '100000', '103205', '110208',
       '122434', '93787', '73218', '89419', '77400', '59700', '102283',
       '117765', '76389', '121637', '91250', '110547', '116615', '81359',
       '84841', '43407', '50299', '159923', '129405', '51509', '99883',
       '80196', '24353', '27859', '36547', '45227', '37421', '30779',
       '25192', '67782', '94128', '39734', '35765', '26326', '44655',
       '44867', '52480', '67300', '65717', '43429', '46572'

In [29]:
# Change Income column to interger value
IncomeZip_df["AVG_INCOME"] = IncomeZip_df["AVG_INCOME"].astype('int64')
IncomeZip_df["Zip Code"] = IncomeZip_df["Zip Code"].astype('int64')

In [30]:
IncomeZip_df.dtypes

Zip Code      int64
AVG_INCOME    int64
dtype: object

In [31]:
IncomeZip_df

Unnamed: 0,Zip Code,AVG_INCOME
1,7001,68426
2,7002,56701
3,7003,74961
4,7004,107417
5,7005,109888
6,7006,114096
7,7008,73892
8,7009,115926
9,7010,67799
10,7011,56285


In [34]:
IncomeZip_df["Zip Code"] = IncomeZip_df["Zip Code"].apply(lambda x: "{0:0>5}".format(x))

In [35]:
IncomeZip_df

Unnamed: 0,Zip Code,AVG_INCOME
1,07001,68426
2,07002,56701
3,07003,74961
4,07004,107417
5,07005,109888
6,07006,114096
7,07008,73892
8,07009,115926
9,07010,67799
10,07011,56285


In [37]:
IncomeZip_df.dtypes

Zip Code      object
AVG_INCOME     int64
dtype: object

In [39]:
IncomeZip_df.describe()

Unnamed: 0,AVG_INCOME
count,600.0
mean,90759.44
std,35624.626103
min,14740.0
25%,66604.0
50%,85532.0
75%,111372.5
max,233750.0


In [40]:
# Export data to .csv file
IncomeZip_df.to_csv(os.path.join('Data_temp', 'CleanZipCodeIncome.csv'))