### Data Cleaning

I would like to clean data from two sources:

1) ACS 2018 Census Data
2) Shoemill 5-year Transaction History (By store, location, Customer, Price, etc.)



#### ACS 2018 Data

In [1]:
import pandas as pd
import numpy as np
acs_dta = pd.read_csv('acs_dta.csv')
acs_dta.head()

Unnamed: 0.1,Unnamed: 0,unwt_tot_pop_count,tot_housing,tot_sex_age,tot_male,tot_female,tot_med_age,med_age_male,med_age_female,tot_pop,hh_inc
0,"Block Group 1, Census Tract 16, Deschutes Coun...",50.0,20.0,874,586,288,32.9,31.0,57.0,874,374
1,"Block Group 2, Census Tract 16, Deschutes Coun...",100.0,50.0,2271,1089,1182,33.5,28.1,36.8,2271,979
2,"Block Group 1, Census Tract 17, Deschutes Coun...",150.0,60.0,3118,1374,1744,40.5,42.3,37.8,3118,1269
3,"Block Group 2, Census Tract 17, Deschutes Coun...",100.0,50.0,2370,1201,1169,37.2,34.3,39.7,2370,1026
4,"Block Group 4, Census Tract 17, Deschutes Coun...",90.0,40.0,2228,952,1276,32.8,37.6,30.2,2228,790


You can see above that there is an unnamed column and I would like to break apart block group, tract, and county into columns.

In [28]:
acsdta_cln = acs_dta["Unnamed: 0"].str.split(",", n = 5, expand = True)

In [29]:
print(acsdta_cln)

                  0                    1                  2  \
0     Block Group 1      Census Tract 16   Deschutes County   
1     Block Group 2      Census Tract 16   Deschutes County   
2     Block Group 1      Census Tract 17   Deschutes County   
3     Block Group 2      Census Tract 17   Deschutes County   
4     Block Group 4      Census Tract 17   Deschutes County   
...             ...                  ...                ...   
1092  Block Group 2   Census Tract 67.02   Multnomah County   
1093  Block Group 2     Census Tract 105   Multnomah County   
1094  Block Group 1   Census Tract 82.01   Multnomah County   
1095  Block Group 2      Census Tract 32   Multnomah County   
1096  Block Group 3    Census Tract 4.02   Multnomah County   

                                3  \
0      Oregon: Summary level: 150   
1      Oregon: Summary level: 150   
2      Oregon: Summary level: 150   
3      Oregon: Summary level: 150   
4      Oregon: Summary level: 150   
...                  

In [27]:
acsdta_cln

0       [Block Group 1,  Census Tract 16,  Deschutes C...
1       [Block Group 2,  Census Tract 16,  Deschutes C...
2       [Block Group 1,  Census Tract 17,  Deschutes C...
3       [Block Group 2,  Census Tract 17,  Deschutes C...
4       [Block Group 4,  Census Tract 17,  Deschutes C...
                              ...                        
1092    [Block Group 2,  Census Tract 67.02,  Multnoma...
1093    [Block Group 2,  Census Tract 105,  Multnomah ...
1094    [Block Group 1,  Census Tract 82.01,  Multnoma...
1095    [Block Group 2,  Census Tract 32,  Multnomah C...
1096    [Block Group 3,  Census Tract 4.02,  Multnomah...
Name: Unnamed: 0, Length: 1097, dtype: object

In [31]:
#pd.options.display.max_columns = None
cols = acsdta_cln.columns.tolist()

acsdta_cln.to_csv('acsdta_cln.csv', index=True)

In [24]:
acsdta_cln.columns.values

array([0, 1, 2, 3, 4], dtype=int64)

In [34]:
acsdta_cln.rename(columns={'0': 'blkgrp', '1': 'tract','2':'cnty'}, inplace=True)

In [35]:
print(acsdta_cln)

                  0                    1                  2  \
0     Block Group 1      Census Tract 16   Deschutes County   
1     Block Group 2      Census Tract 16   Deschutes County   
2     Block Group 1      Census Tract 17   Deschutes County   
3     Block Group 2      Census Tract 17   Deschutes County   
4     Block Group 4      Census Tract 17   Deschutes County   
...             ...                  ...                ...   
1092  Block Group 2   Census Tract 67.02   Multnomah County   
1093  Block Group 2     Census Tract 105   Multnomah County   
1094  Block Group 1   Census Tract 82.01   Multnomah County   
1095  Block Group 2      Census Tract 32   Multnomah County   
1096  Block Group 3    Census Tract 4.02   Multnomah County   

                                3  \
0      Oregon: Summary level: 150   
1      Oregon: Summary level: 150   
2      Oregon: Summary level: 150   
3      Oregon: Summary level: 150   
4      Oregon: Summary level: 150   
...                  

In [36]:
acsdta_cln.columns = ['blkgrp', 'tract', 'cnty', 'sum', 'lis']

In [37]:
print(acsdta_cln)

             blkgrp                tract               cnty  \
0     Block Group 1      Census Tract 16   Deschutes County   
1     Block Group 2      Census Tract 16   Deschutes County   
2     Block Group 1      Census Tract 17   Deschutes County   
3     Block Group 2      Census Tract 17   Deschutes County   
4     Block Group 4      Census Tract 17   Deschutes County   
...             ...                  ...                ...   
1092  Block Group 2   Census Tract 67.02   Multnomah County   
1093  Block Group 2     Census Tract 105   Multnomah County   
1094  Block Group 1   Census Tract 82.01   Multnomah County   
1095  Block Group 2      Census Tract 32   Multnomah County   
1096  Block Group 3    Census Tract 4.02   Multnomah County   

                              sum  \
0      Oregon: Summary level: 150   
1      Oregon: Summary level: 150   
2      Oregon: Summary level: 150   
3      Oregon: Summary level: 150   
4      Oregon: Summary level: 150   
...                  

In [38]:
acsdta_cln.drop(columns=['sum', 'lis'])

Unnamed: 0,blkgrp,tract,cnty
0,Block Group 1,Census Tract 16,Deschutes County
1,Block Group 2,Census Tract 16,Deschutes County
2,Block Group 1,Census Tract 17,Deschutes County
3,Block Group 2,Census Tract 17,Deschutes County
4,Block Group 4,Census Tract 17,Deschutes County
...,...,...,...
1092,Block Group 2,Census Tract 67.02,Multnomah County
1093,Block Group 2,Census Tract 105,Multnomah County
1094,Block Group 1,Census Tract 82.01,Multnomah County
1095,Block Group 2,Census Tract 32,Multnomah County


In [44]:
# Ok now join back to original

#acsdtafin = pd.concat([acs_dta, acsdta_cln], axis=1, ignore_index=True)
fin = pd.concat([acs_dta.reset_index(drop=True), acsdta_cln.reset_index(drop=False)], axis=1)

In [40]:
print(acsdtafin)

                                                      0      1     2     3  \
0     Block Group 1, Census Tract 16, Deschutes Coun...   50.0  20.0   874   
1     Block Group 2, Census Tract 16, Deschutes Coun...  100.0  50.0  2271   
2     Block Group 1, Census Tract 17, Deschutes Coun...  150.0  60.0  3118   
3     Block Group 2, Census Tract 17, Deschutes Coun...  100.0  50.0  2370   
4     Block Group 4, Census Tract 17, Deschutes Coun...   90.0  40.0  2228   
...                                                 ...    ...   ...   ...   
1092  Block Group 2, Census Tract 67.02, Multnomah C...  100.0  50.0  1049   
1093  Block Group 2, Census Tract 105, Multnomah Cou...   90.0  40.0  1037   
1094  Block Group 1, Census Tract 82.01, Multnomah C...  100.0  60.0   957   
1095  Block Group 2, Census Tract 32, Multnomah Coun...  100.0  40.0  1189   
1096  Block Group 3, Census Tract 4.02, Multnomah Co...  100.0  50.0  1424   

         4     5     6     7     8     9    10             11  

In [41]:
acsdtafin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"Block Group 1, Census Tract 16, Deschutes Coun...",50.0,20.0,874,586,288,32.9,31.0,57.0,874,374,Block Group 1,Census Tract 16,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001600> block gro...
1,"Block Group 2, Census Tract 16, Deschutes Coun...",100.0,50.0,2271,1089,1182,33.5,28.1,36.8,2271,979,Block Group 2,Census Tract 16,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001600> block gro...
2,"Block Group 1, Census Tract 17, Deschutes Coun...",150.0,60.0,3118,1374,1744,40.5,42.3,37.8,3118,1269,Block Group 1,Census Tract 17,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001700> block gro...
3,"Block Group 2, Census Tract 17, Deschutes Coun...",100.0,50.0,2370,1201,1169,37.2,34.3,39.7,2370,1026,Block Group 2,Census Tract 17,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001700> block gro...
4,"Block Group 4, Census Tract 17, Deschutes Coun...",90.0,40.0,2228,952,1276,32.8,37.6,30.2,2228,790,Block Group 4,Census Tract 17,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001700> block gro...


In [45]:
print(fin)

                                             Unnamed: 0  unwt_tot_pop_count  \
0     Block Group 1, Census Tract 16, Deschutes Coun...                50.0   
1     Block Group 2, Census Tract 16, Deschutes Coun...               100.0   
2     Block Group 1, Census Tract 17, Deschutes Coun...               150.0   
3     Block Group 2, Census Tract 17, Deschutes Coun...               100.0   
4     Block Group 4, Census Tract 17, Deschutes Coun...                90.0   
...                                                 ...                 ...   
1092  Block Group 2, Census Tract 67.02, Multnomah C...               100.0   
1093  Block Group 2, Census Tract 105, Multnomah Cou...                90.0   
1094  Block Group 1, Census Tract 82.01, Multnomah C...               100.0   
1095  Block Group 2, Census Tract 32, Multnomah Coun...               100.0   
1096  Block Group 3, Census Tract 4.02, Multnomah Co...               100.0   

      tot_housing  tot_sex_age  tot_male  tot_femal

In [46]:
dta_fin= fin.drop(columns=['Unnamed: 0 ', 'sum','lis'])

KeyError: "['Unnamed: 0 '] not found in axis"

In [51]:
fin.drop(['Unnamed: 0 , 'sum','lis'], axis=1, inplace=True)

SyntaxError: invalid syntax (<ipython-input-51-4b96afb257e5>, line 1)

In [49]:
fin.head()

Unnamed: 0,unwt_tot_pop_count,tot_housing,tot_sex_age,tot_male,tot_female,tot_med_age,med_age_male,med_age_female,tot_pop,hh_inc,index,blkgrp,tract,cnty,sum,lis
0,50.0,20.0,874,586,288,32.9,31.0,57.0,874,374,0,Block Group 1,Census Tract 16,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001600> block gro...
1,100.0,50.0,2271,1089,1182,33.5,28.1,36.8,2271,979,1,Block Group 2,Census Tract 16,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001600> block gro...
2,150.0,60.0,3118,1374,1744,40.5,42.3,37.8,3118,1269,2,Block Group 1,Census Tract 17,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001700> block gro...
3,100.0,50.0,2370,1201,1169,37.2,34.3,39.7,2370,1026,3,Block Group 2,Census Tract 17,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001700> block gro...
4,90.0,40.0,2228,952,1276,32.8,37.6,30.2,2228,790,4,Block Group 4,Census Tract 17,Deschutes County,Oregon: Summary level: 150,state:41> county:017> tract:001700> block gro...


In [52]:
fin.to_csv('fin.csv', index=True)

In [54]:
acscln = pd.read_csv('fin.csv')
acscln.head()

Unnamed: 0,unwt_tot_pop_count,tot_housing,tot_sex_age,tot_male,tot_female,tot_med_age,med_age_male,med_age_female,tot_pop,hh_inc,blkgrp,tract,cnty
0,50.0,20.0,874,586,288,32.9,31.0,57.0,874,374,Block Group 1,Census Tract 16,Deschutes County
1,100.0,50.0,2271,1089,1182,33.5,28.1,36.8,2271,979,Block Group 2,Census Tract 16,Deschutes County
2,150.0,60.0,3118,1374,1744,40.5,42.3,37.8,3118,1269,Block Group 1,Census Tract 17,Deschutes County
3,100.0,50.0,2370,1201,1169,37.2,34.3,39.7,2370,1026,Block Group 2,Census Tract 17,Deschutes County
4,90.0,40.0,2228,952,1276,32.8,37.6,30.2,2228,790,Block Group 4,Census Tract 17,Deschutes County


In [55]:
# Now save out to .csv and move on to EDA

acscln.to_csv('acscln.csv', index=True)