# `censusdata` example

Docs: https://jtleider.github.io/censusdata/

```shell
pip install --user censusdata
```

In [1]:
import censusdata
import pandas as pd

In [11]:
# the last five labels that have the word unemploy in them
censusdata.search(src='acs1', year = 2017, field = 'label', criterion = 'internet' )

[('B24030_083E',
  'SEX BY INDUSTRY FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER',
  'Estimate!!Total!!Male!!Information!!Broadcasting, internet publishing, and telecommunications services'),
 ('B24030_187E',
  'SEX BY INDUSTRY FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER',
  'Estimate!!Total!!Female!!Information!!Broadcasting, internet publishing, and telecommunications services'),
 ('B24040_083E',
  'SEX BY INDUSTRY FOR THE FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER',
  'Estimate!!Total!!Male!!Information!!Broadcasting, internet publishing, and telecommunications services'),
 ('B24040_187E',
  'SEX BY INDUSTRY FOR THE FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER',
  'Estimate!!Total!!Female!!Information!!Broadcasting, internet publishing, and telecommunications services'),
 ('B28002_002E',
  'PRESENCE AND TYPES OF INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!With an Internet subscription'),
 ('B28002_0

In [9]:
# you can search by "concept"
censusdata.search(src = 'acs1', year = 2015, field = 'concept', criterion = 'internet')

[('B28002_001E',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'Total:'),
 ('B28002_001M',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'Margin of Error for!!Total:'),
 ('B28002_002E',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'With an Internet subscription:'),
 ('B28002_002M',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'Margin of Error for!!With an Internet subscription:'),
 ('B28002_003E',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'With an Internet subscription:!!Dial-up alone'),
 ('B28002_003M',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'Margin of Error for!!With an Internet subscription:!!Dial-up alone'),
 ('B28002_004E',
  'B28002.  Presence and Types of Internet Subscriptions in Household',
  'With an Internet subscription:!!DSL:'),
 ('B28002_004M',
  'B28002.  Presence and Types of Internet Subscri

('B28009I_003E',
  'PRESENCE OF A COMPUTER AND TYPE OF INTERNET SUBSCRIPTION IN HOUSEHOLD (HISPANIC OR LATINO)',
  'Estimate!!Total!!Has a computer!!With dial-up Internet subscription alone'),
 ('B28009I_004E',
  'PRESENCE OF A COMPUTER AND TYPE OF INTERNET SUBSCRIPTION IN HOUSEHOLD (HISPANIC OR LATINO)',
  'Estimate!!Total!!Has a computer!!With a broadband Internet subscription'),
 ('B28009I_005E',
  'PRESENCE OF A COMPUTER AND TYPE OF INTERNET SUBSCRIPTION IN HOUSEHOLD (HISPANIC OR LATINO)',
  'Estimate!!Total!!Has a computer!!Without an Internet subscription'),

In [18]:
# you can grab tables if you know what you are looking for
censusdata.censustable(src = 'acs5', year = 2015, table = 'B28009')

Table not found!


ValueError: 

In [12]:
#and print them legibly
censusdata.printtable(censusdata.censustable(src = 'acs5', year = 2015, table = 'B15003'))

Variable     | Table                          | Label                                                    | Type 
-------------------------------------------------------------------------------------------------------------------
B15003_001E  | B15003.  Educational Attainmen | Total:                                                   | int  
B15003_002E  | B15003.  Educational Attainmen | No schooling completed                                   | int  
B15003_003E  | B15003.  Educational Attainmen | Nursery school                                           | int  
B15003_004E  | B15003.  Educational Attainmen | Kindergarten                                             | int  
B15003_005E  | B15003.  Educational Attainmen | 1st grade                                                | int  
B15003_006E  | B15003.  Educational Attainmen | 2nd grade                                                | int  
B15003_007E  | B15003.  Educational Attainmen | 3rd grade                                    

It comes with a `.geographies()` module

In [17]:
# geography object - * is wildcard
geo = censusdata.censusgeo(geo = [('state', '*')])
type(geo)

censusdata.censusgeo.censusgeo

In [18]:
# grab all the states in acs1 of year 2017
censusdata.geographies(within = geo, src='acs1', year=2017)

{'Alabama': censusgeo((('state', '01'),)),
 'Alaska': censusgeo((('state', '02'),)),
 'Arizona': censusgeo((('state', '04'),)),
 'Arkansas': censusgeo((('state', '05'),)),
 'California': censusgeo((('state', '06'),)),
 'Colorado': censusgeo((('state', '08'),)),
 'Connecticut': censusgeo((('state', '09'),)),
 'Delaware': censusgeo((('state', '10'),)),
 'District of Columbia': censusgeo((('state', '11'),)),
 'Florida': censusgeo((('state', '12'),)),
 'Georgia': censusgeo((('state', '13'),)),
 'Hawaii': censusgeo((('state', '15'),)),
 'Idaho': censusgeo((('state', '16'),)),
 'Illinois': censusgeo((('state', '17'),)),
 'Indiana': censusgeo((('state', '18'),)),
 'Iowa': censusgeo((('state', '19'),)),
 'Kansas': censusgeo((('state', '20'),)),
 'Kentucky': censusgeo((('state', '21'),)),
 'Louisiana': censusgeo((('state', '22'),)),
 'Maine': censusgeo((('state', '23'),)),
 'Maryland': censusgeo((('state', '24'),)),
 'Massachusetts': censusgeo((('state', '25'),)),
 'Michigan': censusgeo((('stat

In [23]:
# grab all counties in state X from acs1 in 2017
geo = censusdata.censusgeo([("state", "06"), ("county", "*")])
censusdata.geographies(within=geo, src = 'acs1', year = 2017)

{'Alameda County, California': censusgeo((('state', '06'), ('county', '001'))),
 'Butte County, California': censusgeo((('state', '06'), ('county', '007'))),
 'Contra Costa County, California': censusgeo((('state', '06'), ('county', '013'))),
 'El Dorado County, California': censusgeo((('state', '06'), ('county', '017'))),
 'Fresno County, California': censusgeo((('state', '06'), ('county', '019'))),
 'Humboldt County, California': censusgeo((('state', '06'), ('county', '023'))),
 'Imperial County, California': censusgeo((('state', '06'), ('county', '025'))),
 'Kern County, California': censusgeo((('state', '06'), ('county', '029'))),
 'Kings County, California': censusgeo((('state', '06'), ('county', '031'))),
 'Lake County, California': censusgeo((('state', '06'), ('county', '033'))),
 'Los Angeles County, California': censusgeo((('state', '06'), ('county', '037'))),
 'Madera County, California': censusgeo((('state', '06'), ('county', '039'))),
 'Marin County, California': censusgeo(

# Exercise

Internet variables

In [None]:
 ('B28011_002E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!With an Internet subscription'),
 ('B28011_003E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!With an Internet subscription!!Dial-up alone'),
 ('B28011_004E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!With an Internet subscription!!Broadband such as cable, fiber optic, or DSL'),
 ('B28011_005E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!With an Internet subscription!!Satellite Internet service'),
 ('B28011_006E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!With an Internet subscription!!Other service'),
 ('B28011_007E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!Internet access without a subscription'),
 ('B28011_008E',
  'INTERNET SUBSCRIPTIONS IN HOUSEHOLD',
  'Estimate!!Total!!No Internet access')

In [31]:
my_vars = ["B28011_002E",
           "B28011_003E",
           "B28011_004E",
           "B28011_005E",
           "B28011_006E",
           "B28011_007E",
           "B28011_008E",
          ]

Grab your geographies

In [32]:
geography_of_interest = [("state", "06")]
my_geo = censusdata.censusgeo(geo = geography_of_interest)

In [33]:
data = censusdata.download(src = 'acs1', year = 2017, geo = my_geo, var = my_vars)

In [34]:
data.head()

Unnamed: 0,B28011_002E,B28011_003E,B28011_004E,B28011_005E,B28011_006E,B28011_007E,B28011_008E
"California: Summary level: 040, state:06",11432470,31868,9672461,1018655,115040,293167,1279460


In [35]:
rename_vars = {
    "B28011_002E": 'internet access',
    "B28011_003E": 'dialup',
    "B28011_004E": 'broadband',
    "B28011_005E": 'satellite',
    "B28011_006E": 'other',
    "B28011_007E": 'no subscription',
    "B28011_008E": 'no internet access',
}

In [36]:
# need to do this because rename returns dataframe
data = data.rename(mapper = rename_vars, axis = 1)

In [37]:
data.head()

Unnamed: 0,internet access,dialup,broadband,satellite,other,no subscription,no internet access
"California: Summary level: 040, state:06",11432470,31868,9672461,1018655,115040,293167,1279460


In [39]:
data['broadband'] / (data['internet access'] + data['no internet access'])

California: Summary level: 040, state:06    0.760896
dtype: float64

# Download data

In [None]:
('B28009I_003E',
  'PRESENCE OF A COMPUTER AND TYPE OF INTERNET SUBSCRIPTION IN HOUSEHOLD (HISPANIC OR LATINO)',
  'Estimate!!Total!!Has a computer!!With dial-up Internet subscription alone'),
 ('B28009I_004E',
  'PRESENCE OF A COMPUTER AND TYPE OF INTERNET SUBSCRIPTION IN HOUSEHOLD (HISPANIC OR LATINO)',
  'Estimate!!Total!!Has a computer!!With a broadband Internet subscription'),
 ('B28009I_005E',
  'PRESENCE OF A COMPUTER AND TYPE OF INTERNET SUBSCRIPTION IN HOUSEHOLD (HISPANIC OR LATINO)',
  'Estimate!!Total!!Has a computer!!Without an Internet subscription'),

In [14]:
my_vars = ["B28009I_003E", "B28009I_004E", "B28009I_005E"]

Grab your geographies

In [15]:
geography_of_interest = [("state", "06")]
my_geo = censusdata.censusgeo(geo = geography_of_interest)

In [16]:
data = censusdata.download(src = 'acs1', year = 2017, geo = my_geo, var = my_vars)

In [17]:
data.head()

Unnamed: 0,B28009I_003E,B28009I_004E,B28009I_005E
"California: Summary level: 040, state:06",15944,13245600,1244285


In [19]:
rename_vars = {
    "B28009I_003E": "dialup", 
    "B28009I_004E": "broadband", 
    "B28009I_005E": "no internet",
}

In [20]:
# need to do this because rename returns dataframe
data = data.rename(mapper = rename_vars, axis = 1)

In [21]:
data.head()

Unnamed: 0,dialup,broadband,no internet
"California: Summary level: 040, state:06",15944,13245600,1244285


# Download data

In [29]:
censusdata.download?

[1;31mSignature:[0m
[0mcensusdata[0m[1;33m.[0m[0mdownload[0m[1;33m([0m[1;33m
[0m    [0msrc[0m[1;33m,[0m[1;33m
[0m    [0myear[0m[1;33m,[0m[1;33m
[0m    [0mgeo[0m[1;33m,[0m[1;33m
[0m    [0mvar[0m[1;33m,[0m[1;33m
[0m    [0mkey[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtabletype[0m[1;33m=[0m[1;34m'detail'[0m[1;33m,[0m[1;33m
[0m    [0mendpt[0m[1;33m=[0m[1;34m''[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Download data from Census API.

Args:
        src (str): Census data source: 'acs1' for ACS 1-year estimates, 'acs5' for ACS 5-year estimates, 'acs3' for
                ACS 3-year estimates, 'acsse' for ACS 1-year supplemental estimates, 'sf1' for SF1 data.
        year (int): Year of data.
        geo (censusgeo): Geographies for which to download data.
        var (list of str): Census variables to download.
        key (str, optional): Census API key.
        tabletype (str

In [16]:
data = censusdata.download(src = 'acs1', year = 2017, geo = my_geo, var = my_vars)

In [17]:
data.head()

Unnamed: 0,B28009I_003E,B28009I_004E,B28009I_005E
"California: Summary level: 040, state:06",15944,13245600,1244285


In [19]:
rename_vars = {
    "B28009I_003E": "dialup", 
    "B28009I_004E": "broadband", 
    "B28009I_005E": "no internet",
}

In [20]:
# need to do this because rename returns dataframe
data = data.rename(mapper = rename_vars, axis = 1)

In [21]:
data.head()

Unnamed: 0,dialup,broadband,no internet
"California: Summary level: 040, state:06",15944,13245600,1244285


In [22]:
data['total'] = data['dialup'] + data['broadband'] + data['no internet']

In [25]:
data['share of broadband'] = data['broadband'] / data['total']

In [26]:
data.head()

Unnamed: 0,dialup,broadband,no internet,total,share of broadband
"California: Summary level: 040, state:06",15944,13245600,1244285,14505829,0.913123


# Advanced data prepping

This data kind of works but ideally we would have something like this

| County |    State   | Summary level | State Code | County Code | Bachelors | Masters | Professional degree | Doctorates | Total | share of bachelors+ |
|:------:|:----------:|:-------------:|:----------:|:-----------:|:---------:|:-------:|:-------------------:|:----------:|:-----:|:-------------------:|
| Modoc  | California |       050     |     06     |     049     |     859   |   227   |          50         |      0     |  6852 |       0.165791      |
| Merced | California |       050     |     06     |     047     |   14006   |  5063   |        1337         |   1135     |156562 |       0.137588      |
| Lake   | California |       050     |     06     |     033     |    4588   |  1678   |         577         |    273     | 56386 |       0.153408      |
|Mariposa| California |       050     |     06     |     043     |    2017   |   933   |         253         |    132     | 13656 |       0.244215      |
|  Yuba  | California |       050     |     06     |     115     |    5573   |  1565   |         406         |     84     | 56582 |       0.163754      |

First step, `.reset_index()`

In [42]:
# this returns dataframe so we need to assign it back to itself
data = data.reset_index()

In [43]:
data.head()

Unnamed: 0,index,Bachelors,Masters,Professional degree,Doctorates,Total,share of bachelors+
0,"Modoc County, California: Summary level: 050, ...",859,227,50,0,6852,0.165791
1,"Merced County, California: Summary level: 050,...",14006,5063,1337,1135,156562,0.137588
2,"Lake County, California: Summary level: 050, s...",4588,1678,577,273,46386,0.153408
3,"Mariposa County, California: Summary level: 05...",2017,933,253,132,13656,0.244215
4,"Yuba County, California: Summary level: 050, s...",5573,1565,406,84,46582,0.163754


`.str` accessor - lets you apply string methods to pandas series - however our index is a `censusgeo` object

In [57]:
print(data['index'][0])
print(data['index'][0].hierarchy())

Modoc County, California: Summary level: 050, state:06> county:049
state> county


Save as string

In [58]:
data['index_as_string'] = data['index'].astype(str)

In [59]:
data.head()

Unnamed: 0,index,Bachelors,Masters,Professional degree,Doctorates,Total,share of bachelors+,index_as_string
0,"Modoc County, California: Summary level: 050, ...",859,227,50,0,6852,0.165791,"Modoc County, California: Summary level: 050, ..."
1,"Merced County, California: Summary level: 050,...",14006,5063,1337,1135,156562,0.137588,"Merced County, California: Summary level: 050,..."
2,"Lake County, California: Summary level: 050, s...",4588,1678,577,273,46386,0.153408,"Lake County, California: Summary level: 050, s..."
3,"Mariposa County, California: Summary level: 05...",2017,933,253,132,13656,0.244215,"Mariposa County, California: Summary level: 05..."
4,"Yuba County, California: Summary level: 050, s...",5573,1565,406,84,46582,0.163754,"Yuba County, California: Summary level: 050, s..."


In [68]:
# .str accessor
data['index_as_string'].str.split(":").str[0]

0               Modoc County, California
1              Merced County, California
2                Lake County, California
3            Mariposa County, California
4                Yuba County, California
5        Contra Costa County, California
6                Inyo County, California
7              Lassen County, California
8          Stanislaus County, California
9       Santa Barbara County, California
10             Sonoma County, California
11             Tehama County, California
12           Imperial County, California
13               Mono County, California
14         San Benito County, California
15            Alameda County, California
16         Sacramento County, California
17          El Dorado County, California
18               Napa County, California
19          San Diego County, California
20           Monterey County, California
21      San Francisco County, California
22             Sierra County, California
23             Tulare County, California
24              

### now we know somethings already 
1. these are all in California so `State` and `State Code` are 'California' and '06' respectively for all of them.
2. Summary Level is the same for all: 050

So what we really need is `County` and `County Code`

In [73]:
# rough way 
data['index_as_string'].str.split("County").str[0]

0               Modoc 
1              Merced 
2                Lake 
3            Mariposa 
4                Yuba 
5        Contra Costa 
6                Inyo 
7              Lassen 
8          Stanislaus 
9       Santa Barbara 
10             Sonoma 
11             Tehama 
12           Imperial 
13               Mono 
14         San Benito 
15            Alameda 
16         Sacramento 
17          El Dorado 
18               Napa 
19          San Diego 
20           Monterey 
21      San Francisco 
22             Sierra 
23             Tulare 
24               Yolo 
25           Humboldt 
26             Nevada 
27             Alpine 
28             Madera 
29          Mendocino 
30         Santa Cruz 
31            Trinity 
32        Los Angeles 
33          Riverside 
34     San Bernardino 
35        Santa Clara 
36              Marin 
37             Placer 
38           Siskiyou 
39             Shasta 
40             Solano 
41          Del Norte 
42             Colusa 
43         

In [75]:
data['index_as_string'].str.split("county").str[-1].str.replace(":", "")

0     049
1     047
2     033
3     043
4     115
5     013
6     027
7     035
8     099
9     083
10    097
11    103
12    025
13    051
14    069
15    001
16    067
17    017
18    055
19    073
20    053
21    075
22    091
23    107
24    113
25    023
26    057
27    003
28    039
29    045
30    087
31    105
32    037
33    065
34    071
35    085
36    041
37    061
38    093
39    089
40    095
41    015
42    011
43    021
44    007
45    031
46    063
47    079
48    019
49    029
50    059
51    111
52    009
53    101
54    081
55    109
56    077
57    005
Name: index_as_string, dtype: object

Pandas split has `expand` option

examine first observation

In [80]:
data['index_as_string'][0]

'Modoc County, California: Summary level: 050, state:06> county:049'

The county name is right before "County,"

In [86]:
# gotta chain .str accessor every time you need to apply a string method to a series
data['index_as_string'].str.split("County,").str[0].str.strip()

0               Modoc
1              Merced
2                Lake
3            Mariposa
4                Yuba
5        Contra Costa
6                Inyo
7              Lassen
8          Stanislaus
9       Santa Barbara
10             Sonoma
11             Tehama
12           Imperial
13               Mono
14         San Benito
15            Alameda
16         Sacramento
17          El Dorado
18               Napa
19          San Diego
20           Monterey
21      San Francisco
22             Sierra
23             Tulare
24               Yolo
25           Humboldt
26             Nevada
27             Alpine
28             Madera
29          Mendocino
30         Santa Cruz
31            Trinity
32        Los Angeles
33          Riverside
34     San Bernardino
35        Santa Clara
36              Marin
37             Placer
38           Siskiyou
39             Shasta
40             Solano
41          Del Norte
42             Colusa
43              Glenn
44              Butte
45        

In [87]:
# save it as a colum
data['County'] = data['index_as_string'].str.split("County,").str[0].str.strip()

In [88]:
data.head()

Unnamed: 0,index,Bachelors,Masters,Professional degree,Doctorates,Total,share of bachelors+,index_as_string,County
0,"Modoc County, California: Summary level: 050, ...",859,227,50,0,6852,0.165791,"Modoc County, California: Summary level: 050, ...",Modoc
1,"Merced County, California: Summary level: 050,...",14006,5063,1337,1135,156562,0.137588,"Merced County, California: Summary level: 050,...",Merced
2,"Lake County, California: Summary level: 050, s...",4588,1678,577,273,46386,0.153408,"Lake County, California: Summary level: 050, s...",Lake
3,"Mariposa County, California: Summary level: 05...",2017,933,253,132,13656,0.244215,"Mariposa County, California: Summary level: 05...",Mariposa
4,"Yuba County, California: Summary level: 050, s...",5573,1565,406,84,46582,0.163754,"Yuba County, California: Summary level: 050, s...",Yuba


In [91]:
# same with county code
data['index_as_string'].str.split("county:").str[-1].str.strip()

0     049
1     047
2     033
3     043
4     115
5     013
6     027
7     035
8     099
9     083
10    097
11    103
12    025
13    051
14    069
15    001
16    067
17    017
18    055
19    073
20    053
21    075
22    091
23    107
24    113
25    023
26    057
27    003
28    039
29    045
30    087
31    105
32    037
33    065
34    071
35    085
36    041
37    061
38    093
39    089
40    095
41    015
42    011
43    021
44    007
45    031
46    063
47    079
48    019
49    029
50    059
51    111
52    009
53    101
54    081
55    109
56    077
57    005
Name: index_as_string, dtype: object

In [92]:
data['County Code'] = data['index_as_string'].str.split("county:").str[-1].str.strip()

In [93]:
data.head()

Unnamed: 0,index,Bachelors,Masters,Professional degree,Doctorates,Total,share of bachelors+,index_as_string,County,County Code
0,"Modoc County, California: Summary level: 050, ...",859,227,50,0,6852,0.165791,"Modoc County, California: Summary level: 050, ...",Modoc,49
1,"Merced County, California: Summary level: 050,...",14006,5063,1337,1135,156562,0.137588,"Merced County, California: Summary level: 050,...",Merced,47
2,"Lake County, California: Summary level: 050, s...",4588,1678,577,273,46386,0.153408,"Lake County, California: Summary level: 050, s...",Lake,33
3,"Mariposa County, California: Summary level: 05...",2017,933,253,132,13656,0.244215,"Mariposa County, California: Summary level: 05...",Mariposa,43
4,"Yuba County, California: Summary level: 050, s...",5573,1565,406,84,46582,0.163754,"Yuba County, California: Summary level: 050, s...",Yuba,115


In [95]:
# the rest of the columns are easy to create
data['State'] = 'California'
data['State Code'] = '06'
data['Summary Level'] = '050'

Pandas creates a series with all values = to the string you pass

In [96]:
data.head()

Unnamed: 0,index,Bachelors,Masters,Professional degree,Doctorates,Total,share of bachelors+,index_as_string,County,County Code,State,State Code,Summary Level
0,"Modoc County, California: Summary level: 050, ...",859,227,50,0,6852,0.165791,"Modoc County, California: Summary level: 050, ...",Modoc,49,California,6,50
1,"Merced County, California: Summary level: 050,...",14006,5063,1337,1135,156562,0.137588,"Merced County, California: Summary level: 050,...",Merced,47,California,6,50
2,"Lake County, California: Summary level: 050, s...",4588,1678,577,273,46386,0.153408,"Lake County, California: Summary level: 050, s...",Lake,33,California,6,50
3,"Mariposa County, California: Summary level: 05...",2017,933,253,132,13656,0.244215,"Mariposa County, California: Summary level: 05...",Mariposa,43,California,6,50
4,"Yuba County, California: Summary level: 050, s...",5573,1565,406,84,46582,0.163754,"Yuba County, California: Summary level: 050, s...",Yuba,115,California,6,50


In [106]:
# select a group of columns
columns_im_interested_in = ['State', 'State Code', 'County', 'County Code', 'Summary Level', 'share of bachelors+']
data[columns_im_interested_in].head()

Unnamed: 0,State,State Code,County,County Code,Summary Level,share of bachelors+
0,California,6,Modoc,49,50,0.165791
1,California,6,Merced,47,50,0.137588
2,California,6,Lake,33,50,0.153408
3,California,6,Mariposa,43,50,0.244215
4,California,6,Yuba,115,50,0.163754


save that to a clean dataframe

In [107]:
clean_data = data[columns_im_interested_in]

# .copy() vs subset

Right now, clean_data is "pointing" (is a `view`) to a subset of data so any changes you do to clean_data aren't _guaranteed_ to work. sometimes they do sometimes they don't. So you should create a .copy() of the data if you plan on modifying it after.

In [108]:
clean_data['State'] = 'CA'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [109]:
clean_data = data[columns_im_interested_in].copy()

In [110]:
clean_data['State'] = 'CA'

No warning

In [111]:
clean_data.head()

Unnamed: 0,State,State Code,County,County Code,Summary Level,share of bachelors+
0,CA,6,Modoc,49,50,0.165791
1,CA,6,Merced,47,50,0.137588
2,CA,6,Lake,33,50,0.153408
3,CA,6,Mariposa,43,50,0.244215
4,CA,6,Yuba,115,50,0.163754


Save data

In [112]:
clean_data.to_csv("../data/interim/censusdata_example.csv", index = False,)