## This notebook handles making a subset of Census data
## It also handles creating a conversion file between srprec and Census-tract 

In [65]:
# imports
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [66]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [67]:
data18 = pd.read_pickle('./data/trend18.pkl')
data18.shape

(1334, 26)

In [68]:
data18.head()

Unnamed: 0,county,srprec,cddist,TOTREG,TOTVOTE,CNGDEM01,CNGREP01,election,type,totreg_r,...,nlp,grn,ref,dcl,male,female,hispdem,hisprep,hispdcl,hispoth
0,30,10316,47,1735,1278,630,592,g18,V,1238.0,...,0.0,2.0,0.0,265.0,582.0,656.0,55.0,43.0,30.0,6.0
1,30,10317,47,2079,1214,638,513,g18,V,1184.0,...,0.0,4.0,0.0,286.0,588.0,596.0,165.0,60.0,71.0,10.0
2,30,10319,47,1448,999,495,462,g18,V,945.0,...,0.0,2.0,2.0,256.0,443.0,502.0,68.0,37.0,49.0,5.0
3,30,10320,47,2407,1686,847,758,g18,V,1644.0,...,0.0,5.0,1.0,423.0,851.0,793.0,79.0,57.0,47.0,7.0
4,30,10325,47,1465,1035,488,524,g18,V,1012.0,...,0.0,2.0,1.0,189.0,488.0,524.0,67.0,38.0,18.0,1.0


In [69]:
data18['cddist'].value_counts()

45    347
48    334
39    226
46    198
47    133
49     87
38      9
Name: cddist, dtype: int64

### Select desired columns from Census data for a subset
- Voting data contains 'srprec' or voting precints.
- Statewidedatabase.org provides conversion tables between precints and census tracts.

#### Bring in census DP03 data and select columns.

In [70]:
dp03_18 = pd.read_pickle('./census_data/DP03_clean.pkl')
dp03_18.shape

(583, 143)

In [71]:
dp03_18.head(3)

Unnamed: 0,Geographic Area Name,GEO_ID,Estimate EMPLOYMENT STATUS Population 16 years and over,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces,Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force,Estimate EMPLOYMENT STATUS Civilian labor force,...,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people Under 18 years Related children of the householder under 18 years Related children of the householder 5 to 17 years,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people 18 years and over,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people 18 years and over 18 to 64 years,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people 18 years and over 65 years and over,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL People in families,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL Unrelated individuals 15 years and over,Geographic Area Name.1,state,county,tract
1,"Census Tract 626.44, Orange County, California",1400000US06059062644,6767,3967,3967,3811,156,0,2800,3967,...,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,"Census Tract 626.44, Orange County, California",6,59,62644
2,"Census Tract 626.40, Orange County, California",1400000US06059062640,3062,2446,2436,2320,116,10,616,2436,...,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,"Census Tract 626.40, Orange County, California",6,59,62640
3,"Census Tract 630.08, Orange County, California",1400000US06059063008,833,431,431,412,19,0,402,431,...,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,"Census Tract 630.08, Orange County, California",6,59,63008


In [72]:
dp03_18['tract'] = dp03_18['tract'].astype('int64')

In [73]:
dp03_18['tract'].dtype

dtype('int64')

In [74]:
# to select columns, read a (very long) list of column names, and select which to keep.
# helpful to copy into a .py file named 'variables' for future reference

In [75]:
#uncomment in order to see full list of columns names

# dp03_18.columns.tolist()

['Geographic Area Name',
 'GEO_ID',
 'Estimate EMPLOYMENT STATUS Population 16 years and over',
 'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force',
 'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force',
 'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed',
 'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed',
 'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces',
 'Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force',
 'Estimate EMPLOYMENT STATUS Civilian labor force',
 'Estimate EMPLOYMENT STATUS Civilian labor force Unemployment Rate',
 'Estimate EMPLOYMENT STATUS Females 16 years and over',
 'Estimate EMPLOYMENT STATUS Females 16 years and over In labor force',
 'Estimate EMPLOYMENT STATUS Females 16 years and over In labor force Civilian labor force',
 'Estimate

In [76]:
dp03_cols = ['Geographic Area Name',
             'GEO_ID',
             'tract',
             'Estimate EMPLOYMENT STATUS Population 16 years and over',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force',
             'Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force', 
             'Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Service occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations',
             'Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars)'
            ]

In [77]:
dp03_18sub = dp03_18[dp03_cols]

In [78]:
dp03_18sub.head(3)

Unnamed: 0,Geographic Area Name,Geographic Area Name.1,GEO_ID,tract,Estimate EMPLOYMENT STATUS Population 16 years and over,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force,Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force,"Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations",Estimate OCCUPATION Civilian employed population 16 years and over Service occupations,Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations,"Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations",Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars)
1,"Census Tract 626.44, Orange County, California","Census Tract 626.44, Orange County, California",1400000US06059062644,62644,6767,3967,2800,2547,262,819,91,146953
2,"Census Tract 626.40, Orange County, California","Census Tract 626.40, Orange County, California",1400000US06059062640,62640,3062,2446,616,1202,287,563,63,84632
3,"Census Tract 630.08, Orange County, California","Census Tract 630.08, Orange County, California",1400000US06059063008,63008,833,431,402,255,22,104,0,100396


In [79]:
dp03_18sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583 entries, 1 to 583
Data columns (total 12 columns):
 #   Column                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                           --------------  ----- 
 0   Geographic Area Name                                                                                                             583 non-null    object
 1   Geographic Area Name                                                                                                             583 non-null    object
 2   GEO_ID                                                                                                                           583 non-null    object
 3   tract                                                                                                    

In [81]:
# change multiple columns to numeric

num_cols = ['Estimate EMPLOYMENT STATUS Population 16 years and over',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force',
             'Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force', 
             'Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Service occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations',
             'Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars)']

In [83]:
for col in num_cols:
    dp03_18sub[col] = pd.to_numeric(dp03_18sub[col])

# will generate a warning (not an error) about making a change a copy of a slide of a dataframe.  
# we will just proceed.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp03_18sub[col] = pd.to_numeric(dp03_18sub[col])


In [84]:
dp03_18sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583 entries, 1 to 583
Data columns (total 12 columns):
 #   Column                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                           --------------  ----- 
 0   Geographic Area Name                                                                                                             583 non-null    object
 1   Geographic Area Name                                                                                                             583 non-null    object
 2   GEO_ID                                                                                                                           583 non-null    object
 3   tract                                                                                                    

In [86]:
pd.set_option('display.max_rows', 10)
dp03_18sub['tract'].value_counts()

88702     1
21822     1
21802     1
52521     1
32040     1
         ..
86601     1
110109    1
99906     1
1601      1
52506     1
Name: tract, Length: 583, dtype: int64

In [87]:
#save subset of Census data to pickle file for enhancing voter data, and for modeling
dp03_18sub.to_pickle('./census_data/DP03_subset.pkl')

### Conversion file:  srprec < > census tract

Conversion file (`.csv` fomat) from [Statewidedatabase.org](https://statewidedatabase.org/d10/g18_geo_conv.html) geographic conversion page.  

Orange County, right-most column, download 'SRPREC TO 2010 BLK' csv option, [link to the 2018 voting precint file](https://statewidedatabase.org/pub/data/G18/c059/c059_g18_sr_blk_map.csv).

In [88]:
convert18 = pd.read_csv('./map_files/c059_g18_sr_blk_map.csv')
convert18.shape

(22985, 8)

#### explore dataset

In [89]:
convert18.head()

# most relevant columns are ['sprec', 'tract', 'block', 'pctsrprec']

Unnamed: 0,srprec,tract,block,blkreg,srtotreg,pctsrprec,blktotreg,pctblk
0,,0,0,20,,,20,100.0
1,10316.0,110106,2000,11,1720.0,0.639535,11,100.0
2,10316.0,110106,2001,15,1720.0,0.872093,15,100.0
3,10316.0,110106,2002,62,1720.0,3.604651,62,100.0
4,10316.0,110106,2003,10,1720.0,0.581395,21,47.619048


In [23]:
convert18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22985 entries, 0 to 22984
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   srprec     22984 non-null  float64
 1   tract      22985 non-null  int64  
 2   block      22985 non-null  int64  
 3   blkreg     22985 non-null  int64  
 4   srtotreg   22984 non-null  float64
 5   pctsrprec  22984 non-null  float64
 6   blktotreg  22985 non-null  int64  
 7   pctblk     22985 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 1.4 MB


In [24]:
convert18['tract'].nunique()

583

In [25]:
convert18['srprec'].nunique()

1334

#### confirm that Vote data (data18 df) and Conversion file (convert18 df) have the same values for 'srprec'

In [26]:
convert18['srprec'].sort_values()

3341      2001.0
3371      2001.0
3370      2001.0
3369      2001.0
3368      2001.0
          ...   
22957    75702.0
22956    75702.0
22970    75702.0
22960    75702.0
0            NaN
Name: srprec, Length: 22985, dtype: float64

In [27]:
data18['srprec'].sort_values()

179      2001
180      2002
181      2008
182      2009
183      2011
        ...  
1540    75116
1541    75117
1542    75122
1543    75701
1544    75702
Name: srprec, Length: 1334, dtype: Int64

#### understand how srprec, tract, block, and pctsrprec relate

In [28]:
convert18.loc[convert18['srprec'] == 2001]['tract'].unique()

# precinct 2001 has 3 tracts

array([11602, 86501, 86502])

In [29]:
convert18.loc[convert18['srprec'] == 2001]

# precinct 2001 has 31 blocks

Unnamed: 0,srprec,tract,block,blkreg,srtotreg,pctsrprec,blktotreg,pctblk
3341,2001.0,11602,1038,1,679.0,0.147275,1,100.0
3342,2001.0,11602,2025,33,679.0,4.860088,33,100.0
3343,2001.0,11602,2026,12,679.0,1.767305,12,100.0
3344,2001.0,11602,2028,24,679.0,3.53461,24,100.0
3345,2001.0,11602,2032,3,679.0,0.441826,3,100.0
3346,2001.0,11602,2042,18,679.0,2.650957,18,100.0
3347,2001.0,11602,2044,3,679.0,0.441826,3,100.0
3348,2001.0,11602,2045,2,679.0,0.294551,2,100.0
3349,2001.0,11602,2047,20,679.0,2.945508,20,100.0
3350,2001.0,11602,2048,30,679.0,4.418262,30,100.0


In [90]:
convert18.loc[convert18['tract'] == 11602]

Unnamed: 0,srprec,tract,block,blkreg,srtotreg,pctsrprec,blktotreg,pctblk
441,13031.0,11602,1035,1,6.0,16.666667,1,100.0
442,13031.0,11602,1037,2,6.0,33.333333,2,100.0
443,13031.0,11602,1049,3,6.0,50.000000,3,100.0
566,13043.0,11602,3014,95,1813.0,5.239934,95,100.0
567,13043.0,11602,3015,92,1813.0,5.074462,92,100.0
...,...,...,...,...,...,...,...,...
3824,2117.0,11602,1061,1,2519.0,0.039698,1,100.0
3825,2117.0,11602,1062,37,2519.0,1.468837,37,100.0
3826,2117.0,11602,1063,34,2519.0,1.349742,34,100.0
3827,2117.0,11602,1068,49,2519.0,1.945216,49,100.0


#### NOTE:  Many-to-many relationship between srprec-tract.  
- Each precint contains multiple tracts AND 
- each tract can apply to multiple precints

#### Important to maintain srprec as reference column for connecting vote data and to detailed mapping.

#### Important to maintain pctsrprec at tract level for applying weighted census-tract data.
- `.groupby()` operations force a summation of sprec or tract.  
- THUS need to aggregate by each precint individually into a new dataframe.  

In [31]:
convert18.groupby('srprec').sum().head()

Unnamed: 0_level_0,tract,block,blkreg,srtotreg,pctsrprec,blktotreg,pctblk
srprec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001.0,1183559,50994,679,21049.0,100.0,679,3100.0
2002.0,82040,11039,291,2037.0,100.0,291,700.0
2008.0,2774464,70230,2304,73728.0,100.0,2304,3200.0
2009.0,2526444,54257,1441,40348.0,100.0,2006,2663.116825
2011.0,306670,23144,1105,15470.0,100.0,1264,1324.082935


### create a weighted tract-to-srprec conversion file
#### test using one precinct only

In [93]:
pr002001 = convert18.loc[convert18['srprec'] == 2001].groupby('tract', as_index=False).sum()
# for precinct 2001, one tract represents 56% of the population, the other two represent less.
# but no preponderance of representation
# theoretically, these weights could be applied to census-tract data, then attached to srprec
# said differently ... vote results could be mapped to precint directly ...
# ... and census-tract data could be mapped with the pctsrprec applied

pr002001

Unnamed: 0,tract,srprec,block,blkreg,srtotreg,pctsrprec,blktotreg,pctblk
0,11602,40020.0,39911,254,13580.0,37.407953,254,2000.0
1,86501,6003.0,3009,47,2037.0,6.921944,47,300.0
2,86502,16008.0,8074,378,5432.0,55.670103,378,800.0


In [94]:
# create a column which captures 'srprec' for use later
pr002001['srprec_orig'] = np.full_like(pr002001.shape[0], 2001)

In [95]:
pr002001.rename(columns={'pctsrprec':'pctsrprec_tract'}, inplace=True)

In [96]:
pr002001 = pr002001[['tract', 'pctsrprec_tract', 'srprec_orig']]

In [97]:
pr002001

Unnamed: 0,tract,pctsrprec_tract,srprec_orig
0,11602,37.407953,2001
1,86501,6.921944,2001
2,86502,55.670103,2001


#### test on a precinct with only one tract, just in case

In [37]:
pr002002 = convert18.loc[convert18['srprec'] == 2002].groupby('tract', as_index=False).sum()
pr002002

Unnamed: 0,tract,srprec,block,blkreg,srtotreg,pctsrprec,blktotreg,pctblk
0,11720,14014.0,11039,291,2037.0,100.0,291,700.0


In [38]:
pr002002['srprec_orig'] = np.full_like(pr002002.shape[0], 2002)

In [39]:
pr002002.rename(columns={'pctsrprec':'pctsrprec_tract'}, inplace=True)

In [40]:
pr002002 = pr002002[['tract', 'pctsrprec_tract', 'srprec_orig']]

In [41]:
pr002002

Unnamed: 0,tract,pctsrprec_tract,srprec_orig
0,11720,100.0,2002


#### add together into one dataframe

In [42]:
df = pd.concat([pr002001, pr002002])

In [43]:
df

Unnamed: 0,tract,pctsrprec_tract,srprec_orig
0,11602,37.407953,2001
1,86501,6.921944,2001
2,86502,55.670103,2001
0,11720,100.0,2002


### for-loop to create tract aggregates for each precinct

In [44]:
precincts_all = convert18['srprec'].unique().tolist()[1:]

In [45]:
len(precincts_all)

1334

In [46]:
'''for-loop to calculate the tract's percent composition of the voting precinct ('srprec').  
while maintaining the original srprec ID number (avoiding addition)
    these weights will be applied to census-tract data to the voting dataset to provide a weighted result of the data'''

# precincts = [2001, 2002, 75116]
df_convert_subidx = pd.DataFrame()

for prec in precincts_all:
    
    #1. select srprec, group by tract to calculate tract % composition of srprec
    df_prec = convert18.loc[convert18['srprec'] == prec].groupby('tract', as_index=False).sum()
    
    #2. add column to identify original srprec, since it is washed away by groupby().sum()
    df_prec['srprec_orig'] = np.full_like(df_prec.shape[0], prec)

    #3. rename pctsrprec column to reflect that it pertains to the tract, not the block
    df_prec.rename(columns={'pctsrprec':'pctsrprec_tract'}, inplace=True)

    #4. define the df to contain only the needed columns, remove confusing colums 
    df_prec = df_prec[['srprec_orig', 'tract', 'pctsrprec_tract']]
    
    #5. concat the new df
    df_convert_subidx = pd.concat([df_prec, df_convert_subidx])


In [47]:
df_convert_subidx.head()

Unnamed: 0,srprec_orig,tract,pctsrprec_tract
0,7701,110304,100.0
0,75702,75809,97.095134
1,75702,75814,2.904866
0,75701,75810,100.0
0,75122,75810,100.0


In [48]:
df_convert_subidx.shape

(2011, 3)

In [59]:
pd.set_option('display.max_rows', None)
df_convert_subidx['srprec_orig'].value_counts().value_counts()

# a little over half precincts are made up of just 1 tract (60%).
# however, many precincts straddle 2 or more tracts, up to 5 tracts.  

1    799
2    410
3    110
4     13
5      2
Name: srprec_orig, dtype: int64

In [50]:
df_convert = df_convert_subidx.reset_index()

In [412]:
pd.set_option('display.max_rows', 20)
df_convert.rename(columns={'index':'subindex'}, inplace=True)

In [411]:
df_convert.head(15)

Unnamed: 0,subindex,srprec_orig,tract,pctsrprec_tract
0,0,7701,110304,100.0
1,0,75702,75809,97.095134
2,1,75702,75814,2.904866
3,0,75701,75810,100.0
4,0,75122,75810,100.0
5,0,75117,75810,100.0
6,0,75116,75809,70.290635
7,1,75116,75810,21.097955
8,2,75116,75811,6.243272
9,3,75116,75813,1.937567


#### save to pickle file

In [429]:
df_convert.to_pickle('./data/df_convert18.pkl')

### test merging the dataframes
The full merge is performed in 27_combine_census_data page.

#### a. set up mini-frames

In [508]:
a = data18.loc[data18['srprec'] == 75702]
a

Unnamed: 0,county,srprec,cddist,TOTREG,TOTVOTE,CNGDEM01,CNGREP01,election,type,totreg_r,...,nlp,grn,ref,dcl,male,female,hispdem,hisprep,hispdcl,hispoth
1544,30,75702,45,1382,1125,319,784,g18,V,1089.0,...,0.0,0.0,0.0,199.0,562.0,527.0,15.0,20.0,16.0,5.0


In [509]:
b = dp03_18.loc[dp03_18['tract'] == 75809]
b = b[['Estimate EMPLOYMENT STATUS Population 16 years and over', 'tract']]
b

Unnamed: 0,Estimate EMPLOYMENT STATUS Population 16 years and over,tract
155,2486,75809


In [510]:
c = df_convert.loc[df_convert['srprec_orig'] == 75702]
c

Unnamed: 0,subindex,srprec_orig,tract,pctsrprec_tract
1,0,75702,75809,97.095134
2,1,75702,75814,2.904866


In [None]:
# anchor df is df_convert (c)

# change to converted_18, to preserve the df_convert 

# add vote data to converted_18
    # pd.merge() converted_18 with data18 on srprec/srprec_orig --> will pandas fill in rows multiple times?
    # c['srprec_orig'] type = int64
    # a['srprec'] type = Int64  (capital I)
    
# add census data to converted_18
    # pd.merge() converted_18 with dp03_test on tract --> will pandas fill in rows multiple times?
    # there will be errors where census data srprec
    
# create new col that multiplies ['pctsrprec_tract'] * ['census col']

#### b. try merging vote data (a) onto anchor df, converter table (c)

In [456]:
# pd.merge(left=c, right=a, how='outer', on=['srprec_orig', 'srprec'], indicator=True, suffixes=('_c', '_a'))  

# gave an error  KeyError: 'srprec_orig'.  Also # Check for duplicates

In [511]:
test = pd.merge(left=c, right=a, how='outer', left_on=['srprec_orig'], right_on=['srprec'], indicator=True)

In [512]:
test

Unnamed: 0,subindex,srprec_orig,tract,pctsrprec_tract,county,srprec,cddist,TOTREG,TOTVOTE,CNGDEM01,...,grn,ref,dcl,male,female,hispdem,hisprep,hispdcl,hispoth,_merge
0,0,75702,75809,97.095134,30,75702,45,1382,1125,319,...,0.0,0.0,199.0,562.0,527.0,15.0,20.0,16.0,5.0,both
1,1,75702,75814,2.904866,30,75702,45,1382,1125,319,...,0.0,0.0,199.0,562.0,527.0,15.0,20.0,16.0,5.0,both


In [513]:
test.rename(columns={'_merge':'prec_merge'}, inplace=True)

#### c. try merging census data (b) onto anchor df, converter table (c)

In [514]:
# pd.merge(left=c, right=b, how='outer', left_on=['tract'], right_on=['tract'], indicator=True, suffixes=('_c', '_b'))
test = pd.merge(left=test, right=b, how='outer', left_on=['tract'], right_on=['tract'], indicator=True)

In [515]:
test

Unnamed: 0,subindex,srprec_orig,tract,pctsrprec_tract,county,srprec,cddist,TOTREG,TOTVOTE,CNGDEM01,...,dcl,male,female,hispdem,hisprep,hispdcl,hispoth,prec_merge,Estimate EMPLOYMENT STATUS Population 16 years and over,_merge
0,0,75702,75809,97.095134,30,75702,45,1382,1125,319,...,199.0,562.0,527.0,15.0,20.0,16.0,5.0,both,2486.0,both
1,1,75702,75814,2.904866,30,75702,45,1382,1125,319,...,199.0,562.0,527.0,15.0,20.0,16.0,5.0,both,,left_only


In [523]:
test.dropna(inplace=True)

In [526]:
test['Estimate EMPLOYMENT STATUS Population 16 years and over'] = test['Estimate EMPLOYMENT STATUS Population 16 years and over'].astype('int64')

In [527]:
test['test'] = test['pctsrprec_tract']/100 * test['Estimate EMPLOYMENT STATUS Population 16 years and over']

In [528]:
test

Unnamed: 0,subindex,srprec_orig,tract,pctsrprec_tract,county,srprec,cddist,TOTREG,TOTVOTE,CNGDEM01,...,male,female,hispdem,hisprep,hispdcl,hispoth,prec_merge,Estimate EMPLOYMENT STATUS Population 16 years and over,_merge,test
0,0,75702,75809,97.095134,30,75702,45,1382,1125,319,...,562.0,527.0,15.0,20.0,16.0,5.0,both,2486,both,2413.78504


#### compare conversion fields

In [98]:
data18[['srprec', 'cddist']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1334 entries, 0 to 1545
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   srprec  1334 non-null   Int64
 1   cddist  1334 non-null   int64
dtypes: Int64(1), int64(1)
memory usage: 32.6 KB


In [99]:
convert18[['srprec', 'tract', 'block']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22985 entries, 0 to 22984
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   srprec  22984 non-null  float64
 1   tract   22985 non-null  int64  
 2   block   22985 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 538.8 KB


In [100]:
dp03_18sub[['tract']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583 entries, 1 to 583
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   tract   583 non-null    int64
dtypes: int64(1)
memory usage: 9.1 KB
