In [63]:
# Common imports

import pandas as pd
import numpy as np

In [64]:
PATH_TO_FAR_ATLAS = 'data/far_atlas.csv'
PATH_TO_FVFB_DATA = 'data/fvfb_data.csv'

In [65]:
def load_data_locally():
    # as opposed to using Colab
    return pd.read_csv(PATH_TO_FAR_ATLAS), pd.read_csv(PATH_TO_FVFB_DATA)

In [66]:
df_fa, df_fvfb = load_data_locally()

In [67]:
def modify_fvfb(df):
    # Only get 2019 data
    df = df[df['Year'] == 2019]

    # Drop geopoint
    df.drop(columns=['Geopoint'])

    # Capitalize County
    capitalized = df['Locality'].apply(lambda x: x.title())
    df['Locality'] = capitalized

    return df

df_fvfb = modify_fvfb(df_fvfb)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Locality'] = capitalized


In [68]:
def modify_fa(df):
    # Only get VA data
    df = df[df['State'] == 'Virginia']

    # Drop Census Tract
    df.drop(columns=['CensusTract'])

    # Capitalize County
    capitalized = df['County'].apply(lambda x: x.title())
    df['County'] = capitalized

    return df

df_fa = modify_fa(df_fa)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['County'] = capitalized


In [69]:
fa_county_set = set(df_fa['County'].values)
fvfb_county_set = set(df_fvfb['Locality'].values)
county_intersection = fa_county_set.intersection(fvfb_county_set)
counties_only_in_fa = fa_county_set - county_intersection
counties_only_in_fvfb = fvfb_county_set - county_intersection

print('We have %d final counties, with only %d counties left out.' % (len(county_intersection), len(counties_only_in_fa) + len(counties_only_in_fvfb)))
print('Counties only in Far Atlas: %s' % counties_only_in_fa)
print('Counties only in VA dataset: %s' % counties_only_in_fvfb)

We have 130 final counties, with only 5 counties left out.
Counties only in Far Atlas: {'Fairfax City', 'Falls Church City', 'Manassas Park City'}
Counties only in VA dataset: {'King And Queen', 'Washington Dc'}


In [70]:
df_fa['PovertyRate'].value_counts()

0.0     22
1.8     18
2.3     18
7.9     17
3.2     17
        ..
31.9     1
43.9     1
22.2     1
27.0     1
22.1     1
Name: PovertyRate, Length: 367, dtype: int64

In [78]:
def partition_fa_attribs(df_fa):
    # TODO: Figure out why POP2010 changed to Pop2010

    unwanted_attribs = ['State', 'CensusTract', 'County'] 
    fa_attribs_all = [attrib for attrib in df_fa.columns if attrib not in unwanted_attribs]
    print(fa_attribs_all)
    fa_attribs_to_sum = ['Pop2010','OHU2010','NUMGQTRS','LAPOP1_10','LAPOP05_10','LAPOP1_20','LALOWI1_10','LALOWI05_10','LALOWI1_20']
    fa_attribs_flag = ['Urban','GroupQuartersFlag','LILATracts_1And10','LILATracts_halfAnd10','LILATracts_1And20','LILATracts_Vehicle',
                   'HUNVFlag','LowIncomeTracts','LA1and10','LAhalfand10','LA1and20','LATracts_half','LATracts1','LATracts10',
                   'LATracts20','LATractsVehicle_20']
    fa_attribs_to_avg = ['PCTGQTRS','PovertyRate','MedianFamilyIncome'] + fa_attribs_flag
    for attrib in fa_attribs_all:
        if attrib.startswith('Tract'):
            fa_attribs_to_sum.append(attrib)
        if attrib.endswith('share'):
            fa_attribs_to_avg.append(attrib)
            fa_attribs_to_sum.append(attrib[:-5])
            # For debugging, can delete after it works:
            if attrib[:-5] not in fa_attribs_all:
                raise Exception('%s should be an attrib in fa to sum' % attrib[:-5])
            
    intersection = set(fa_attribs_to_sum).union(set(fa_attribs_to_avg)).intersection(set(fa_attribs_all))
            
    if not intersection:
        print(set(fa_attribs_all) - set(fa_attribs_to_sum).union(set(fa_attribs_to_avg)))
        raise Exception('We missed these attribs in fa:', intersection)

    return fa_attribs_all, fa_attribs_to_sum, fa_attribs_to_avg, fa_attribs_flag

fa_attribs_all, fa_attribs_to_sum, fa_attribs_to_avg, fa_attribs_flag = partition_fa_attribs(df_fa)


def add_df_fvfb(df, df_fvfb):
    return df


def add_df_fa(df, df_fa):

    for county in df.index:
        rows = df_fa[df_fa['County'] == county]
        for attrib in fa_attribs_to_sum:
            summed = rows[attrib].sum()
            df.loc[county, attrib] = summed
        for attrib in fa_attribs_to_avg:
            result = rows[attrib].mean()
            if attrib in fa_attribs_flag:
                result = round(result)
            df.loc[county, attrib] = result

    return df


def merge_tables(df_fvfb, df_fa):
    df = pd.DataFrame(index=sorted(county_intersection), columns=['County'])

    df = add_df_fa(df, df_fa)
    df = add_df_fvfb(df, df_fvfb)
    
    return df

df = merge_tables(df_fvfb, df_fa)
df


['Urban', 'Pop2010', 'OHU2010', 'GroupQuartersFlag', 'NUMGQTRS', 'PCTGQTRS', 'LILATracts_1And10', 'LILATracts_halfAnd10', 'LILATracts_1And20', 'LILATracts_Vehicle', 'HUNVFlag', 'LowIncomeTracts', 'PovertyRate', 'MedianFamilyIncome', 'LA1and10', 'LAhalfand10', 'LA1and20', 'LATracts_half', 'LATracts1', 'LATracts10', 'LATracts20', 'LATractsVehicle_20', 'LAPOP1_10', 'LAPOP05_10', 'LAPOP1_20', 'LALOWI1_10', 'LALOWI05_10', 'LALOWI1_20', 'lapophalf', 'lapophalfshare', 'lalowihalf', 'lalowihalfshare', 'lakidshalf', 'lakidshalfshare', 'laseniorshalf', 'laseniorshalfshare', 'lawhitehalf', 'lawhitehalfshare', 'lablackhalf', 'lablackhalfshare', 'laasianhalf', 'laasianhalfshare', 'lanhopihalf', 'lanhopihalfshare', 'laaianhalf', 'laaianhalfshare', 'laomultirhalf', 'laomultirhalfshare', 'lahisphalf', 'lahisphalfshare', 'lahunvhalf', 'lahunvhalfshare', 'lasnaphalf', 'lasnaphalfshare', 'lapop1', 'lapop1share', 'lalowi1', 'lalowi1share', 'lakids1', 'lakids1share', 'laseniors1', 'laseniors1share', 'lawhi

  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[county, attrib] = result
  df.loc[count

Unnamed: 0,County,Pop2010,OHU2010,NUMGQTRS,LAPOP1_10,LAPOP05_10,LAPOP1_20,LALOWI1_10,LALOWI05_10,LALOWI1_20,...,laseniors20share,lawhite20share,lablack20share,laasian20share,lanhopi20share,laaian20share,laomultir20share,lahisp20share,lahunv20share,lasnap20share
Accomack County,,33164.0,13798.0,428.0,727.0,727.0,0.0,298.0,298.0,0.0,...,,,,,,,,,,
Albemarle County,,98970.0,38157.0,6864.0,11786.0,30767.0,11450.0,3490.0,8892.0,3379.0,...,,,,,,,,,,
Alexandria City,,139966.0,68082.0,1827.0,0.0,28658.0,0.0,0.0,4497.0,0.0,...,,,,,,,,,,
Alleghany County,,16250.0,6891.0,281.0,2664.0,3797.0,1914.0,1283.0,1855.0,1067.0,...,,,,,,,,,,
Amelia County,,12690.0,4821.0,128.0,2089.0,2089.0,0.0,781.0,781.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Williamsburg City,,14068.0,4571.0,4171.0,2740.0,10877.0,2740.0,795.0,4632.0,795.0,...,,,,,,,,,,
Winchester City,,26203.0,10607.0,976.0,6660.0,19377.0,6660.0,1144.0,5195.0,1144.0,...,,,,,,,,,,
Wise County,,41452.0,15968.0,3131.0,7891.0,11792.0,7891.0,3538.0,5063.0,3538.0,...,,,,,,,,,,
Wythe County,,29235.0,12472.0,260.0,3105.0,5390.0,2958.0,1298.0,2078.0,1261.0,...,,,,,,,,,,
