# Supervised Data Prep & Feature Building

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

Set the Pandas defaults

In [2]:
# Notebook-wide pandas settings, applied through one uniform call.
pandas_settings = {
    'mode.chained_assignment': None,            # silence SettingWithCopyWarning
    'display.float_format': '{:.2f}'.format,    # render floats with two decimals
    'display.max_columns': None,                # no cap on displayed columns
    'display.max_rows': 100,                    # display at most 100 rows
}
for option_name, option_value in pandas_settings.items():
    pd.set_option(option_name, option_value)

Import the data

In [3]:
# Read the four source tables; the order of the paths matches the order of the
# names on the left-hand side.
zillow, sld, food_atlas, fips = (
    pd.read_parquet(source_path)
    for source_path in (
        'data/zillow.parquet',
        'data/Smart_Location_Database.parquet',
        'data/food_atlas.parquet',
        'data/fips.parquet',
    )
)

Fix minor data issues

In [4]:
# SLD: build the tract id from GEOID10 by rendering it as text, dropping the
# final character, and left-padding with zeros to 11 characters.
geoid_text = sld['GEOID10'].astype('str')
sld['StCtyTract'] = geoid_text.str.slice(stop=-1).str.zfill(11)

# fips: render the FIPS codes as text, left-padded with zeros to 5 characters.
fips_text = fips['FIPS'].astype('str')
fips['FIPS'] = fips_text.str.zfill(5)

Create Features

In [5]:
# Create the features dataframe that will be used in the modeling process.
# .copy() makes this an independent frame, so the column assignments below
# never write into a slice of `food_atlas`.
fa_features = food_atlas[[
    'CensusTract', 'Urban', 'PovertyRate', 'MedianFamilyIncome',
    'LILATracts_1And10', 'LILATracts_halfAnd10', 'LILATracts_1And20', 'LILATracts_Vehicle',
    'HUNVFlag', 'LowIncomeTracts', 'LATracts1', 'LATracts_half', 'LATracts10',
    'LA1and10', 'LAhalfand10', 'LA1and20', 'LATracts20', 'LATractsVehicle_20',
]].copy()

# A tract is labelled a desert when its poverty rate is at least 20 and it
# carries a low-access flag: LATracts1 or LATracts_half when Urban == 1, with
# LATracts10 additionally accepted when Urban == 0.
in_poverty = fa_features['PovertyRate'] >= 20
urban_low_access = (fa_features['Urban'] == 1) & (
    (fa_features['LATracts1'] == 1) | (fa_features['LATracts_half'] == 1)
)
rural_low_access = (fa_features['Urban'] == 0) & (
    (fa_features['LATracts10'] == 1)
    | (fa_features['LATracts1'] == 1)
    | (fa_features['LATracts_half'] == 1)
)
desert_conditions = in_poverty & (urban_low_access | rural_low_access)

# Create the binary column
fa_features['Desert'] = desert_conditions.astype(int)

# County FIPS = first 5 characters of the 11-character tract id.
# Pad to 11 before slicing: if CensusTract is stored as an integer, the leading
# zero of states 01-09 is lost and an unpadded [:5] would return the state
# digit, the county and the first tract digit instead of the county FIPS.
# The padding is a no-op when the ids are already 11 characters long.
fa_features['FIPS'] = fa_features['CensusTract'].astype('str').str.zfill(11).str[:5]


def _county_share(frame: pd.DataFrame, column: str) -> pd.Series:
    """Return sum / non-null count of `column` within each row's FIPS group.

    The result is aligned to `frame`'s index, so every row of a FIPS group
    receives the same value. For 0/1 flag columns (which these presumably are)
    that value is the share of the group's tracts that carry the flag.
    """
    per_county = frame.groupby('FIPS')[column]
    return per_county.transform('sum') / per_county.transform('count')


# Source column -> name of the county-level share column derived from it.
# The insertion order is also the column order of the output frame.
share_columns = {
    'LowIncomeTracts': 'LowIncTracts_Pct',
    'Urban': 'Urban_Pct',
    'Desert': 'Desert_Pct',
    'LA1and10': 'LA1and10_Pct',
    'LAhalfand10': 'LAhalfand10_Pct',
    'LA1and20': 'LA1and20_Pct',
    'LATracts_half': 'LATracts_half_Pct',
    'LATracts1': 'LATracts1_Pct',
    'LATracts10': 'LATracts10_Pct',
    'LATracts20': 'LATracts20_Pct',
    'LATractsVehicle_20': 'LATractsVehicle_20_Pct',
}
for flag_column, share_column in share_columns.items():
    fa_features[share_column] = _county_share(fa_features, flag_column)

# Create the PovertyRate by FIPS feature by taking the mean of the tracts
fa_features['PovertyRate_FIPS'] = fa_features.groupby('FIPS')['PovertyRate'].transform('mean')

# Every tract row of a county now holds identical county-level values, so
# dropping duplicates leaves one row per FIPS.
food_atlas_df = fa_features[['FIPS', *share_columns.values(), 'PovertyRate_FIPS']].drop_duplicates()

# Fill the missing PovertyRate data with 0
food_atlas_df['PovertyRate_FIPS'] = food_atlas_df['PovertyRate_FIPS'].fillna(0.00)

# Add the State & County names to the dataframe.
# NOTE(review): assumes `fips` has one row per FIPS; duplicates there would
# duplicate county rows here - confirm.
food_atlas_df = pd.merge(food_atlas_df, fips[['FIPS', 'State', 'County']], on='FIPS', how='left')

# Write out the food_atlas_df
food_atlas_df.to_parquet('final_output/food_atlas_df.parquet')

Create Training and Testing data

In [6]:
# Home values for January 2023, keeping only rows with no missing value in the
# three selected columns.
is_jan_2023 = (zillow['Y'] == 2023) & (zillow['M'] == 1)
home_values = zillow.loc[is_jan_2023, ['FIPS', 'State', 'Home Value']].dropna()

# Dependent variable: NatWalkInd averaged per FIPS
sld_temp_df = sld.groupby('FIPS')['NatWalkInd'].mean().reset_index()

# Attach the dependent variable and the food atlas features to the home values
# (left joins, so every home-value row is kept), then index the table by FIPS.
hv_012023_df = (
    home_values
    .merge(sld_temp_df, on='FIPS', how='left')
    .merge(food_atlas_df.drop(columns=['State', 'County']), on='FIPS', how='left')
    .set_index('FIPS')
)

# Split the table into the target column and the feature columns
target = hv_012023_df['NatWalkInd']
features = hv_012023_df.drop(columns=['NatWalkInd', 'State'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=42,
)

In [7]:
# Persist the train/test splits. The targets are Series, so each is turned into
# a single-column DataFrame before being written to parquet.
split_outputs = {
    'final_output/X_train.parquet': X_train,
    'final_output/X_test.parquet': X_test,
    'final_output/y_train.parquet': y_train.to_frame(),
    'final_output/y_test.parquet': y_test.to_frame(),
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_parquet(out_path)

In [8]:
# Persist the full hv_012023_df table (features and target together) to parquet
hv_012023_df.to_parquet(path='final_output/hv_012023_df.parquet')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e75edf0e-32f1-42b8-8a27-9dc0078a206d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>