In [None]:
import pandas as pd
import geopandas as gpd
from shapely import wkt

In [None]:
# Load data/WalkabilityIndex/Natl_WI_simplified.csv using pandas.
# The path is written with forward slashes so it resolves on Windows, macOS
# and Linux alike; a backslash-separated path is only valid on Windows.
df = pd.read_csv('data/WalkabilityIndex/Natl_WI_simplified.csv')

# The 'geometry' column holds WKT text; parse each value into a shapely geometry
df['geometry'] = df['geometry'].apply(wkt.loads)

# Create a GeoDataFrame and tag it with a CRS.
# NOTE(review): EPSG:4326 is assumed here, not read from the file — confirm
# against the data source before doing any distance or area work.
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

# Display the first row
gdf.head(n=1)

In [None]:
# Trying to figure out how unproductive land works in the data set
# Percent of each row's total acres that is unproductive, rounded to 2 decimals
gdf['percent_unproductive'] = (gdf['ac_unpr'] / gdf['ac_total'] * 100).round(2)

# Ten-point bin edges 0, 10, ..., 100. The stop must be 101, not 100: range()
# excludes its stop value, so range(0, 100, 10) ends at 90 and every row above
# 90% would fall outside all bins and silently drop out of the counts.
bins = range(0, 101, 10)

# include_lowest=True closes the first interval on the left so rows that are
# exactly 0% unproductive are counted instead of becoming NaN (pandas labels
# that first interval (-0.001, 10.0]).
# NOTE(review): values above 100 (ac_unpr > ac_total) would still be unbinned —
# confirm whether the data can contain such rows.
gdf['bins'] = pd.cut(gdf['percent_unproductive'], bins=bins, include_lowest=True)

# Count the geoid20s in each bin; observed=False keeps empty bins in the result
bin_counts = gdf.groupby('bins', observed=False)['geoid20'].count()

# Convert the Series to a DataFrame with explicit column names
bin_counts_df = bin_counts.reset_index()
bin_counts_df.columns = ['bin', 'geoid20_count']

# Total number of binned geoid20 values (the denominator for the shares below)
total_count = bin_counts_df['geoid20_count'].sum()

# Share of the binned rows that falls in each bin, in percent
bin_counts_df['percentage'] = (bin_counts_df['geoid20_count'] / total_count) * 100

# Display the per-bin summary
bin_counts_df

In [31]:
# Tally geoid20 values for every cbsa_name
cbsa_counts = gdf.groupby('cbsa_name')['geoid20'].count()

# Turn the tally into a two-column DataFrame: cbsa_name, geoid20_count
cbsa_counts_df = cbsa_counts.reset_index(name='geoid20_count')

# Overall number of geoid20 values across all CBSAs in the tally
total_geoid20_count = cbsa_counts_df['geoid20_count'].sum()

# Each CBSA's share of that overall number, in percent (2 decimals)
cbsa_share = cbsa_counts_df['geoid20_count'] / total_geoid20_count * 100
cbsa_counts_df['percentage'] = cbsa_share.round(2)

# Largest CBSAs first
cbsa_counts_df = cbsa_counts_df.sort_values('geoid20_count', ascending=False)

# Show the top row
cbsa_counts_df.head(n=1)

Unnamed: 0,cbsa_name,geoid20_count,percentage
291,"Los Angeles-Long Beach-Anaheim, CA",8248,6.81


In [27]:
# Trying to figure out if it makes sense to drop the cases where the CBSA name is null
# Boolean mask of the rows that have no cbsa_name
missing_cbsa = gdf['cbsa_name'].isnull()

# Non-null geoid20 values among those rows
null_cbsa_count = gdf.loc[missing_cbsa, 'geoid20'].count()
print(f'Number of geoid20 where cbsa_name is null: {null_cbsa_count}')

# Row count of the whole GeoDataFrame
total_rows = gdf.shape[0]
print(f'Total number of rows in the DataFrame: {total_rows}')

# Share of all rows that lack a cbsa_name, in percent
null_cbsa_percentage = null_cbsa_count / total_rows * 100
print(f'Percentage of rows where cbsa_name is null: {null_cbsa_percentage:.2f}%')

Number of geoid20 where cbsa_name is null: 10178
Total number of rows in the DataFrame: 131264
Percentage of rows where cbsa_name is null: 7.75%


In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for ac_total,
# to see how the column is distributed
ac_total_summary = gdf['ac_total'].describe()
ac_total_summary

count    1.312640e+05
mean     1.124656e+04
std      1.944862e+05
min      1.096136e+00
25%      1.290690e+02
50%      3.246184e+02
75%      1.996697e+03
max      4.055649e+07
Name: ac_total, dtype: float64

Large Variability: The data exhibits a wide range of area sizes, from just over 1 acre to over 40 million acres. The large standard deviation and high maximum value indicate that while most areas are relatively small to moderate in size, there are a few extremely large areas that are significantly larger than the rest.

Skewed Distribution: The mean being much higher than the median suggests a right-skewed distribution, where the presence of a few very large values is pulling the average up. This is further evidenced by the 75th percentile being well below the mean.

Outliers: The maximum value of over 40 million acres is an outlier, and it could significantly affect the mean and standard deviation. Depending on the analysis, it might be important to consider how this outlier influences overall results.