# Food Desert Investigation -- Exploratory Data Analysis
Hypothesis: Access to grocery stores, specialty food stores, and recreation centers is associated with populations that have higher education and employment outcomes.

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.graphics.api import abline_plot
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing 
import geopandas as gpd

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# Load the combined CSV that every later cell reads through `df`.
df = pd.read_csv(filepath_or_buffer='new_clean_combined.csv')

## Initial Review

In [None]:
# Preview the first five rows (the default for head()).
df.head(n=5)

In [None]:
# Summary statistics for the DataFrame (by default pandas reports on the
# numeric columns when any are present).
df.describe()

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts, dtypes.
df.info()

In [None]:
# Histogram grid for the DataFrame's columns on a large canvas, with extra
# vertical space between subplot rows.
_ = df.hist(figsize=(25, 20))
plt.subplots_adjust(hspace=0.5);

## Correlation

In [None]:
# Pairwise correlation matrix restricted to numeric columns; 'pearson' is the
# pandas default, spelled out here. The bare name displays it as cell output.
corr = df.corr(method='pearson', numeric_only=True)
corr

In [None]:
# Heatmap of the numeric-column correlation matrix on a 20x20 inch figure,
# without per-cell annotations.
_, heatmap_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df.corr(numeric_only=True), annot=False, ax=heatmap_ax)
plt.show()

### Correlation -- Sorted (Absolute Value)

In [None]:
# Element-wise absolute value so strong negative and strong positive
# correlations rank together.
abs_corr = abs(corr)

In [None]:
# Flatten the absolute-correlation matrix into a Series keyed by
# (column, column) pairs and sort it from strongest to weakest.
s = abs_corr.unstack()
so = s.sort_values(kind='quicksort', ascending=False)
# Keep values strictly between 0.6 and 1.0. The upper bound removes the
# diagonal self-correlations (and any other pair that is exactly 1.0).
# NOTE: the matrix is symmetric, so each remaining pair appears twice.
so = so[(so > 0.6) & (so < 1.0)]

In [None]:
# Wrap the filtered, sorted correlation Series in a DataFrame and preview
# the first five rows.
sorted_correlation = pd.DataFrame(data=so)
sorted_correlation.head(n=5)

In [None]:
#plt.figure(figsize=(20,20))
#_ = sns.heatmap(sorted_correlation, annot=False)
#plt.ylabel('Correlation of Column Variables')
#plt.show()

## Examining Variables

### Bar plots

In [None]:
# Sum GROC16 (grocery store count) within each state and draw the totals as
# bars ordered from smallest to largest.
plt.figure(figsize=(8, 8))
groc_totals = df.groupby("State")['GROC16'].sum()
groc_by_state = groc_totals.sort_values(ascending=True)
_ = groc_by_state.plot.bar(title="Number of Grocery Stores by State")
plt.show()

In [None]:
# Per-state totals for grocery stores (GROC16), specialty food stores
# (SPECS16), and RECFAC16 (presumably recreation facilities -- confirm against
# the dataset's data dictionary).
groc_pop = df.groupby('State')[['GROC16','SPECS16','RECFAC16']].sum()
# Order states by the mean of the three totals, largest first, so the busiest
# states appear on the left of the chart.
state_order = groc_pop.mean(axis=1).sort_values(ascending=False).index
# The title previously read 'Number of restaurants by state', copied from the
# restaurant chart; it now describes the columns actually plotted.
(groc_pop.reindex(index=state_order)
    .plot(kind='bar', figsize=(10, 10),
          title='Number of grocery stores, specialty food stores, and recreation facilities by state'))
plt.xlabel('State')
plt.ylabel('Number');

In [None]:
# Sum SPECS16 (specialty food store count) within each state and draw the
# totals as bars ordered from smallest to largest.
plt.figure(figsize=(8, 8))
spec_totals = df.groupby("State")['SPECS16'].sum()
spec_by_state = spec_totals.sort_values(ascending=True)
spec_by_state.plot.bar(title="Number of Speciality Food Stores by State")
plt.show()

In [None]:
# Sum FFR16 (fast food restaurant count) within each state and draw the
# totals as bars ordered from smallest to largest.
plt.figure(figsize=(8, 8))
ff_totals = df.groupby("State")['FFR16'].sum()
ff_by_state = ff_totals.sort_values(ascending=True)
ff_by_state.plot.bar(title="Number of Fast Food Restaurants by State")
plt.show()

In [None]:
# Per-state totals of fast food (FFR16) and full service (FSR16) restaurants,
# followed by a preview of the first five rows.
restaurant_columns = ['FFR16', 'FSR16']
restaurants_by_state = df.groupby('State')[restaurant_columns].sum()
restaurants_by_state.head(n=5)

In [None]:
# Grouped bar chart of the per-state restaurant totals. States are ordered by
# the mean of the plotted columns, largest first.
state_order = restaurants_by_state.mean(axis=1).sort_values(ascending=False).index
restaurants_by_state.reindex(index=state_order).plot.bar(
    figsize=(10, 10), title='Number of restaurants by state')
plt.xlabel('State')
plt.ylabel('Number');

### Scatter plots

In [None]:
# Scatter of grocery stores (x) against fast food restaurants (y); points are
# semi-transparent so overlapping observations stay visible.
scatter_ax = sns.scatterplot(data=df, x='GROC16', y='FFR16', alpha=0.5)
scatter_ax.set(
    xlabel='Number of grocery stores',
    ylabel='Number of fast food restaurants',
    title="Relationship between the number of grocery stores and fast food restaurants",
)
plt.show()

In [None]:
# Scatter of grocery stores (x) against full service restaurants (y); points
# are semi-transparent so overlapping observations stay visible.
scatter_ax = sns.scatterplot(data=df, x='GROC16', y='FSR16', alpha=0.5)
scatter_ax.set(
    xlabel='Number of grocery stores',
    ylabel='Number of full service restaurants',
    title="Relationship between the number of grocery stores and full service restaurants",
)
plt.show()

In [None]:
# Plot number of grocery stores against number of specialty food stores
_ = sns.scatterplot(data=df, x='GROC16', y='SPECS16', alpha=0.5)
_ = plt.xlabel('Number of grocery stores')
_ = plt.ylabel('Number of specialty food stores')
_ = plt.title("Relationship between the number of grocery stores and specialty food stores")
plt.show()

In [None]:
# Scatter of grocery stores (x) against WIC-authorized stores (y); points are
# semi-transparent so overlapping observations stay visible.
scatter_ax = sns.scatterplot(data=df, x='GROC16', y='WICS16', alpha=0.5)
scatter_ax.set(
    xlabel='Number of grocery stores',
    ylabel='Number of WIC-authorized stores',
    title="Relationship between the number of grocery stores and WIC-authorized stores",
)
plt.show()

In [None]:
# NOTE(review): the scatterplot call below is commented out, so this cell only
# labels and shows an empty axes. Re-enable it once the
# 'PovertyEstimateAllAges' column is confirmed to exist in df.
#_ = sns.scatterplot(data=df, x='GROC16', y='PovertyEstimateAllAges', alpha=0.5)
_ = plt.xlabel('Number of grocery stores')
_ = plt.ylabel('Estimated number of people in poverty')
_ = plt.title("Relationship between the number of grocery stores and estimated number of people in poverty")
plt.show()

In [None]:
# NOTE(review): the scatterplot call below is commented out, so this cell only
# labels and shows an empty axes. Re-enable it once the
# 'MedianHouseholdIncome' column is confirmed to exist in df.
#_ = sns.scatterplot(data=df, x='GROC16', y='MedianHouseholdIncome', alpha=0.5)
_ = plt.xlabel('Number of grocery stores')
_ = plt.ylabel('Median household income')
_ = plt.title("Relationship between the number of grocery stores and median household income")
plt.show()

### Box plots

In [None]:
# Box plot of grocery store counts (GROC16) grouped by state. Only rows with
# fewer than 300 stores are plotted (presumably to keep extreme values from
# compressing the boxes).
plt.figure(figsize=(20,20))
_ = sns.boxplot(x='State',y='GROC16',data=df.query('GROC16 < 300'))
_ = plt.xlabel('State')
_ = plt.ylabel('Number of grocery stores')
_ = plt.title('Grocery stores by state (Limited to < 300 stores)')
# Call show(); the bare attribute `plt.show` never rendered the figure itself.
plt.show()

In [None]:
# Box plot of fast food restaurant counts (FFR16) grouped by state. Only rows
# with fewer than 2000 restaurants are plotted (presumably to keep extreme
# values from compressing the boxes).
plt.figure(figsize=(20,20))
_ = sns.boxplot(x='State',y='FFR16',data=df.query('FFR16 < 2000'))
_ = plt.xlabel('State')
_ = plt.ylabel('Number of fast food restaurants')
_ = plt.title('Fast food restaurants by state (Limited to < 2000 restaurants)')
# Call show(); the bare attribute `plt.show` never rendered the figure itself.
plt.show()

In [None]:
# Box plot of specialty food store counts (SPECS16) grouped by state. Only
# rows with fewer than 300 stores are plotted (presumably to keep extreme
# values from compressing the boxes).
plt.figure(figsize=(20,20))
_ = sns.boxplot(x='State',y='SPECS16',data=df.query('SPECS16 < 300'))
_ = plt.xlabel('State')
_ = plt.ylabel('Number of specialty food stores')
# Title now matches the filter above; it previously said "< 2000 restaurants".
_ = plt.title('Specialty food stores by state (Limited to < 300 stores)')
# Call show(); the bare attribute `plt.show` never rendered the figure itself.
plt.show()

In [None]:
# Single box plot of the grocery store count (GROC16) over all rows.
# `ax` was never created in this notebook, so build the figure/axes pair here.
fig, ax = plt.subplots()
# matplotlib's boxplot does not skip NaN values, so drop them first
# (a no-op when the column has none).
ax.boxplot(df['GROC16'].dropna())
ax.set_ylabel('Number of grocery stores')
plt.show()

### Comparisons using .describe()

In [None]:
# Summary statistics for the grocery store count column (GROC16).
df.loc[:, 'GROC16'].describe()

In [None]:
# Summary statistics for the fast food restaurant count column (FFR16).
df.loc[:, 'FFR16'].describe()

In [None]:
# Summary statistics for the full service restaurant count column (FSR16).
df.loc[:, 'FSR16'].describe()

In [None]:
# Summary statistics for the specialty food store count column (SPECS16).
df.loc[:, 'SPECS16'].describe()