In [1]:
# Start
import os
import time
print("last time run: {}".format(time.strftime("%c")))
CWD = os.getcwd()
print("Current working dir: {}".format(CWD))


last time run: Sat Apr 27 15:01:59 2019
Current working dir: /home/bren/Home/projects/CodeForNashville/food-desert/data/census


In [2]:
# Style
from IPython.core.display  import HTML
from IPython.core.debugger import set_trace

def css_styling():
    styles = open("/home/bren/Home/python/Jupyter/Jupyter Notebooks/custom.css", "r").read()
    return HTML(styles)
css_styling()


In [3]:
# setup TOC
import sys
toc_path = '/home/bren/miniconda3/lib/python3.7/site-packages/IPythonTOC-1.2.0-py3.7.egg'
if toc_path not in sys.path:
    sys.path.append(toc_path)
else:
    print('toc_path in sys.path already')

from IPythonTOC import IPythonTOC
toc = IPythonTOC()

In [4]:
# imports
from matplotlib import pyplot as plt, rcParams
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
plt.style.use("ggplot")
rcParams["figure.figsize"] = (14, 7)

import IPython, platform
%reload_ext watermark
print("Python  {}".format(platform.python_version()))
%watermark -v -m -p IPython,matplotlib,numpy,pandas,seaborn


Python  3.7.1
CPython 3.7.1
IPython 7.2.0

IPython 7.2.0
matplotlib 3.0.2
numpy 1.15.4
pandas 0.24.1
seaborn 0.9.0

compiler   : GCC 7.3.0
system     : Linux
release    : 4.9.0-8-amd64
machine    : x86_64
processor  : 
CPU cores  : 8
interpreter: 64bit


<a id='Table_of_Contents'></a> 
### Table of Contents

[Load Data](#Load_Data)

[Explore Data](#Explore_Data)

* [Non-Zero Categoricals](#Non_Zero_Categoricals)
* [Non-Zero Numeric Variables](#Non_Zero_Numeric_Variables)
* [Zero-Valued Numeric Variables](#Zero_Valued_Numeric_Variables)
* [Boolean Variables](#Boolean_Variables)
* [Scaled Numeric Variables](#Scaled_Numeric_Variables)
* [Correlations In Numeric Variables](#Correlations_In_Numeric_Variables)
* [Violin Plot of Numeric Variables](#Violin_Plot_of_Numeric_Variables)


## Load Data
<a id='Load_Data'></a>


[Table of Contents](#Table_of_Contents)

In [5]:
# load data
if os.path.isfile('Food_Access_Research_Atlas_Data_TN.csv'):
    fara_tn_df = pd.read_csv('Food_Access_Research_Atlas_Data_TN.csv')
else:
    fara_df = pd.read_csv('Food_Access_Research_Atlas_Data.csv')
    fara_tn_df = fara_df[fara_df.State == 'TN']
    fara_tn_df.to_csv('Food_Access_Research_Atlas_Data_TN.csv', index=False)    
    try:
        del fara_df
    except:
        pass


In [6]:
# results of loading file
print('loaded Food_Access_Research_Atlas_Data_TN')
print('{:,d} rows for Tennessee'.format(len(fara_tn_df)))
print('First 5 rows')
fara_tn_df.head()


loaded Food_Access_Research_Atlas_Data_TN
1,497 rows for Tennessee
First 5 rows


Unnamed: 0,CensusTract,State,County,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle,Urban,Rural,LA1and10,...,lapop20,lapop20share,lalowi20,lalowi20share,lakids20,lakids20share,laseniors20,laseniors20share,lahunv20,lahunv20share
0,47001020100,TN,Anderson,0,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,47001020201,TN,Anderson,0,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,47001020202,TN,Anderson,0,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,47001020300,TN,Anderson,0,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47001020400,TN,Anderson,0,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Explore Data
<a id='Explore_Data'></a>


[Table of Contents](#Table_of_Contents)

In [7]:
fara_variables = pd.read_csv('Food_Access_Research_Atlas_Data_Variables.csv')
fara_variables.set_index('Field', inplace=True)
fara_variables.head()

Unnamed: 0_level_0,LongName
Field,Unnamed: 1_level_1
CensusTract,Census tract
County,County
GroupQuartersFlag,"Group quarters, tract with high share"
HUNVFlag,"Vehicle access, tract with low vehicle access"
LA1and10,Low access tract at 1 mile for urban areas or ...


## Non-Zero Categoricals
<a id='Non_Zero_Categoricals'></a>


[Table of Contents](#Table_of_Contents)

In [8]:
# capture numerics and describe them
def get_numerics_and_objects(data):
    data_objects = set([c for c, v in  data.dtypes.items() if v == np.dtype('O')])
    data_numerics = set([c for c, v in  data.dtypes.items() if c not in data_objects])
    return data_numerics, data_objects


In [9]:
def print_categorical_uniques(df):
    vars_numeric, vars_categorical = get_numerics_and_objects(df)
    for k in vars_categorical:
        uvals = df[k].unique()
        print()
        print('{} has these unique values:'.format(k))
        for uv in uvals:
            print('\t{}'.format(uv))


In [10]:
# printing unique values of categorical variables
print_categorical_uniques(fara_tn_df)


UATYP10 has these unique values:
	U
	R
	C

County has these unique values:
	Anderson
	Bedford
	Benton
	Bledsoe
	Blount
	Bradley
	Campbell
	Cannon
	Carroll
	Carter
	Cheatham
	Chester
	Claiborne
	Clay
	Cocke
	Coffee
	Crockett
	Cumberland
	Davidson
	Decatur
	DeKalb
	Dickson
	Dyer
	Fayette
	Fentress
	Franklin
	Gibson
	Giles
	Grainger
	Greene
	Grundy
	Hamblen
	Hamilton
	Hancock
	Hardeman
	Hardin
	Hawkins
	Haywood
	Henderson
	Henry
	Hickman
	Houston
	Humphreys
	Jackson
	Jefferson
	Johnson
	Knox
	Lake
	Lauderdale
	Lawrence
	Lewis
	Lincoln
	Loudon
	McMinn
	McNairy
	Macon
	Madison
	Marion
	Marshall
	Maury
	Meigs
	Monroe
	Montgomery
	Moore
	Morgan
	Obion
	Overton
	Perry
	Pickett
	Polk
	Putnam
	Rhea
	Roane
	Robertson
	Rutherford
	Scott
	Sequatchie
	Sevier
	Shelby
	Smith
	Stewart
	Sullivan
	Sumner
	Tipton
	Trousdale
	Unicoi
	Union
	Van Buren
	Warren
	Washington
	Wayne
	Weakley
	White
	Williamson
	Wilson

State has these unique values:
	TN


## Non-Zero Numeric Variables
<a id='Non_Zero_Numeric_Variables'></a>


[Table of Contents](#Table_of_Contents)

In [23]:
fara_tn_df.dtypes

CensusTract               int64
State                    object
County                   object
LILATracts_1And10         int64
LILATracts_halfAnd10      int64
LILATracts_1And20         int64
LILATracts_Vehicle        int64
Urban                     int64
Rural                     int64
LA1and10                  int64
LAhalfand10               int64
LA1and20                  int64
LATracts_half             int64
LATracts1                 int64
LATracts10                int64
LATracts20                int64
LATractsVehicle_20        int64
HUNVFlag                  int64
GroupQuartersFlag         int64
OHU2010                   int64
NUMGQTRS                float64
PCTGQTRS                float64
LowIncomeTracts           int64
POP2010                   int64
UATYP10                  object
lapophalf               float64
lapophalfshare          float64
lalowihalf              float64
lalowihalfshare         float64
lakidshalf              float64
                         ...   
lapop1  

In [42]:
def print_describe(df, stride):
    vars_numeric = set()
    vars_numeric_zero = set()

    i = df.ftypes

    for k in i.keys():
        if 'int' in i[k] or 'float' in i[k]:
            if all(df[k] == 0):
                vars_numeric_zero.add(k)
            else:
                vars_numeric.add(k)

    desc_df = df[vars_numeric]
    for start in range(0, len(vars_numeric), stride):
        end = start + stride
        if end > len(vars_numeric):
            end = -1
        print(desc_df[start:end], '\n')
    print(len(vars_numeric), len(vars_numeric_zero))
    return vars_numeric, vars_numeric_zero


In [43]:
# list count, mean, std, min, max, and 25% quadrants of numeric variables
vars_numeric, vars_numeric_zero = print_describe(fara_tn_df, 4)


   lakids10share  lahunvhalfshare  lapop10  lapop1share  lalowi10  \
0            0.0         0.084407      0.0     0.050440       0.0   
1            0.0         0.026770      0.0     0.312018       0.0   
2            0.0         0.105603      0.0     0.242865       0.0   
3            0.0         0.042470      0.0     0.703942       0.0   

   GroupQuartersFlag  CensusTract    lahunv1  lahunv1share  lakids10  ...  \
0                  0  47001020100   7.895756      0.005107       0.0  ...   
1                  0  47001020201  16.293518      0.010615       0.0  ...   
2                  0  47001020202   9.124029      0.004506       0.0  ...   
3                  0  47001020300  27.536238      0.016628       0.0  ...   

   LA1and20  LA1and10  lalowi10share  HUNVFlag  laseniorshalf  \
0         0         0            0.0         1     296.712724   
1         1         1            0.0         0     476.939336   
2         1         1            0.0         1     944.096845   
3       

## Zero-Valued Numeric Variables
<a id='Zero_Valued_Numeric_Variables'></a>


[Table of Contents](#Table_of_Contents)

In [32]:
# variables with all values not equal to zero (to be kept)
print('These variables are not all zeros:\n')
pd.set_option('display.max_colwidth', 200)
fara_variables.loc[vars_numeric]


These variables are not all zeros:



Unnamed: 0_level_0,LongName
Field,Unnamed: 1_level_1
CensusTract,Census tract
LILATracts_1And10,Low income and low access measured at 1 and 10 miles
LILATracts_halfAnd10,Low income and low access measured at 1/2 and 10 miles
LILATracts_1And20,Low income and low access measured at 1 and 20 miles
LILATracts_Vehicle,Low income and low access using vehicle access
Urban,Urban tract
Rural,Rural tract
LA1and10,Low access tract at 1 mile for urban areas or 10 miles for rural areas
LAhalfand10,Low access tract at 1/2 mile for urban areas or 10 miles for rural areas
LA1and20,Low access tract at 1 mile for urban areas or 20 miles for rural areas


In [14]:
# variables with all values equal to zero (to be dropped)
print('These variables are all zeros:\n')
fara_variables.loc[vars_numeric_zero]


These variables are all zeros:



Unnamed: 0_level_0,LongName
Field,Unnamed: 1_level_1
lapop20,"Low access, people at 20 miles, number"
lapop20share,"Low access, people at 20 miles, share"
lalowi20,"Low access, low-income people at 20 miles, number"
lalowi20share,"Low access, low-income people at 20 miles, share"
lakids20,"Low access, children age 0-17 at 20 miles, number"
lakids20share,"Low access, children age 0-17 at 20 miles, share"
laseniors20,"Low access, seniors age 65+ at 20 miles, number"
laseniors20share,"Low access, seniors age 65+ at 20 miles, share"
lahunv20,"Vehicle access, housing units without and low access at 20 miles, number"
lahunv20share,"Vehicle access, housing units without and low access at 20 miles, share"


## Boolean Variables
<a id='Boolean_Variables'></a>


[Table of Contents](#Table_of_Contents)

In [44]:
bool_series = pd.Series([0,1])
bool_vars = set()
for col in vars_numeric:
    col_u = fara_tn_df[col].unique()
    if len(col_u) == 2 and all(col_u == bool_series):
        print(col, len(col_u))
        bool_vars.add(col)
if len(bool_vars) > 0:
    print('the folowing are boolean variables\n',
         '\n'.join(['\t{}'.format(c) for c in bool_vars]))
else:
    print('no boolean variables found')
        

GroupQuartersFlag 2
LILATracts_1And10 2
Rural 2
LATracts1 2
LILATracts_1And20 2
LA1and20 2
LA1and10 2
LATracts10 2
the folowing are boolean variables
 	LILATracts_1And10
	LILATracts_1And20
	LA1and20
	LA1and10
	Rural
	GroupQuartersFlag
	LATracts10
	LATracts1


## Scaled Numeric Variables
<a id='Scaled_Numeric_Variables'></a>


[Table of Contents](#Table_of_Contents)

In [48]:
from sklearn.preprocessing import RobustScaler
vars_scaled_df = pd.DataFrame(RobustScaler(
                                  quantile_range=(25, 75)).fit_transform(fara_tn_df[vars_numeric.difference(bool_vars)]),
                              columns=list(vars_numeric))


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# scaled numeric variabiles (count, mean, std, min, max, 25% quadrants)
print("after scaling non-zero variables")
vars_numeric, vars_numeric_zero = print_describe(vars_scaled_df, 5)


## Correlations In Numeric Variables
<a id='Correlations_In_Numeric_Variables'></a>


[Table of Contents](#Table_of_Contents)

In [None]:
# look for correlations

vars_corr = vars_scaled_df.corr('pearson')

limit = 0.5
corr_count = 0
for name in vars_corr.columns:
    d = vars_corr[name][vars_corr.columns != name].describe()
    if d['mean'] > limit:
        corr_count += 1
if corr_count > 0:
    print('{:,d} found'.format(corr_count))
else:
    print('NO CORRELATIONS ABOVE {:.2%} FOUND!!!'.format(limit))
    

In [None]:
# correlation heatmap
import seaborn as sns

sns.set(style="white")

# Generate a large random dataset
d = vars_scaled_df

# Compute the correlation matrix
corr = d.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(16,10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio

sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
           ax=ax);


## Violin Plot of Numeric Variables
<a id='Violin_Plot_of_Numeric_Variables'></a>


[Table of Contents](#Table_of_Contents)

In [None]:
# violin plot of all variables scaled
sns.set(style="whitegrid")
stride = 10

import seaborn as sns

sns.set(style="whitegrid")

def plot_violins(df, vars_names, stride):

    # Set up the matplotlib figure
    num_vars = len(vars_names)
    num_plots = num_vars // stride
    if  num_vars % stride:
        num_plots += 1
    f, ax = plt.subplots(num_plots, 1, figsize=(18, 5 * num_plots))

    # Draw a violinplot with a narrower bandwidth than the default
    for ax_indx, start in enumerate(range(0, num_vars, stride)):
        end = start + stride
        if end >= num_vars:
            df_plot = df[vars_names[start:]]
        else:
            df_plot = df[vars_names[start:end]]
        if num_plots > 1:
            ax_plot = ax[ax_indx]
        else:
            ax_plot = ax
        sns.violinplot(ax=ax_plot, data=df_plot, palette="Set3",
                       bw=.2, cut=1, linewidth=1, scale='count')
        
    # Finalize the figure
    #ax.set(ylim=(-.7, 1.05))
    sns.despine(left=True, bottom=True)

from collections import defaultdict
vars_to_plot = defaultdict(list)
vars_so_far = set()

for scale in (1000., 100., 10., 0.):
    new_cols = set([col for col in vars_scaled_df.columns[vars_scaled_df.max() > scale] if col not in vars_so_far])
    vars_to_plot[scale] = new_cols
    vars_so_far = vars_so_far.union(new_cols)

    plot_violins(vars_scaled_df, list(new_cols), stride)

