# California kindergarten immunization rates

In [1]:
# Import modules

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import bokeh
from ast import literal_eval
from bokeh.resources import INLINE
from bokeh.plotting import figure, show
from bokeh.sampledata.us_states import data as states
from bokeh.models import ColumnDataSource, Range1d
from bokeh.io import output_notebook
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.palettes import brewer
output_notebook(resources=INLINE)

In [2]:
# Set plot preferences
plt.style.use('dark_background')

In [3]:
# Set pandas options
pd.options.display.max_columns=500
pd.options.display.max_colwidth=1000

## Import data

In [4]:
df_geo = pd.read_csv('data/geoData.csv')

In [5]:
df_infant = pd.read_csv('data/InfantData.csv')

In [6]:
df_pert = pd.read_csv('data/pertusisRates2010_2015.csv')

In [7]:
df_student = pd.read_csv('data/StudentData.csv')

## Useful functions

In [8]:
def basic_usa_geo_plot(df, plot_width, plot_height):
    source_geo = ColumnDataSource(data=dict(lat=df['latitude'].values, lon=df['longitude'].values))
    state_lats = [states[code]['lats'] for code in states]
    state_longs = [states[code]['lons'] for code in states]
    p = figure(toolbar_location="left",
               plot_width=3000,
               plot_height=2000)
    # determine range to print based on min, max lat and long of the data
    margin = .2 # buffer to add to the range
    lat_min = df['latitude'].min() - margin
    lat_max = df['latitude'].max() + margin
    long_min = df['longitude'].min() - margin
    long_max = df['longitude'].max() + margin
    
    p.y_range = Range1d(lat_min, lat_max)
    p.x_range = Range1d(long_min, long_max)
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    
    p.patches(state_longs, 
              state_lats, 
              fill_alpha=0.0,
              line_color="black", 
              line_width=2, 
              line_alpha=0.3)

    p.circle(x="lon", 
             y="lat", 
             source=source_geo, 
             size=4.5,
             fill_color='red',
             line_color='grey',
             line_alpha=.25)
    
    show(p)

In [9]:
def merc(lat, lon):
    # Radius of the earth in meters
    r_major = 6378137.000
    x = r_major * math.radians(lon)
    scale = x/lon
    y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 + 
        lat * (math.pi/180.0)/2.0)) * scale
    return (x, y)

In [10]:
def geo_plot_world(df, lat_col, lon_col, colour_col=None):
    df_to_use = df[[lat_col, lon_col]]
    df_to_use['x_coord'] = df_to_use.apply(lambda row: merc(lat=row[lat_col], 
                                                            lon=row[lon_col])[0], axis=1)
    df_to_use['y_coord'] = df_to_use.apply(lambda row: merc(lat=row[lat_col], 
                                                            lon=row[lon_col])[1], axis=1)
#     source_geo = ColumnDataSource(data=dict(x_coord=df_to_use['x_coord'].values, 
#                                             y_coord=df_to_use['y_coord'].values))
    
    # determine range to print based on min, max lat and long of the data
    margin = .2 # buffer to add to the range
    lat_min = df[lat_col].min() - margin
    lat_max = df[lat_col].max() + margin
    lon_min = df[lon_col].min() - margin
    lon_max = df[lon_col].max() + margin
    
    merc_min_range = merc(lat_min, lon_min)
    merc_max_range = merc(lat_max, lon_max)
    
    merc_x_min = merc_min_range[0]
    merc_y_min = merc_min_range[1]
    merc_x_max = merc_max_range[0]
    merc_y_max = merc_max_range[1]
    
    p = figure(toolbar_location='left', 
           x_range=(merc_x_min, merc_x_max), y_range=(merc_y_min, merc_y_max),
           x_axis_type="mercator", y_axis_type="mercator")
    
    p.add_tile(CARTODBPOSITRON)
    
    if colour_col:
        # Get the number of colours we'll need
        len_ = len(df[colour_col].unique())
        while len_ < 3:
            len_ += 1
        colours = brewer["PuRd"][len_]
        
        # Create a map between factor and color.
        colourmap = {i: colours[i] for i in df[colour_col].unique()}
        
        # Create a list of colors for each value that we will be looking at.
        colours = [colourmap[x] for x in df[colour_col]]
        
        source_geo = ColumnDataSource(data=dict(x_coord=df_to_use['x_coord'].values, 
                                      y_coord=df_to_use['y_coord'].values,
                                      colouring=colours))
    
        p.circle(x="x_coord", 
                 y="y_coord",
                 color="colouring",
                 source=source_geo,
                 size=4.5)
    else:
        source_geo = ColumnDataSource(data=dict(x_coord=df_to_use['x_coord'].values, 
                                      y_coord=df_to_use['y_coord'].values))
        p.circle(x="x_coord", 
                 y="y_coord", 
                 source=source_geo, 
                 size=4.5,
                 fill_color='red',
                 line_color='grey',
                 line_alpha=.25)
    show(p)

## Geo data

In [11]:
df_geo.head()

Unnamed: 0,longitude,latitude,school_code,countyMatch,isSchool
0,-122.215864,37.803399,7092463,1,1
1,-121.882682,37.716306,6972533,1,1
2,-122.196244,37.758459,7082266,1,1
3,-122.234422,37.782892,6910343,1,1
4,-122.243885,37.768844,6967434,1,1


## Columns:
* school_code: Unique integer code for each school (consistent across years)
* countyMatch: indicator variable is entry is in the appropriate county
* isSchool: indicator variable (if search returned "school")

In [12]:
df_geo.shape

(10023, 5)

In [13]:
df_geo['school_code'].nunique()

10023

In [14]:
# So each row is unique for school code.
df_geo[df_geo['school_code'].isnull()].shape

(0, 5)

In [15]:
df_geo['countyMatch'].value_counts()

1    9350
0     673
Name: countyMatch, dtype: int64

In [16]:
df_geo[df_geo['countyMatch'].isnull()].shape

(0, 5)

In [17]:
9350 + 673

10023

In [18]:
# So all good here.
df_geo[df_geo['isSchool'].isnull()].shape

(0, 5)

In [19]:
df_geo['isSchool'].value_counts()

1    8997
0    1026
Name: isSchool, dtype: int64

In [20]:
8997 + 1026

10023

In [21]:
# Ok where are all of these places? They should be in california.
geo_plot_world(df=df_geo, lat_col='latitude', lon_col='longitude')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
# So some of these places are weird. What are these odd geospatial coordinates?
geo_plot_world(df=df_geo.loc[(df_geo['countyMatch'] == 0)], 
               lat_col='latitude', 
               lon_col='longitude')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [23]:
geo_plot_world(df=df_geo.loc[(df_geo['countyMatch'] == 1)], 
               lat_col='latitude', 
               lon_col='longitude')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
# Beauty.  Ok so we should only consider California examples.
df_geo = df_geo.loc[(df_geo['countyMatch'] == 1)]

In [25]:
df_geo.shape

(9350, 5)

In [26]:
# Ok, how many are identified as schools
df_geo['isSchool'].value_counts()

1    8616
0     734
Name: isSchool, dtype: int64

In [27]:
# So the majority are schools.  Can we visualize to see if there is a pattern or cluster here?
geo_plot_world(df=df_geo, 
               lat_col='latitude', 
               lon_col='longitude',
               colour_col='isSchool')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
