In [1]:
import pandas as pd
#pandas is an awesome library for python, lets you do lots of stuff you might otherwise do in R

In [2]:
#create a pandas dataframe by reading in the csv form the remote site(in a single line of code <#)
df = pd.read_csv('http://www.eia.gov/state/seds/CDF/Complete_SEDS.csv')

In [3]:
#head just prints the first few lines
df.head()

Unnamed: 0,Data_Status,MSN,StateCode,Year,Data
0,2014F,ABICB,AK,1960,0.0
1,2014F,ABICB,AK,1961,0.0
2,2014F,ABICB,AK,1962,0.0
3,2014F,ABICB,AK,1963,0.0
4,2014F,ABICB,AK,1964,0.0


In [4]:
#we only want some of the values in the raw data, we can filter to help keep the size of our DB down
values_list = [
    "TETCB",
    "FFTCB",
    "CLTCB",
    "NNTCB",
    "PMTCB",
    "NUETB",
    "RETCB",
    "EMLCB",
    "EMTCB",
    "GETCB",
    "HYTCB",
    "SOTCB",
    "WWTCB",
    "WYTCB",
    "ELNIB",
    "ELISB" 
]
#make a new dataframe that is a filtered version of the original
filtered_df = df[df['MSN'].isin(values_list)]
#lets also cast the column names to lower case
filtered_df.columns = filtered_df.columns.str.lower()

In [5]:
#filtering this data makes the size way more managable
print("Original: ", df.size, "Filtered: ", filtered_df.size)

Original:  7780315 Filtered:  228800


In [6]:
#write the file to csv for refrence. this file is about 1.5 megs, way smaller than 50 from the original file!
filtered_df.to_csv('./data/filtered_data.csv')

In [7]:
#couple ways to filter/sort this data
"""
We've got Years, States, and Data Values (MSN+ it's data)
i.e. If the user visits the site and we want them to see an inital nationwide view for the most recent year
"""
#pretty easy to sort, even by multiple values
filtered_df.sort_values(['year', 'statecode', 'msn'])


Unnamed: 0,data_status,msn,statecode,year,data
171895,2014F,CLTCB,AK,1960,7189.0
314800,2014F,ELISB,AK,1960,0.0
318960,2014F,ELNIB,AK,1960,0.0
339708,2014F,EMLCB,AK,1960,0.0
342568,2014F,EMTCB,AK,1960,0.0
428114,2014F,FFTCB,AK,1960,54634.0
478346,2014F,GETCB,AK,1960,0.0
501226,2014F,HYTCB,AK,1960,3120.0
950111,2014F,NNTCB,AK,1960,2034.0
963371,2014F,NUETB,AK,1960,0.0


In [8]:
#filter specific year (can also be a list of values) 
filtered_df[filtered_df['year'].isin([2012, 2013])].sort_values(['year', 'statecode', 'msn'])

Unnamed: 0,data_status,msn,statecode,year,data
171947,2014F,CLTCB,AK,2012,15521.0
314852,2014F,ELISB,AK,2012,0.0
319012,2014F,ELNIB,AK,2012,4.0
339760,2014F,EMLCB,AK,2012,0.0
342620,2014F,EMTCB,AK,2012,1882.0
428166,2014F,FFTCB,AK,2012,629768.0
478398,2014F,GETCB,AK,2012,186.0
501278,2014F,HYTCB,AK,2012,14988.0
950163,2014F,NNTCB,AK,2012,347228.0
963423,2014F,NUETB,AK,2012,0.0


In [62]:
"""some simple examples of rudimentary data exploration"""
#years
print("Years: ", filtered_df.year.unique(), "Total: ", filtered_df.year.unique().size)
#datatypes
print("data_types: ", filtered_df.msn.unique(), "Total: ", filtered_df.msn.unique().size)
#states
print("data_types: ", filtered_df.statecode.unique(), "Total: ", filtered_df.statecode.unique().size)
filtered_df[filtered_df['year'].isin([2013])].sort_values(['data', 'year', 'statecode', 'msn'])


Years:  [1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974
 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014] Total:  55
data_types:  ['CLTCB' 'ELISB' 'ELNIB' 'EMLCB' 'EMTCB' 'FFTCB' 'GETCB' 'HYTCB' 'NNTCB'
 'NUETB' 'PMTCB' 'RETCB' 'SOTCB' 'TETCB' 'WWTCB' 'WYTCB'] Total:  16
data_types:  ['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'HI' 'IA' 'ID' 'IL'
 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC' 'ND' 'NE'
 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD' 'TN' 'TX' 'US'
 'UT' 'VA' 'VT' 'WA' 'WI' 'WV' 'WY'] Total:  52


Unnamed: 0,data_status,msn,statecode,year,data
316943,2014F,ELISB,PA,2013,-688638.0
315623,2014F,ELISB,IL,2013,-523671.0
314908,2014F,ELISB,AL,2013,-514409.0
317603,2014F,ELISB,WV,2013,-429515.0
317658,2014F,ELISB,WY,2013,-354988.0
315018,2014F,ELISB,AZ,2013,-325837.0
316393,2014F,ELISB,ND,2013,-210253.0
316283,2014F,ELISB,MT,2013,-132752.0
316613,2014F,ELISB,NM,2013,-116455.0
317053,2014F,ELISB,SC,2013,-97054.0


In [92]:
import folium

state_geo = r'data/states_w_abrv_extra_simple.geojson'
data_point = 'SOTCB'
selection = filtered_df[filtered_df['year'].isin([2013]) & filtered_df['msn'].isin([data_point])]

selection.sort_values('data')

Unnamed: 0,data_status,msn,statecode,year,data
1326894,2014F,SOTCB,AK,2013,10.0
1329149,2014F,SOTCB,SD,2013,10.0
1328434,2014F,SOTCB,ND,2013,10.0
1329699,2014F,SOTCB,WY,2013,20.0
1328269,2014F,SOTCB,MS,2013,35.0
1328489,2014F,SOTCB,NE,2013,61.0
1328874,2014F,SOTCB,OK,2013,62.0
1328324,2014F,SOTCB,MT,2013,65.0
1327774,2014F,SOTCB,KS,2013,68.0
1327609,2014F,SOTCB,ID,2013,71.0


In [104]:

m = folium.Map([43, -100], zoom_start=4)

us_states = state_geo

m.choropleth(
    geo_path=us_states,
    data=selection,
    columns=['statecode', 'data'],
    key_on='feature.properties.STUSPS10',
    fill_color='YlGn',
    threshold_scale=[10, 25, 50, 75, 100]

)

#example binning

m

In [53]:
sim = folium.Map(location=[48, -102], zoom_start=3)
sim.choropleth(geo_path=us_states, line_color='blue',line_weight=3)

In [96]:
help(m.choloropleth)

AttributeError: 'Map' object has no attribute 'choloropleth'

In [97]:
help(m.choropleth)

Help on method choropleth in module folium.folium:

choropleth(geo_path=None, geo_str=None, data_out='data.json', data=None, columns=None, key_on=None, threshold_scale=None, fill_color='blue', fill_opacity=0.6, line_color='black', line_weight=1, line_opacity=1, legend_name='', topojson=None, reset=False) method of folium.folium.Map instance
    Apply a GeoJSON overlay to the map.
    
    Plot a GeoJSON overlay on the base map. There is no requirement
    to bind data (passing just a GeoJSON plots a single-color overlay),
    but there is a data binding option to map your columnar data to
    different feature objects with a color scale.
    
    If data is passed as a Pandas dataframe, the "columns" and "key-on"
    keywords must be included, the first to indicate which DataFrame
    columns to use, the second to indicate the layer in the GeoJSON
    on which to key the data. The 'columns' keyword does not need to be
    passed for a Pandas series.
    
    Colors are generated from c