In [8]:
import sqlite3 as sql_client
import pandas as data_analyzer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [9]:
# Path to the wildfire SQLite database, relative to the working directory.
SQL_DB_FILE_PATH = './FPA_FOD_20170508.sqlite'

# Cause labels listed in code order: a label's position in this tuple is the
# integer code it receives in causes_map.
_CAUSE_LABELS = (
  'Miscellaneous',
  'Lightning',
  'Debris Burning',
  'Campfire',
  'Equipment Use',
  'Arson',
  'Children',
  'Railroad',
  'Smoking',
  'Powerline',
  'Structure',
  'Fireworks',
  'Missing/Undefined',
)

# Lookup from cause label to integer code (0..12).
causes_map = {label: code for code, label in enumerate(_CAUSE_LABELS)}


In [10]:
def set_causes_map(data_frame, description):
  """Add any cause labels found in `data_frame` to the module-level `causes_map`.

  Labels that are already present keep their existing integer code, so codes
  stay stable. Each previously unseen label receives the next free code; new
  labels are handled in sorted order so the assignment is deterministic
  (plain set iteration order is not). Non-string entries such as None are
  ignored.

  Args:
    data_frame: Object whose 'cause' entry is an iterable of cause labels
      (for example a DataFrame with a 'cause' column).
    description: Unused; accepted so the function matches the shared
      (data_frame, description) executor signature.

  Returns:
    None. `causes_map` is updated in place.
  """
  # Mutate the shared dict instead of assigning to the name: an assignment
  # inside the function would only bind a local variable and leave the
  # module-level map untouched.
  known_codes = causes_map
  unseen_labels = sorted(
    label for label in set(data_frame['cause'])
    if isinstance(label, str) and label not in known_codes
  )
  # Continue numbering after the highest code in use (starts at 0 if empty).
  next_code = max(known_codes.values(), default=-1) + 1
  for offset, label in enumerate(unseen_labels):
    known_codes[label] = next_code + offset

In [11]:
def display_rows(data_frame, description):
  """Print a separator, the description and the complete contents of `data_frame`.

  Args:
    data_frame: Object providing `to_string()`; its full text form is printed.
    description: Text shown after 'Description: '. It is concatenated to a
      string, so it must itself be a str.
  """
  rule = '=' * 88
  print(rule)
  heading = 'Description: ' + description
  print(heading)
  print('Result:')
  # to_string() renders every row, not a shortened preview.
  table_text = data_frame.to_string()
  print(table_text)

In [12]:
def train_and_test(data_frame, description):
  """Fit a random-forest cause classifier on `data_frame` and print its test accuracy.

  The 'cause' column is encoded to integer codes through `causes_map`; every
  other column is used as a feature. Rows are split 80% / 20% into train and
  test sets, and the accuracy on the test set is printed as a percentage.

  Args:
    data_frame: DataFrame with a 'cause' column of cause labels plus feature columns.
    description: Unused; accepted so the function matches the shared
      (data_frame, description) executor signature.
  """
  # Encode first, then drop incomplete rows: a label that is missing from
  # causes_map maps to NaN, and dropping afterwards removes such rows instead
  # of passing NaN targets on to the classifier.
  encoded = data_frame.assign(cause=data_frame['cause'].map(causes_map)).dropna()
  X = encoded.drop(['cause'], axis=1).values
  # The mapped column is float whenever a NaN was produced; restore integer class codes.
  Y = encoded['cause'].astype(int).values
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)  # 80% to train; 20% test
  # Seed the forest as well as the split so the printed score is reproducible.
  classifier = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_train, Y_train)
  score = classifier.score(X_test, Y_test) * 100
  print('Test Set Score: {} %'.format(score))


In [13]:
# Work list for the notebook. Each entry is [SQL query, description, executor];
# the run loop reads the query result into a DataFrame and calls
# executor(data_frame, description).
queries_and_executors = [
  # Number of fire records per year, in ascending year order.
  [
    """
    SELECT fire_year, COUNT(*) as incident_count
    FROM fires
    GROUP BY fire_year
    ORDER BY fire_year ASC
    """, 
    'Have wildfires become more or less frequent over time?',
    display_rows
  ],
  # Number of fire records per county, highest count first.
  [
    """
    SELECT county, COUNT(*) as incident_count
    FROM fires
    GROUP BY county
    ORDER BY incident_count DESC
    """,
    'What counties are the most and least fire-prone?',
    display_rows
  ],
  # A DISTINCT query over the causes would read the entire table, so this entry
  # fetches only 100 rows of cause labels once and hands them to set_causes_map.
  # NOTE(review): 100 rows may not contain every distinct cause - confirm the
  # sample is sufficient.
  [
    """
    SELECT stat_cause_descr as cause
    FROM fires
    LIMIT 100
    """, 
    'All Causes',
    set_causes_map
  ],
  # Location, size and date-derived columns plus the cause label, capped at
  # 50000 rows, passed to train_and_test. There is no ORDER BY, so which rows
  # are returned is left to SQLite.
  [
    """
    SELECT
      latitude,
      longitude,
      fire_size as size,
      strftime('%w', discovery_date) as day_of_week,
      strftime('%m', discovery_date) as month,
      fire_year as year,
      stat_cause_descr as cause
    FROM fires
    LIMIT 50000
    """, 
    'Sample Data',
    train_and_test
  ]
]


In [14]:
# Open one connection to the wildfire database and run every configured query
# through it. The connection is left open; a later cell reads from it again.
connection = sql_client.connect(SQL_DB_FILE_PATH)
for query_description_executor in queries_and_executors:
  # Each entry is laid out as [query, description, executor].
  query, description, executor = query_description_executor[:3]
  # Load the result set into a DataFrame and pass it to the entry's executor.
  data_frame = data_analyzer.read_sql(query, con=connection)
  executor(data_frame, description)

Description: Have wildfires become more or less frequent over time?
Result:
    FIRE_YEAR  incident_count
0        1992           67975
1        1993           61989
2        1994           75955
3        1995           71472
4        1996           75574
5        1997           61450
6        1998           68370
7        1999           89363
8        2000           96416
9        2001           86587
10       2002           75656
11       2003           68261
12       2004           69279
13       2005           88604
14       2006          114004
15       2007           95573
16       2008           85378
17       2009           78325
18       2010           79889
19       2011           90552
20       2012           72769
21       2013           64780
22       2014           67753
23       2015           74491
Description: What counties are the most and least fire-prone?
Result:
                                                  COUNTY  incident_count
0                              

In [15]:
import numpy as np
import colorcet as cc
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, LogColorMapper

In [16]:
# Load latitude, longitude, fire size and state of every fire into a pandas DataFrame.
df = data_analyzer.read_sql_query(
  sql="SELECT LATITUDE, LONGITUDE, FIRE_SIZE, STATE FROM fires",
  con=connection,
)
# Preview the first 5 rows (the default for head()).
df.head()

Unnamed: 0,LATITUDE,LONGITUDE,FIRE_SIZE,STATE
0,40.036944,-121.005833,0.1,CA
1,38.933056,-120.404444,0.25,CA
2,38.984167,-120.735556,0.1,CA
3,38.559167,-119.913333,0.1,CA
4,38.559167,-119.933056,0.1,CA


In [17]:
# Remove all wildfires in Alaska, Hawaii and Puerto Rico, because they don't fit on our map nicely.
# .copy() makes `new` an independent frame rather than a selection of `df`, so the
# column assignments made to `new` in later cells are unambiguous writes.
new = df.loc[~df['STATE'].isin(['AK', 'HI', 'PR'])].copy()

In [20]:
# Turn off pandas' chained-assignment warnings for the whole session; the
# notebook author considers them false positives here.
data_analyzer.set_option('mode.chained_assignment', None)

In [21]:
# Group wildfires that occurred near each other: floor latitude and longitude to
# one decimal place, join the two binned values into a new key column (LL_COMBO),
# and group the frame by that key together with the binned coordinates.
# These assignments overwrite the coordinate columns of `new` in place.
new.loc[:, 'LATITUDE'] = np.floor(new['LATITUDE'] * 10) / 10
new.loc[:, 'LONGITUDE'] = np.floor(new['LONGITUDE'] * 10) / 10
new.loc[:, 'LL_COMBO'] = new['LATITUDE'].map(str) + '-' + new['LONGITUDE'].map(str)
grouped = new.groupby(by=['LL_COMBO', 'LATITUDE', 'LONGITUDE'])

In [22]:
# Data source for the first heat map: the number of wildfires in each location bin.
number_of_wf = grouped['FIRE_SIZE'].count().reset_index(name='count')
number_of_wf.head()

Unnamed: 0,LL_COMBO,LATITUDE,LONGITUDE,count
0,24.5--81.7,24.5,-81.7,1
1,24.6--81.3,24.6,-81.3,2
2,24.6--81.4,24.6,-81.4,184
3,24.6--81.5,24.6,-81.5,43
4,24.6--81.6,24.6,-81.6,11


In [23]:
# Data source for the second heat map: the average wildfire size in each location bin.
size_of_wf = grouped['FIRE_SIZE'].mean().reset_index(name='mean')
size_of_wf.head()

Unnamed: 0,LL_COMBO,LATITUDE,LONGITUDE,mean
0,24.5--81.7,24.5,-81.7,0.5
1,24.6--81.3,24.6,-81.3,0.1
2,24.6--81.4,24.6,-81.4,0.50163
3,24.6--81.5,24.6,-81.5,0.646512
4,24.6--81.6,24.6,-81.6,0.954545


In [24]:
# First heat map: one dot per location bin, colored by the number of wildfires in it.
source = ColumnDataSource(number_of_wf)
p1 = figure(
  title=("Number of wildfires occurring from 1992 to 2015 "
         "(lighter color means more wildfires)"),
  toolbar_location=None,
  plot_width=600,
  plot_height=400,
)
# Black background with grid lines and axes hidden.
p1.background_fill_color = "black"
p1.grid.grid_line_color = None
p1.axis.visible = False
# Log-scaled color mapping onto the colorcet "fire" palette.
color_mapper = LogColorMapper(palette=cc.fire)
glyph = p1.circle(
  'LONGITUDE',
  'LATITUDE',
  source=source,
  color=dict(field='count', transform=color_mapper),
  size=1,
)
# Direct bokeh output to the notebook, then render the figure.
output_notebook()
show(p1)

In [25]:
# Second heat map: one dot per location bin, colored by the average wildfire size in it.
source = ColumnDataSource(size_of_wf)
p2 = figure(
  title=("Average size of wildfires occurring from 1992 to 2015 "
         "(lighter color means bigger fire)"),
  toolbar_location=None,
  plot_width=600,
  plot_height=400,
)
# Black background with grid lines and axes hidden.
p2.background_fill_color = "black"
p2.grid.grid_line_color = None
p2.axis.visible = False
# Reuses the color_mapper object created in the first heat-map cell.
glyph = p2.circle(
  'LONGITUDE',
  'LATITUDE',
  source=source,
  color=dict(field='mean', transform=color_mapper),
  size=1,
)
show(p2)