In [1]:
## Import Libraries ##
import os
import sqlite3
from sqlite3 import Error

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from sqlalchemy import create_engine, MetaData, Table, select, func, desc, inspect, text

## Connect to Database

In [3]:
# Location of the FPA FOD SQLite file. Set the WILDFIRE_DB_PATH environment
# variable to point at a local copy; the default is the original author's path.
DB_PATH = os.environ.get(
    'WILDFIRE_DB_PATH',
    'C:/Users/604906/Desktop/BAH/Data Science/WiDS/Wildfires/188-million-us-wildfires/FPA_FOD_20170508.sqlite',
)
# SQLite URLs are 'sqlite:///' followed by the file path
engine = create_engine('sqlite:///' + DB_PATH)

In [4]:
# Open a connection on the engine; the query that loads the Fires table is executed through it
connection = engine.connect()

In [5]:
# Print the table names of the database to make sure the connection worked.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector returned by inspect(engine) is the supported way to list tables.
print(inspect(engine).get_table_names())

['ElementaryGeometries', 'Fires', 'KNN', 'NWCG_UnitIDActive_20170109', 'SpatialIndex', 'geometry_columns', 'geometry_columns_auth', 'geometry_columns_field_infos', 'geometry_columns_statistics', 'geometry_columns_time', 'idx_Fires_Shape', 'idx_Fires_Shape_node', 'idx_Fires_Shape_parent', 'idx_Fires_Shape_rowid', 'spatial_ref_sys', 'spatial_ref_sys_aux', 'spatialite_history', 'sql_statements_log', 'sqlite_sequence', 'views_geometry_columns', 'views_geometry_columns_auth', 'views_geometry_columns_field_infos', 'views_geometry_columns_statistics', 'virts_geometry_columns', 'virts_geometry_columns_auth', 'virts_geometry_columns_field_infos', 'virts_geometry_columns_statistics']


## Connect to Fires Table

In [6]:
# MetaData is the SQLAlchemy container object that holds the table information
# (such as datatypes, primary keys, etc.) of the Table objects registered on it
metadata = MetaData()

In [7]:
# Create a Table object that can be used to query the data.
# Passing autoload_with=engine is enough to reflect the columns from the
# database; the separate autoload=True flag is deprecated since SQLAlchemy 1.4.
Fires = Table('Fires', metadata, autoload_with=engine)

In [8]:
# Load the whole Fires table into a DataFrame.
# NOTE(review): SELECT * pulls every column although only eight are used below;
# selecting just those columns in SQL would reduce memory use.
stmt = 'SELECT * FROM Fires'

try:
    # text() wraps the raw SQL string in an executable construct, which is
    # required by SQLAlchemy 2.0 and accepted by earlier versions
    query_result = connection.execute(text(stmt))

    # Read the column names from the result itself rather than from results[0],
    # so an empty table does not raise IndexError
    column_names = list(query_result.keys())

    # Store all the records: results
    results = query_result.fetchall()
except Exception as err:
    # Report the error, then re-raise: every later cell needs `fires`, so
    # continuing would only surface as a confusing NameError further down
    print(err)
    raise

# Create a DataFrame from the results, with the column names set: fires
fires = pd.DataFrame.from_records(results, columns=column_names)

In [9]:
# Look at the columns in Fires: info() lists each column name with its dtype
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 39 columns):
OBJECTID                      int64
FOD_ID                        int64
FPA_ID                        object
SOURCE_SYSTEM_TYPE            object
SOURCE_SYSTEM                 object
NWCG_REPORTING_AGENCY         object
NWCG_REPORTING_UNIT_ID        object
NWCG_REPORTING_UNIT_NAME      object
SOURCE_REPORTING_UNIT         object
SOURCE_REPORTING_UNIT_NAME    object
LOCAL_FIRE_REPORT_ID          object
LOCAL_INCIDENT_ID             object
FIRE_CODE                     object
FIRE_NAME                     object
ICS_209_INCIDENT_NUMBER       object
ICS_209_NAME                  object
MTBS_ID                       object
MTBS_FIRE_NAME                object
COMPLEX_NAME                  object
FIRE_YEAR                     int64
DISCOVERY_DATE                float64
DISCOVERY_DOY                 int64
DISCOVERY_TIME                object
STAT_CAUSE_CODE               float64
S

## Feature Selection

In [10]:
# Keep only the columns used in the rest of the analysis
selected_columns = [
    'FIRE_YEAR',
    'DISCOVERY_DATE',
    'STAT_CAUSE_DESCR',
    'FIRE_SIZE',
    'STATE',
    'FIPS_NAME',
    'LATITUDE',
    'LONGITUDE',
]
df = fires[selected_columns]

In [11]:
# Confirm that only the selected columns remain and check their dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 8 columns):
FIRE_YEAR           int64
DISCOVERY_DATE      float64
STAT_CAUSE_DESCR    object
FIRE_SIZE           float64
STATE               object
FIPS_NAME           object
LATITUDE            float64
LONGITUDE           float64
dtypes: float64(4), int64(1), object(3)
memory usage: 114.8+ MB


In [12]:
# Check the data: preview the first rows of the selected columns
print(df.head())

   FIRE_YEAR  DISCOVERY_DATE STAT_CAUSE_DESCR  FIRE_SIZE STATE  FIPS_NAME  \
0       2005       2453403.5    Miscellaneous       0.10    CA     Plumas   
1       2004       2453137.5        Lightning       0.25    CA     Placer   
2       2004       2453156.5   Debris Burning       0.10    CA  El Dorado   
3       2004       2453184.5        Lightning       0.10    CA     Alpine   
4       2004       2453184.5        Lightning       0.10    CA     Alpine   

    LATITUDE   LONGITUDE  
0  40.036944 -121.005833  
1  38.933056 -120.404444  
2  38.984167 -120.735556  
3  38.559167 -119.913333  
4  38.559167 -119.933056  


In [20]:
# Only look at California: build a boolean mask on STATE and keep the matching rows
is_california = df['STATE'] == 'CA'
cali_df = df[is_california]

In [21]:
# Sanity check: after the filter, the only value left in STATE should be 'CA'
cali_df.STATE.unique()

array(['CA'], dtype=object)

In [22]:
# Data shape after filtering to California: (rows, columns)
cali_df.shape
# The full table has 1.88 million records -> now looking at 189,550 California records.

(189550, 8)

In [25]:
# Drop the rows that contain missing values.
# dropna() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back; otherwise cali_df still contains the NaN rows.
# NOTE(review): this comment previously cited 678,148 missing records, which is
# more than the 189,550 California rows; presumably that count refers to the
# full dataset. TODO confirm.
cali_df = cali_df.dropna()
cali_df

Unnamed: 0,FIRE_YEAR,DISCOVERY_DATE,STAT_CAUSE_DESCR,FIRE_SIZE,STATE,FIPS_NAME,LATITUDE,LONGITUDE
0,2005,2453403.5,Miscellaneous,0.10,CA,Plumas,40.036944,-121.005833
1,2004,2453137.5,Lightning,0.25,CA,Placer,38.933056,-120.404444
2,2004,2453156.5,Debris Burning,0.10,CA,El Dorado,38.984167,-120.735556
3,2004,2453184.5,Lightning,0.10,CA,Alpine,38.559167,-119.913333
4,2004,2453184.5,Lightning,0.10,CA,Alpine,38.559167,-119.933056
...,...,...,...,...,...,...,...,...
1872300,2015,2457300.5,Missing/Undefined,0.01,CA,Sacramento,38.691628,-121.372908
1872301,2015,2457316.5,Miscellaneous,0.10,CA,Calaveras,38.172881,-120.801955
1872302,2015,2457353.5,Miscellaneous,0.10,CA,Riverside,33.919157,-116.882973
1872303,2015,2457347.5,Debris Burning,0.25,CA,Fresno,37.110917,-119.322918


In [30]:
# List the 10 counties in CA with the most wildfire records
# (count the rows per FIPS_NAME and keep the first ten entries)
cali_df['FIPS_NAME'].value_counts().head(10)

Riverside         6925
Los Angeles       2703
El Dorado         2676
San Bernardino    2642
San Diego         2443
Fresno            2183
Siskiyou          2156
Shasta            2118
Butte             1768
Kern              1737
Name: FIPS_NAME, dtype: int64

In [161]:
#Maybe focus on the "Top X" Counties
#For now, focus on these Top 10

In [31]:
# Derive the ten most frequent counties from the data instead of hand-copying
# the names from the previous cell's output, so the selection stays correct if
# the upstream filtering changes. value_counts() ignores missing FIPS_NAME values.
top_county_names = cali_df['FIPS_NAME'].value_counts().head(10).index

# Keep only the fires that occurred in those counties
top10_county = cali_df[cali_df['FIPS_NAME'].isin(top_county_names)]

In [32]:
# Number of (rows, columns) left after restricting to the top 10 counties
top10_county.shape

(27351, 8)

In [33]:
# Display the filtered frame to eyeball the remaining records
top10_county

Unnamed: 0,FIRE_YEAR,DISCOVERY_DATE,STAT_CAUSE_DESCR,FIRE_SIZE,STATE,FIPS_NAME,LATITUDE,LONGITUDE
2,2004,2453156.5,Debris Burning,0.10,CA,El Dorado,38.984167,-120.735556
6,2004,2453187.5,Lightning,0.10,CA,El Dorado,38.688333,-120.153333
10,2004,2453188.5,Lightning,0.10,CA,El Dorado,38.691667,-120.159722
12,2004,2453251.5,Miscellaneous,0.10,CA,El Dorado,38.786667,-120.193333
14,2004,2453281.5,Lightning,0.20,CA,El Dorado,38.675833,-120.279722
...,...,...,...,...,...,...,...,...
1872295,2015,2457332.5,Debris Burning,9.90,CA,Fresno,36.945877,-119.503609
1872296,2015,2457300.5,Missing/Undefined,0.01,CA,Riverside,33.789728,-117.255233
1872298,2015,2457367.5,Missing/Undefined,0.02,CA,Riverside,33.709175,-116.179773
1872302,2015,2457353.5,Miscellaneous,0.10,CA,Riverside,33.919157,-116.882973


In [34]:
# Find the earliest and latest FIRE_YEAR among the top 10 counties
top10_county.FIRE_YEAR.describe()[['min','max']]
# Plan: train on 1997-2014
# Plan: test on 2015 data
# NOTE(review): the experiments below split with a random train_test_split, not by year

min    1997.0
max    2015.0
Name: FIRE_YEAR, dtype: float64

In [None]:
# Problem 1: Which county will the fire occur in? 
# Can the machine predict which county the wildfire will occur in, given data such as location, day of week, etc.?

In [2]:
# Drop LATITUDE and LONGITUDE (keeping them would defeat the purpose of predicting
# where the wildfire occurs) and STATE (not needed here)
# NOTE(review): the saved output of this cell is a NameError, apparently because it
# was executed before top10_county was defined; run the cells above first.
test1 = top10_county.drop(columns=['LATITUDE','LONGITUDE','STATE'])
test1

NameError: name 'top10_county' is not defined

In [68]:
# Use LabelEncoder to label-encode STAT_CAUSE_DESCR & FIPS_NAME: each category
# is replaced by an integer code (this is not dummy/one-hot encoding).
from sklearn.preprocessing import LabelEncoder

# One encoder per column. Refitting a single encoder on the second column
# overwrites the classes learned for the first, so the cause codes could no
# longer be mapped back to their names with inverse_transform.
cause_encoder = LabelEncoder()   # STAT_CAUSE_DESCR : cause of fire
county_encoder = LabelEncoder()  # FIPS_NAME : county name

test1['STAT_CAUSE_DESCR'] = cause_encoder.fit_transform(test1['STAT_CAUSE_DESCR'])
test1['FIPS_NAME'] = county_encoder.fit_transform(test1['FIPS_NAME'])

# Backward-compatible alias: `le` previously ended up fitted on FIPS_NAME
le = county_encoder

In [72]:
#Standardize data (features have different scales/units)
from sklearn.preprocessing import StandardScaler

# Only the continuous columns are scaled; the label-encoded STAT_CAUSE_DESCR & FIPS_NAME are left as they are
col_names = ['FIRE_YEAR', 'DISCOVERY_DATE','FIRE_SIZE']

# Work on a copy so that test1 keeps the unscaled values
scaled_test1 = test1.copy()

# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so the test rows influence the mean/std applied to the training rows
scaler = StandardScaler()
features = scaler.fit_transform(scaled_test1[col_names].values)

In [73]:
# Write the standardized values back into those three columns
scaled_test1[col_names] = features
# Print the frame to check the scaled columns next to the untouched encoded columns
print(scaled_test1)

         FIRE_YEAR  DISCOVERY_DATE  STAT_CAUSE_DESCR  FIRE_SIZE  FIPS_NAME
2        -0.976623       -1.001333                 3  -0.040807          1
6        -0.976623       -0.984110                 6  -0.040807          1
10       -0.976623       -0.983554                 6  -0.040807          1
12       -0.976623       -0.948553                 7  -0.040807          1
14       -0.976623       -0.931886                 6  -0.040765          1
...            ...             ...               ...        ...        ...
1872295   1.252841        1.318751                 3  -0.036757          2
1872296   1.252841        1.300973                 8  -0.040844          5
1872298   1.252841        1.338197                 8  -0.040840          5
1872302   1.252841        1.330419                 7  -0.040807          5
1872303   1.252841        1.327085                 3  -0.040745          2

[27351 rows x 5 columns]


In [74]:
## Partitioning Data: the county (FIPS_NAME) is the target, every other column is a feature
target_col = 'FIPS_NAME'
X = scaled_test1.drop(columns=[target_col]).values
y = scaled_test1[target_col].values

In [76]:
from sklearn.model_selection import train_test_split
# Random hold-out split; random_state=0 makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) #30% for testing, 70% for training

In [79]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so that the forest, and therefore the reported score,
# is reproducible from run to run (it was previously left unseeded)
rf_clsf = RandomForestClassifier(n_estimators=50, random_state=0)
rf_clsf = rf_clsf.fit(X_train, y_train)
# Mean accuracy on the held-out 30%
print(rf_clsf.score(X_test,y_test))

0.32451864489398


In [81]:
# Try predicting the cause of the wildfire instead of the county.
# List the distinct encoded cause labels: there appear to be 13 causes for these top 10 counties.
scaled_test1.STAT_CAUSE_DESCR.unique()

array([ 3,  6,  7,  4,  0,  1,  2, 10, 11,  9,  8,  5, 12], dtype=int64)

In [82]:
## Partitioning Data (2): the fire cause (STAT_CAUSE_DESCR) is the target, every other column is a feature
target_col = 'STAT_CAUSE_DESCR'
X = scaled_test1.drop(columns=[target_col]).values
y = scaled_test1[target_col].values

In [84]:
# Re-split for the cause target with the same 70/30 ratio and seed as the first experiment
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) #30% for testing, 70% for training

In [85]:
#Random Forest (2)
# random_state is fixed so that the forest, and therefore the reported score,
# is reproducible from run to run (it was previously left unseeded)
rf_clsf = RandomForestClassifier(n_estimators=50, random_state=0)
rf_clsf = rf_clsf.fit(X_train, y_train)
# Mean accuracy on the held-out 30%
print(rf_clsf.score(X_test,y_test))

0.36010236412381186


In [89]:
## Only include the scaled numeric columns as features (drop the other label-encoded variable)
scaled_test1a = scaled_test1.drop(columns=['STAT_CAUSE_DESCR'])

## Partitioning Data (3): the county (FIPS_NAME) is the target
county_col = 'FIPS_NAME'
X = scaled_test1a.drop(columns=[county_col]).values
y = scaled_test1a[county_col].values

In [90]:
# Re-split the numeric-only features for the county target (same 70/30 ratio and seed)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) #30% for testing, 70% for training

In [91]:
#Random Forest (3)
# random_state is fixed so that the forest, and therefore the reported score,
# is reproducible from run to run (it was previously left unseeded)
rf_clsf = RandomForestClassifier(n_estimators=50, random_state=0)
rf_clsf = rf_clsf.fit(X_train, y_train)
# Mean accuracy on the held-out 30%
print(rf_clsf.score(X_test,y_test))

0.271996100414331


In [93]:
## Only include the scaled numeric columns as features (drop the other label-encoded variable) #2
scaled_test1b = scaled_test1.drop(columns=['FIPS_NAME'])

## Partitioning Data (4): the fire cause (STAT_CAUSE_DESCR) is the target
cause_col = 'STAT_CAUSE_DESCR'
X = scaled_test1b.drop(columns=[cause_col]).values
y = scaled_test1b[cause_col].values

In [94]:
# Re-split the numeric-only features for the cause target (same 70/30 ratio and seed)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) #30% for testing, 70% for training

In [95]:
#Random Forest (4)
# random_state is fixed so that the forest, and therefore the reported score,
# is reproducible from run to run (it was previously left unseeded)
rf_clsf = RandomForestClassifier(n_estimators=50, random_state=0)
rf_clsf = rf_clsf.fit(X_train, y_train)
# Mean accuracy on the held-out 30%
print(rf_clsf.score(X_test,y_test))

0.30246161345357053
