* Part 1: import libraries and data
* Part 2: EDA
* Part 3: Predict the cause of wildfire
* Part 4: Try to predict if the fire was caused knowingly

**Part 1**

In [None]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree, preprocessing
import sklearn.ensemble as ske
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from subprocess import check_output
conn = sqlite3.connect("capstone.sqlite")
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

The code below reads the `Fires` table through the SQLite connection opened above.

In [None]:
data = pd.read_sql_query("SELECT * FROM 'Fires'", conn)
print(data.head())

The only columns we need are "FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE"

In [None]:
data = pd.read_sql_query("SELECT FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE FROM 'Fires'", conn)
print(data.head())

In [None]:
# Exclude the catch-all 'Miscellaneous' cause. The explicit copy makes df an
# independent frame, so columns assigned onto it later are not flagged by
# pandas as writes to a slice of `data` (SettingWithCopyWarning).
df = data[data.STAT_CAUSE_DESCR != 'Miscellaneous'].copy()

In [None]:
# Exclude the catch-all 'Miscellaneous' cause. The explicit copy makes df an
# independent frame, so columns assigned onto it later are not flagged by
# pandas as writes to a slice of `data` (SettingWithCopyWarning).
df = data[data.STAT_CAUSE_DESCR != 'Miscellaneous'].copy()

In [None]:
data = df

In [None]:
data.to_csv(r'C:\Users\gurma\Desktop\capstone\fires.csv', index = False)

The most unusual part of this dataset is the date format: DISCOVERY_DATE is stored as a Julian day number, so it is converted to a standard Gregorian calendar date below.

In [None]:
# DISCOVERY_DATE is a Julian day number. Subtracting the Julian day of the
# Unix epoch leaves "days since 1970-01-01", which to_datetime parses directly.
epoch_julian_day = pd.Timestamp(0).to_julian_date()
days_since_epoch = data['DISCOVERY_DATE'] - epoch_julian_day
data['DATE'] = pd.to_datetime(days_since_epoch, unit='D')
print(data.head())  # eyeball the new DATE column

In [None]:
# Derive calendar features from the converted discovery date.
data['MONTH'] = pd.DatetimeIndex(data['DATE']).month
# Series.dt.weekday_name was removed in pandas 1.0; day_name() is its
# replacement and yields the same English day names ('Monday', ...).
data['DAY_OF_WEEK'] = data['DATE'].dt.day_name()
# Snapshot taken before any label encoding, so the textual cause/state/day
# values remain available later in the notebook.
data_orig = data.copy()
print(data.head())

In [None]:
data.to_csv(r'C:\Users\gurma\Desktop\capstone\Data_mod.csv', index = False)

**Part 2**

**Exploratory Data Analysis (EDA):**

Let us first start looking at the cause of fires

In [None]:
data['STAT_CAUSE_DESCR'].value_counts().plot(kind='barh',color='coral')
plt.show()

So there are 13 causes of fire, to be precise. Let us see whether there is any pattern in the day of the week on which fires start.

In [None]:
data['DAY_OF_WEEK'].value_counts().plot(kind='barh',color='coral')
plt.show()

    All the days of the week look almost the same. Let us look at the most common cause of fire.

In [None]:
lightning = data[data['STAT_CAUSE_DESCR']=='Lightning']
lightning['DAY_OF_WEEK'].value_counts().plot(kind='barh',color='coral')
plt.show()

    For lightning the pattern is different, so the increase in fires on weekends may be due to human-caused fires.

In [None]:
arson = data[data['STAT_CAUSE_DESCR']=='Arson']
arson['DAY_OF_WEEK'].value_counts().plot(kind='barh',color='coral')
plt.show()

The arson (human-caused) fires show a useful pattern: most of them occur on weekends, which suggests they could be related to campfires or similar activities.

Let us now break the fires down by state.

In [None]:
data['STATE'].value_counts().head(n=10).plot(kind='barh',color='coral')
plt.show()

Let us narrow down to the top 3.

In [None]:
CA = data[data['STATE']=='CA']
GA = data[data['STATE']=='GA']
TX = data[data['STATE']=='TX']

In [None]:
CA['STAT_CAUSE_DESCR'].value_counts().plot(kind='barh',color='coral',title='causes of fires for CA')
plt.show()

In [None]:
GA['STAT_CAUSE_DESCR'].value_counts().plot(kind='barh',color='coral',title='causes of fires for GA')
plt.show()

In [None]:
TX['STAT_CAUSE_DESCR'].value_counts().plot(kind='barh',color='coral',title='causes of fires for TX')
plt.show()

Let us create a rough map using scatter plot as we have the Latitude and Longitude

In [None]:
data.plot(kind='scatter',x='LONGITUDE',y='LATITUDE',color='coral',alpha=0.3)
plt.show()

There are a lot of categorical columns in this dataset, so let us label-encode them (map each category to an integer) in order to compute the correlation between all of the columns.

In [None]:
# Integer-encode every categorical column with its own LabelEncoder. Refitting
# a single encoder three times keeps only the last mapping, so the cause and
# state codes could no longer be turned back into names; `encoders[col]`
# retains each mapping for inverse_transform.
encoders = {}
for column in ('STAT_CAUSE_DESCR', 'STATE', 'DAY_OF_WEEK'):
    le = preprocessing.LabelEncoder()
    data[column] = le.fit_transform(data[column])
    encoders[column] = le
# `le` is left bound to the DAY_OF_WEEK encoder, exactly as before.
print(data.head())

In [None]:
def plot_corr(data,size=10):
    """Plot the Pearson correlation matrix of the numeric columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are correlated pairwise.
    size : int or float, default 10
        Width and height of the (square) figure, passed to ``figsize``.
    """
    # Correlate numeric/boolean columns only. Older pandas dropped other
    # columns silently, whereas pandas >= 2.0 no longer does and can raise on
    # them (strings, datetimes). Selecting explicitly behaves the same on
    # every version.
    numeric = data.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()  # the default method is pearson
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr,cmap=plt.cm.Oranges)
    positions = range(len(corr.columns))
    plt.xticks(positions, corr.columns)
    plt.yticks(positions, corr.columns)
    # Tilt the column names so long labels do not overlap.
    for tick in ax.get_xticklabels():
        tick.set_rotation(45)
    plt.show()
    

    
plot_corr(data)

There is a noticeable correlation between month and latitude (weather and season are related) and a weaker one between longitude and month,
but no correlation between the target variable and any other column.

**Part 3**

**Preparing the data for machine learning**

Dropping the Dates and NA's

In [None]:
data = data.drop('DATE',axis=1)
data = data.dropna()

Our Target variable is Cause of Fire(" STAT_CAUSE_DESCR ")

In [None]:
X = data.drop(['STAT_CAUSE_DESCR'], axis=1).values
y = data['STAT_CAUSE_DESCR'].values

Linear Regression (the cells below fit `LinearRegression`, not a logistic model)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) #30% for testing, 70% for training

In [None]:
regr = linear_model.LinearRegression()

In [None]:
# Train the model using the training sets
regr.fit(X_train,y_train)

In [None]:
# Make predictions using the testing set
y_pred = regr.predict(X_test)

In [None]:
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test,y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test,y_pred))

In [None]:
regr.fit(X_train,y_train)

In [None]:
regr.score(X_train, y_train)

In [None]:
# Fit on the training split only. Fitting on (X_test, y_test) here would make
# the score reported in the next cell an in-sample number instead of a
# held-out one.
regr.fit(X_train,y_train)

In [None]:
regr.score(X_test, y_test)

Decision Tree

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# `clf` is only created in a later cell, so referencing it here raised
# NameError when the notebook was run top to bottom. Fit a separate, shallow
# tree purely for plotting; max_depth=3 is an arbitrary choice that keeps the
# diagram readable.
viz_tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
tree.plot_tree(viz_tree)
plt.show()


In [None]:
# The target is a class label (the encoded cause), so use a classification
# tree. Its score() is then accuracy, directly comparable with the random
# forest below, rather than an R^2 computed on arbitrary category codes.
clf = tree.DecisionTreeClassifier(max_depth=35)

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.score(X_train, y_train)

In [None]:
# Fit on the training split only. Fitting on (X_test, y_test) here would make
# the score reported in the next cell an in-sample number instead of a
# held-out one.
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

Random Forest

In [None]:
clf_rf = ske.RandomForestClassifier(n_estimators=200)
clf_rf = clf_rf.fit(X_train, y_train)
print(clf_rf.score(X_test,y_test))

Let us narrow down the classes, as there are a lot of classes related to the cause of fire, which could make predicting the cause tedious.

In [None]:
def set_label(cat):
    """Map a fire-cause description to a coarse group code.

    Returns 1 for natural causes, 2 for accidental ones, 3 for malicious ones
    and 4 for everything else (e.g. 'Missing/Undefined', 'Miscellaneous').
    """
    if cat in ('Lightning',):
        return 1  # natural

    accidental_causes = (
        'Structure', 'Fireworks', 'Powerline', 'Railroad', 'Smoking',
        'Children', 'Campfire', 'Equipment Use', 'Debris Burning',
    )
    if cat in accidental_causes:
        return 2  # accidental

    if cat in ('Arson',):
        return 3  # malicious

    return 4  # other / unknown
     

# data_orig was copied before label encoding, so it still carries the textual
# cause names that set_label expects; the grouped LABEL is derived from it.
cause_names = data_orig['STAT_CAUSE_DESCR']
data['LABEL'] = cause_names.apply(set_label)
data = data.drop(['STAT_CAUSE_DESCR'], axis=1)
print(data.head())

Let us try to predict the LABEL now.

In [None]:
X = data.drop(['LABEL'], axis=1).values
y = data['LABEL'].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0)
clf_rf = ske.RandomForestClassifier(n_estimators=50)
clf_rf = clf_rf.fit(X_train, y_train)
print(clf_rf.score(X_test,y_test))

Reducing the Classes did turn out to be a good decision

In [None]:
from sklearn.metrics import confusion_matrix
# Reuse the forest fitted in the previous cell rather than refitting it. A
# refit costs a full training run and, because the forest is unseeded, would
# produce a different model from the one whose accuracy was just printed.
y_pred = clf_rf.predict(X_test)
cm = confusion_matrix(y_true=y_test,y_pred=y_pred)
print(cm)

Accuracy and Confusion matrix simplified according to Label below

In [None]:
# Normalise each row so a cell is the fraction of that true class assigned to
# each predicted label (rows of `cm` are true labels, columns are predictions).
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
fig,ax = plt.subplots(figsize=(10,10))
ax.matshow(cmn,cmap=plt.cm.Oranges,alpha=0.7)
# Annotate with two decimals; the raw floats were printed at full precision,
# which clutters the cells.
for (i, j), fraction in np.ndenumerate(cmn):
    ax.text(x=j,y=i,s='%.2f' % fraction,va='center',ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

**Part 4**

Narrowing down to States

In [None]:
print(CA.head())

Create a new field: ARSON

In [None]:
def set_arson_label(cause):
    """Return 1 when ``cause`` equals 'Arson', otherwise 0."""
    return 1 if cause == 'Arson' else 0
     

# assign() returns a new DataFrame, so ARSON is not written into CA while it
# is still a filtered slice of `data` (which triggers SettingWithCopyWarning).
CA = CA.assign(ARSON=CA['STAT_CAUSE_DESCR'].apply(set_arson_label))
print(CA.head())

We can drop the DATE, STATE, FIRE_SIZE and STAT_CAUSE_DESCR fields and convert the DAY_OF_WEEK to numerical values.

In [None]:
le = preprocessing.LabelEncoder()
CA['DAY_OF_WEEK'] = le.fit_transform(CA['DAY_OF_WEEK'])

print(CA.head())

# From Here 

In [None]:
data1 = pd.read_sql_query("SELECT STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,FIRE_SIZE FROM 'Fires'", conn)
print(data1.head())
data1_orig = data1.copy()

In [None]:
def set_label(cat):
    """Translate a cause description into its group code.

    Codes: 1 = natural, 2 = accidental, 3 = malicious, 4 = anything else
    (which covers 'Missing/Undefined' and 'Miscellaneous').
    """
    groups = (
        (1, ['Lightning']),
        (2, ['Structure', 'Fireworks', 'Powerline', 'Railroad', 'Smoking',
             'Children', 'Campfire', 'Equipment Use', 'Debris Burning']),
        (3, ['Arson']),
    )
    # Groups are checked in the order natural, accidental, malicious.
    for code, members in groups:
        if cat in members:
            return code
    return 4
     

# data1_orig is the untouched copy taken right after data1 was loaded, so it
# still has the textual cause names that set_label expects.
data1['LABEL'] = data1_orig['STAT_CAUSE_DESCR'].apply(set_label)
data1 = data1.drop('STAT_CAUSE_DESCR',axis=1)
# Show the frame that was just modified (this previously printed the
# unrelated `data` frame).
print(data1.head())

In [None]:
le = preprocessing.LabelEncoder()
data1['STATE'] = le.fit_transform(data1['STATE'])

print(data1.head())

In [None]:


X = data1.drop(['LABEL'], axis=1).values
y = data1['LABEL'].values
Xx_train, Xx_test, yy_train, yy_test = train_test_split(X,y,test_size=0.3, random_state=0)

In [None]:
from sklearn.svm import SVC
svc = SVC(kernel = 'rbf', C= 10, gamma =0.1) 
svc.fit(Xx_train,yy_train)

In [None]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

In [None]:
#Train the model using the training sets
clf.fit(X_train, y_train)

In [None]:

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# LABEL has more than two classes. precision_score / recall_score default to
# average='binary', which raises ValueError on multiclass targets, so an
# explicit averaging mode is required. 'weighted' averages the per-class
# scores weighted by class support.
print("Precision:",metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='weighted'))

We can now test the ML:

In [None]:
# Build the feature matrix from information available when a fire is first
# discovered. STAT_CAUSE_DESCR must go because ARSON is derived from it (it
# would leak the answer); DATE and STATE are non-numeric columns the forest
# cannot consume; FIRE_SIZE is not known at discovery time.
# errors='ignore' keeps the cell re-runnable if a column was already removed.
ca_model = CA.drop(['STAT_CAUSE_DESCR', 'DATE', 'STATE', 'FIRE_SIZE'],
                   axis=1, errors='ignore').dropna()
X = ca_model.drop(['ARSON'], axis=1).values
y = ca_model['ARSON'].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0) #30% for testing, 70% for training
clf_rf = ske.RandomForestClassifier(n_estimators=200)
clf_rf = clf_rf.fit(X_train, y_train)
print(clf_rf.score(X_test,y_test))

Summary:
Given some basic data, the kind of data available when a fire is first discovered, it is possible to predict with some accuracy whether the fire was the result of arson.