In [1]:
import pandas as pd
import numpy as np
from naive_bayes import *
from observation_functions import get_observations_given_queries, get_probabilities_given_queries

In [2]:
# LOADING DATA

fires_df = pd.read_csv('data/forestfires.csv')
fires_df 

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [3]:
# RECORDING OBSERVATIONS:
# INITIAL HYPOTHESIS: Fires are caused by high DMC, FFMC, DC (measures of moisture)
observation_fires = fires_df[fires_df['area'] > 0]
observation_no_fires = fires_df[fires_df['area'] == 0]
fires_observed = len(observation_fires)
no_fires_observed = len(observation_no_fires)
prob_fire = (fires_observed/(fires_observed+no_fires_observed))

observation_fires.title = 'OBSERVATIONS WHERE FIRE OCCURS'
observation_no_fires.title = 'OBSERVATIONS WHERE NO FIRE OCCURS'

In [4]:
# TESTING THE FUNCTIONS:

index_range_ffmc = [0, 75, 80, 85, 90, 95]
index_range_dmc = [0, 50, 100, 150, 200]
indices = [index_range_ffmc, index_range_dmc]


get_observations_given_queries(observation_fires, ['FFMC'], [index_range_ffmc], print_output=True)
get_probabilities_given_queries(observation_fires, ['FFMC'], [index_range_ffmc], print_output=True)

get_observations_given_queries(observation_fires, ['FFMC', 'DMC'], [index_range_ffmc, index_range_dmc], print_output=True)
get_probabilities_given_queries(observation_fires, ['FFMC', 'DMC'], [index_range_ffmc, index_range_dmc], print_output=True)

Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ FFMC < 75]: 1
Counts in OBSERVATIONS WHERE FIRE OCCURS where 75 ≤ FFMC < 80]: 3
Counts in OBSERVATIONS WHERE FIRE OCCURS where 80 ≤ FFMC < 85]: 16
Counts in OBSERVATIONS WHERE FIRE OCCURS where 85 ≤ FFMC < 90]: 30
Counts in OBSERVATIONS WHERE FIRE OCCURS where 90 ≤ FFMC < 95]: 201
Counts in OBSERVATIONS WHERE FIRE OCCURS where 95 ≤ FFMC: 19


P(OBSERVATIONS WHERE FIRE OCCURS | 0 ≤ FFMC < 75]) = 0.0037
P(OBSERVATIONS WHERE FIRE OCCURS | 75 ≤ FFMC < 80]) = 0.01111
P(OBSERVATIONS WHERE FIRE OCCURS | 80 ≤ FFMC < 85]) = 0.05926
P(OBSERVATIONS WHERE FIRE OCCURS | 85 ≤ FFMC < 90]) = 0.11111
P(OBSERVATIONS WHERE FIRE OCCURS | 90 ≤ FFMC < 95]) = 0.74444
P(OBSERVATIONS WHERE FIRE OCCURS | 95 ≤ FFMC]) = 0.07037

Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ FFMC < 75]: 1
Counts in OBSERVATIONS WHERE FIRE OCCURS where 75 ≤ FFMC < 80]: 3
Counts in OBSERVATIONS WHERE FIRE OCCURS where 80 ≤ FFMC < 85]: 16
Counts in OBSERVATIONS WHERE FIRE OCCURS

{'FFMC': array([0.0037037 , 0.01111111, 0.05925926, 0.11111111, 0.74444444,
        0.07037037]),
 'DMC': array([0.16666667, 0.23703704, 0.38148148, 0.12592593, 0.08888889])}

In [5]:

#Each weather index can be divided into 5 levels of severity: Low, Moderate, High, 
#Severe and Extreme. The elements in the list for each weather index corresponds 
#set the upper and lower bound for the range of each level of severity. For example,
#In the FFMC index, 0-50 represents Low, 50-80 moderate, 80-91 High, 91 - 95 Severe
# and >95 is Extreme.
index_range_FFMC = [0,50,80,91,95]
index_range_DMC = [0,1,10,60,200]
index_range_DC = [0,20,50,425,750]
index_range_ISI = [0,1,5,15,50]
#Observe how many times a fire occurs and does not occur for a given range of an index.
fire_observations = get_observations_given_queries(observation_fires, ['FFMC','DMC','DC','ISI'], [index_range_FFMC, index_range_DMC, index_range_DC, index_range_ISI], print_output=True)
no_fire_observations = get_observations_given_queries(observation_no_fires, ['FFMC','DMC','DC','ISI'], [index_range_FFMC, index_range_DMC, index_range_DC, index_range_ISI], print_output=True)


Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ FFMC < 50]: 0
Counts in OBSERVATIONS WHERE FIRE OCCURS where 50 ≤ FFMC < 80]: 4
Counts in OBSERVATIONS WHERE FIRE OCCURS where 80 ≤ FFMC < 91]: 80
Counts in OBSERVATIONS WHERE FIRE OCCURS where 91 ≤ FFMC < 95]: 167
Counts in OBSERVATIONS WHERE FIRE OCCURS where 95 ≤ FFMC: 19

Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ DMC < 1]: 0
Counts in OBSERVATIONS WHERE FIRE OCCURS where 1 ≤ DMC < 10]: 11
Counts in OBSERVATIONS WHERE FIRE OCCURS where 10 ≤ DMC < 60]: 47
Counts in OBSERVATIONS WHERE FIRE OCCURS where 60 ≤ DMC < 200]: 188
Counts in OBSERVATIONS WHERE FIRE OCCURS where 200 ≤ DMC: 24

Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ DC < 20]: 5
Counts in OBSERVATIONS WHERE FIRE OCCURS where 20 ≤ DC < 50]: 8
Counts in OBSERVATIONS WHERE FIRE OCCURS where 50 ≤ DC < 425]: 45
Counts in OBSERVATIONS WHERE FIRE OCCURS where 425 ≤ DC < 750]: 171
Counts in OBSERVATIONS WHERE FIRE OCCURS where 750 ≤ DC: 41

Counts in OBSERVATIONS WHE

In [6]:
#Create a dataframe based off the recorded distributions
fire_observations_df = pd.DataFrame(fire_observations)
fire_observations_df.index = ['Low Risk(1)', 'Moderate Risk(2)', 'High Risk(3)', 'Severe Risk(4)', 'Extreme Risk(5)']
fire_observations_df['Row Sum'] = fire_observations_df.sum(axis=1)
fire_observations_df.loc['Column Sum'] = fire_observations_df.sum(axis=0)
print("Fire Observations")
print(fire_observations_df)

no_fire_observations_df = pd.DataFrame(no_fire_observations)
no_fire_observations_df.index = ['Low Risk (1)', 'Moderate Risk(2)', 'High Risk(3)', 'Severe Risk(4)', 'Extreme Risk(5)']
no_fire_observations_df['Row Sum'] = fire_observations_df.sum(axis=1)
no_fire_observations_df.loc['Column Sum'] = fire_observations_df.sum(axis=0)
print("No Fire Observations")
print (no_fire_observations_df)


Fire Observations
                  FFMC  DMC   DC  ISI  Row Sum
Low Risk(1)          0    0    5    1        6
Moderate Risk(2)     4   11    8   31       54
High Risk(3)        80   47   45  213      385
Severe Risk(4)     167  188  171   25      551
Extreme Risk(5)     19   24   41    0       84
Column Sum         270  270  270  270     1080
No Fire Observations
                  FFMC  DMC   DC  ISI  Row Sum
Low Risk (1)         1    0    3    6      NaN
Moderate Risk(2)     7   11   13   26    108.0
High Risk(3)        78   57   51  197    770.0
Severe Risk(4)     152  152  148   17   1102.0
Extreme Risk(5)      9   27   32    1    168.0
Column Sum         540  540  540  540   2160.0


In [7]:
boolean_fires_df = pd.read_csv('data/forestfires.csv')
#Adjust imported df such that area is set to be a boolean. Set to 1 if a fire
#occurs and 0 if one does not occur.
boolean_fires_df = boolean_fires_df.drop(['X','Y','wind', 'temp', 'rain','RH','month','day'], axis = 1)
boolean_fires_df['area'] = np.where(boolean_fires_df['area'] > 0, 1, 0)

#Adjust imported df such that each value in each weather indice is set between 1-5,
#corresponding the severity range they fall into. For example, if a row has an
#FFMC of 57, it would be replaced with a 2 as it falls in the moderate category.
bin_FFMC = [0,50,80,91,95,float('inf')]
bin_DMC = [0,1,10,60,200,float ('inf')]
bin_DC = [0,20,50,425,750, float('inf')]
bin_ISI = [0,1,5,15,50,float('inf')]
labels = [0, 1, 2, 3, 4]

boolean_fires_df['FFMC'] = np.digitize(boolean_fires_df['FFMC'], bins=bin_FFMC, right=True)
boolean_fires_df['DMC'] = np.digitize(boolean_fires_df['DMC'], bins=bin_DMC, right=True)
boolean_fires_df['DC'] = np.digitize(boolean_fires_df['DC'], bins=bin_DC, right=True)
boolean_fires_df['ISI'] = np.digitize(boolean_fires_df['ISI'], bins=bin_ISI, right=True)

#Calculate the conditional probability of each variable, such that P(X=x|area = 1)
#is found for all variables.
conditional_prob_FFMC = calculate_conditional_prob(boolean_fires_df,'FFMC','area',6,1,fires_observed)
conditional_prob_DMC = calculate_conditional_prob(boolean_fires_df,'DMC','area',6,1,fires_observed)
conditional_prob_DC = calculate_conditional_prob(boolean_fires_df,'DC','area',6,1,fires_observed)
conditional_prob_ISI = calculate_conditional_prob(boolean_fires_df,'ISI','area',6,1,fires_observed)
print(conditional_prob_FFMC)
print(conditional_prob_DMC)
print(conditional_prob_DC)
print(conditional_prob_ISI)

#Calculate the conditional probability of each variable, such that P(X=x|area = 0)
#is found for all variables.
false_conditional_prob_FFMC = calculate_conditional_prob(boolean_fires_df,'FFMC','area',6,0,no_fires_observed)
false_conditional_prob_DMC = calculate_conditional_prob(boolean_fires_df,'DMC','area',6,0,no_fires_observed)
false_conditional_prob_DC = calculate_conditional_prob(boolean_fires_df,'DC','area',6,0,no_fires_observed)
false_conditional_prob_ISI = calculate_conditional_prob(boolean_fires_df,'ISI','area',6,0,no_fires_observed)
print(false_conditional_prob_FFMC)
print(false_conditional_prob_DMC)
print(false_conditional_prob_DC)
print(false_conditional_prob_ISI)

{0: 0, 1: 0, 2: 0.014814814814814815, 3: 0.34814814814814815, 4: 0.5703703703703704, 5: 0.06666666666666667}
{0: 0, 1: 0, 2: 0.040740740740740744, 3: 0.17407407407407408, 4: 0.6962962962962963, 5: 0.08888888888888889}
{0: 0, 1: 0.018518518518518517, 2: 0.02962962962962963, 3: 0.16666666666666666, 4: 0.6333333333333333, 5: 0.15185185185185185}
{0: 0, 1: 0.003703703703703704, 2: 0.12222222222222222, 3: 0.7814814814814814, 4: 0.09259259259259259, 5: 0}
{0: 0, 1: 0, 2: 0.02834008097165992, 3: 0.3481781376518219, 4: 0.582995951417004, 5: 0.03643724696356275}
{0: 0, 1: 0, 2: 0.044534412955465584, 3: 0.23076923076923078, 4: 0.6153846153846154, 5: 0.10931174089068826}
{0: 0, 1: 0.012145748987854251, 2: 0.05263157894736842, 3: 0.20647773279352227, 4: 0.5991902834008097, 5: 0.12955465587044535}
{0: 0, 1: 0.020242914979757085, 2: 0.1214574898785425, 3: 0.7813765182186235, 4: 0.06882591093117409, 5: 0}


In [8]:
calculate_lookup_table('FFMC',conditional_prob_FFMC,'area',prob_fire)
calculate_lookup_table('DMC',conditional_prob_DMC,'area',prob_fire)
calculate_lookup_table('DC',conditional_prob_DC,'area',prob_fire)
calculate_lookup_table('ISI',conditional_prob_ISI,'area',prob_fire)

{'P(area|ISI = 0)': 0,
 'P(area|ISI = 1)': 0.003703703703703704,
 'P(area|ISI = 2)': 0.12222222222222222,
 'P(area|ISI = 3)': 0.7814814814814814,
 'P(area|ISI = 4)': 0.09259259259259259,
 'P(area|ISI = 5)': 0,
 'P(area|ISI = 1, ISI = 2, ISI = 3,ISI = 4, ISI = 5)': 1.710624029160059e-05}

In [None]:
#The calculate_naive_bayes function takes in a 4 value list. Each value of the 
#list corresponds to a severity, with the first index corresponding to the FFMC,
#second to the DMC, third to the DC, fourth to the ISI. The function then estimates
#the parameters of a Bayes Net for the given evidence.

#Calculate and show a requested entry in a CPT table for the defined variable
naive_bayes = calculate_naive_bayes([4,4,4,3],conditional_prob_FFMC,conditional_prob_DMC,conditional_prob_DC,conditional_prob_ISI,prob_fire)
print(naive_bayes)

0.1026538637402835
