In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the forest-fire observations from a CSV path relative to the notebook's
# working directory.
fires_df = pd.read_csv(filepath_or_buffer='data/forestfires.csv')

# Bare trailing expression so the notebook renders the frame.
fires_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [9]:
# RECORDING OBSERVATIONS:
# INITIAL HYPOTHESIS: Fires are caused by high DMC, FFMC, DC (measures of moisture)

# Split the rows on the `area` column: exactly zero vs. strictly positive.
# NOTE(review): this treats area == 0 as "no fire occurred" -- confirm against the
# dataset description that a zero area cannot also mean a very small burned area.
observation_no_fires = fires_df.loc[fires_df['area'].eq(0)]
observation_fires = fires_df.loc[fires_df['area'].gt(0)]

# Labels that the reporting helpers in this notebook read back via `data.title`.
observation_no_fires.title = 'OBSERVATIONS WHERE NO FIRE OCCURS'
observation_fires.title = 'OBSERVATIONS WHERE FIRE OCCURS'

def get_raw_observations_given_queries(
        data: pd.DataFrame,
        query_list: list[str],
        index_list: list[list[int]],
        print_output: bool = False
    ) -> dict[str, np.ndarray]:
    """
    Count the rows of `data` that fall into each bin of each queried column.

    For every column name in `query_list`, the entry of `index_list` at the same
    position supplies the lower edges of the bins (expected to be ascending; this
    is not checked). Edge i together with edge i+1 defines the half-open bin
    [edge_i, edge_{i+1}); the last edge defines the unbounded bin [edge_last, +inf).
    Values below the first edge are not counted in any bin.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the columns named in `query_list`. If it carries a `title`
        attribute, that text labels the printed lines; otherwise 'data' is used.
    query_list : list[str]
        Names of the columns to bin.
    index_list : list[list[int]]
        One list of lower bin edges per entry of `query_list`, in the same order.
    print_output : bool, default False
        When True, print one line per bin and a blank line after each column.
        When False, nothing is printed.

    Returns
    -------
    dict[str, np.ndarray]
        Maps each column name to an array with one count per edge.

    Raises
    ------
    AssertionError
        If `query_list` and `index_list` differ in length.
    """
    assert(len(query_list) == len(index_list)), "The length of the query list and index list must be equal"

    # Fall back to a generic label so frames without the ad-hoc `title`
    # attribute can still be reported on.
    label = getattr(data, 'title', 'data')

    observation_counts = {query: [] for query in query_list}

    for edges, query in zip(index_list, query_list):
        column = data[query]
        last_position = len(edges) - 1

        for position, lower_bound in enumerate(edges):
            in_bin = column >= lower_bound
            if position != last_position:
                upper_bound = edges[position + 1]
                in_bin = in_bin & (column < upper_bound)
                description = f'{lower_bound} ≤ {query} < {upper_bound}'
            else:
                # The final edge has no upper neighbour: open-ended bin.
                description = f'{lower_bound} ≤ {query}'

            # Summing the boolean mask counts matches without building a filtered frame.
            count = int(in_bin.sum())
            if print_output:
                print(f'Counts in {label} where {description}: {count}')

            observation_counts[query].append(count)

        # Blank separator between columns, only when output was requested.
        if print_output:
            print('')

    # convert counts to np array to work with data easier:
    for query in observation_counts.keys():
        observation_counts[query] = np.array(observation_counts[query])

    return observation_counts

def get_probabilities_given_query(
        data: pd.DataFrame,
        query_list: list[str],
        index_list: list[list[int]],
        print_output: bool = False
    ) -> dict[str, np.ndarray]:
    """
    Compute, for each queried column, the fraction of rows of `data` in each bin.

    Bin counts come from `get_raw_observations_given_queries` and are divided by
    the number of rows in `data`. Each value is therefore the proportion of the
    rows of `data` whose column value lies in that bin, i.e. P(bin | data), and
    not the probability of `data` given the bin.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the columns named in `query_list`. If it carries a `title`
        attribute, that text labels the printed lines; otherwise 'data' is used.
    query_list : list[str]
        Names of the columns to bin.
    index_list : list[list[int]]
        One list of lower bin edges per entry of `query_list`, in the same order.
        Consecutive edges form half-open bins; the last edge opens an unbounded bin.
    print_output : bool, default False
        When True, print one line per bin and a blank line after each column.

    Returns
    -------
    dict[str, np.ndarray]
        Maps each column name to an array with one proportion per edge. When
        `data` has no rows every proportion is NaN.

    Raises
    ------
    AssertionError
        If `query_list` and `index_list` differ in length.
    """
    assert(len(query_list) == len(index_list)), "The length of the query list and index list must be equal"

    total_count_data = data.shape[0]
    raw_counts = get_raw_observations_given_queries(data, query_list, index_list, print_output=False)

    if total_count_data == 0:
        # No rows: the proportions are undefined. Return NaN explicitly rather
        # than dividing by zero and triggering a RuntimeWarning.
        conditional_probabilities = {
            query: np.full(len(counts), np.nan) for query, counts in raw_counts.items()
        }
    else:
        conditional_probabilities = {
            query: counts / total_count_data for query, counts in raw_counts.items()
        }

    if print_output:
        # Fall back to a generic label for frames without the ad-hoc `title` attribute.
        label = getattr(data, 'title', 'data')

        for edges, query in zip(index_list, query_list):
            probabilities = conditional_probabilities[query]
            last_position = len(edges) - 1

            for position, lower_bound in enumerate(edges):
                if position != last_position:
                    event = f'{lower_bound} ≤ {query} < {edges[position + 1]}'
                else:
                    event = f'{lower_bound} ≤ {query}'
                # The bin is the event and the data subset is the condition.
                print(f'P({event} | {label}) = {np.round(probabilities[position], 5)}')

            # For readability:
            print('')

    return conditional_probabilities

In [11]:
# TESTING THE FUNCTIONS:

# Lower bin edges per variable: consecutive edges form half-open bins and the
# last edge opens an unbounded bin.
index_range_ffmc = [0, 75, 80, 85, 90, 95]
index_range_dmc = [0, 50, 100, 150, 200]
indices = [index_range_ffmc, index_range_dmc]

# One query variable: raw counts first, then proportions.
get_raw_observations_given_queries(
    data=observation_fires,
    query_list=['FFMC'],
    index_list=[index_range_ffmc],
    print_output=True,
)
get_probabilities_given_query(
    data=observation_fires,
    query_list=['FFMC'],
    index_list=[index_range_ffmc],
    print_output=True,
)

# Two query variables, reusing the combined edge list built above.
get_raw_observations_given_queries(
    data=observation_fires,
    query_list=['FFMC', 'DMC'],
    index_list=indices,
    print_output=True,
)
get_probabilities_given_query(
    data=observation_fires,
    query_list=['FFMC', 'DMC'],
    index_list=indices,
    print_output=True,
)

Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ FFMC < 75]: 1
Counts in OBSERVATIONS WHERE FIRE OCCURS where 75 ≤ FFMC < 80]: 3
Counts in OBSERVATIONS WHERE FIRE OCCURS where 80 ≤ FFMC < 85]: 16
Counts in OBSERVATIONS WHERE FIRE OCCURS where 85 ≤ FFMC < 90]: 30
Counts in OBSERVATIONS WHERE FIRE OCCURS where 90 ≤ FFMC < 95]: 201
Counts in OBSERVATIONS WHERE FIRE OCCURS where 95 ≤ FFMC: 19


P(OBSERVATIONS WHERE FIRE OCCURS | 0 ≤ FFMC < 75]) = 0.0037
P(OBSERVATIONS WHERE FIRE OCCURS | 75 ≤ FFMC < 80]) = 0.01111
P(OBSERVATIONS WHERE FIRE OCCURS | 80 ≤ FFMC < 85]) = 0.05926
P(OBSERVATIONS WHERE FIRE OCCURS | 85 ≤ FFMC < 90]) = 0.11111
P(OBSERVATIONS WHERE FIRE OCCURS | 90 ≤ FFMC < 95]) = 0.74444
P(OBSERVATIONS WHERE FIRE OCCURS | 95 ≤ FFMC]) = 0.07037

Counts in OBSERVATIONS WHERE FIRE OCCURS where 0 ≤ FFMC < 75]: 1
Counts in OBSERVATIONS WHERE FIRE OCCURS where 75 ≤ FFMC < 80]: 3
Counts in OBSERVATIONS WHERE FIRE OCCURS where 80 ≤ FFMC < 85]: 16
Counts in OBSERVATIONS WHERE FIRE OCCURS

{'FFMC': array([  1,   3,  16,  30, 201,  19]),
 'DMC': array([ 45,  64, 103,  34,  24])}