In [3]:
import pandas as pd
import numpy as np
import json

from geopy.distance import geodesic

import matplotlib.pyplot as plt
from io import StringIO
from scipy.stats import chi2_contingency

In [5]:
# Read the incident / nearest-hydrant table from disk (change the path to
# match where the CSV lives on your machine).
df = pd.read_csv(filepath_or_buffer="incident_nearest_fire_hydrant.csv")

# List the available columns so the names used further down can be checked.
print(df.columns)

# Define a function to parse the JSON string from the "Incident" and "fire_hydrant" columns.
def parse_coords(json_str):
    """Convert a GeoJSON-style point string into a ``(latitude, longitude)`` tuple.

    Parameters
    ----------
    json_str : str
        JSON text of an object with a ``"coordinates"`` entry whose first two
        items are longitude and latitude, in that order. Any further items
        (for example an altitude) are ignored.

    Returns
    -------
    tuple
        ``(lat, lon)`` -- the order geopy expects.

    Raises
    ------
    json.JSONDecodeError
        If ``json_str`` is not valid JSON.
    KeyError
        If the decoded object has no ``"coordinates"`` key.
    ValueError
        If fewer than two coordinate values are present.
    """
    # Parse the JSON string to a Python dictionary.
    d = json.loads(json_str)
    # The coordinates are stored as [longitude, latitude]. Slice to the first
    # two values so a position carrying an extra element (e.g. altitude)
    # no longer fails to unpack.
    lon, lat = d["coordinates"][:2]
    # Return a tuple in the order (latitude, longitude) as required by geopy.
    return (lat, lon)

# Derive a (lat, lon) tuple column from each raw JSON column.
for raw_col, parsed_col in (
    ("Incident", "incident_coord"),
    ("fire_hydrant", "hydrant_coord"),
):
    df[parsed_col] = df[raw_col].apply(parse_coords)

# Compute the distance (in miles) between the incident and the nearest hydrant.
def compute_distance(row):
    """Return the geodesic distance, in miles, between a row's two points.

    Reads ``row["incident_coord"]`` and ``row["hydrant_coord"]`` and passes
    them to geopy's ``geodesic``; the result's ``.miles`` value is returned.
    """
    incident_point = row["incident_coord"]
    hydrant_point = row["hydrant_coord"]
    separation = geodesic(incident_point, hydrant_point)
    return separation.miles

# Cut-offs used to turn the two numeric measures into binary categories.
# Both are example values; tune them against the data distribution.
threshold_units = 5        # NOMBRE_UNITES at or above this counts as "high" severity
threshold_distance = 0.02  # miles; strictly farther than this counts as "high" distance

# Row-wise geodesic distance between each incident and its hydrant.
df["computed_distance_miles"] = df.apply(compute_distance, axis=1)

# (a) Severity: "high" when the number of units reaches the threshold.
# NOTE(review): a missing NOMBRE_UNITES compares False and would be labelled
# "low" -- confirm the column has no NaNs.
is_high_severity = df["NOMBRE_UNITES"] >= threshold_units
df["severity_cat"] = np.where(is_high_severity, "high", "low")

# (b) Hydrant accessibility: "high" here means a HIGH distance, i.e. the
# nearest hydrant is farther away, read as lower local hydrant coverage
# (less accessible).
is_far_from_hydrant = df["computed_distance_miles"] > threshold_distance
df["hydrant_cat"] = np.where(is_far_from_hydrant, "high", "low")

# Cross-tabulate severity (rows) against hydrant-distance category (columns).
ct = pd.crosstab(index=df["severity_cat"], columns=df["hydrant_cat"])
print("Contingency Table:", ct, sep="\n")

# Chi-square test of independence on the table.
# Note: with its default settings chi2_contingency applies Yates' continuity
# correction when the table has one degree of freedom (a 2x2 table).
independence_test = chi2_contingency(ct)
chi2, p, dof, expected = independence_test
print("\nChi-square Statistic:", chi2)
print("p-value:", p)

# Encode both categories as 0/1 indicators (1 = "high", 0 = "low") so a
# numeric association measure can be computed.
severity_is_high = df["severity_cat"] == "high"
hydrant_is_high = df["hydrant_cat"] == "high"
df["severity_bin"] = np.where(severity_is_high, 1, 0)
df["hydrant_bin"] = np.where(hydrant_is_high, 1, 0)

# The correlation of two binary indicators is the phi coefficient.
severity_flags = df["severity_bin"]
hydrant_flags = df["hydrant_bin"]
phi_corr = severity_flags.corr(hydrant_flags)
print("\nPhi Correlation:", phi_corr)

Index(['INCIDENT_NBR', 'INCIDENT_TYPE_DESC', 'DESCRIPTION_GROUPE', 'CASERNE',
       'DIVISION', 'NOMBRE_UNITES', 'Incident', 'ID_BI', 'ID_POINT',
       'fire_hydrant', 'FindNearestRank', 'DistanceMiles', 'Direction'],
      dtype='object')
Contingency Table:
hydrant_cat    high    low
severity_cat              
high          11361  16201
low           27348  38891

Chi-square Statistic: 0.03339857695114255
p-value: 0.8549920041552147

Phi Correlation: -0.0006204750061672008
