In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from io import StringIO
from scipy.stats import chi2_contingency

In [6]:
# Load the data set from the CSV file on disk.
df = pd.read_csv('nb_count_fire_hydrant_by_caserne.csv')

# Parse the creation timestamp column into pandas datetimes.
df["CREATION_DATE"] = pd.to_datetime(df["CREATION_DATE"])

# Define thresholds (these should be based on domain knowledge or data distribution)
threshold_units = 4         # NOMBRE_UNITES strictly above this is labelled "high" severity
threshold_hydrant = 400     # nb_fire_hydrant strictly above this is labelled "high"

# A comparison against NaN is always False, so a plain np.where(...) would
# silently label rows with a missing measurement as "low". Track which rows
# actually carry each measurement so missing inputs stay missing.
has_units = df["NOMBRE_UNITES"].notna()
has_hydrants = df["nb_fire_hydrant"].notna()
complete_rows = has_units & has_hydrants

n_incomplete = int((~complete_rows).sum())
if n_incomplete:
    print(f"Note: {n_incomplete} row(s) with a missing NOMBRE_UNITES or "
          "nb_fire_hydrant value are excluded from the statistics below.")

# Create binary categorical variables ("high"/"low"; NaN where the input is missing)
df["severity"] = pd.Series(
    np.where(df["NOMBRE_UNITES"] > threshold_units, "high", "low"),
    index=df.index,
).where(has_units)
df["hydrant_level"] = pd.Series(
    np.where(df["nb_fire_hydrant"] > threshold_hydrant, "high", "low"),
    index=df.index,
).where(has_hydrants)

# Build the contingency table from rows where both labels are defined
ct = pd.crosstab(
    df.loc[complete_rows, "severity"],
    df.loc[complete_rows, "hydrant_level"],
)
print("Contingency Table:")
print(ct)

# Perform Chi-square test for independence.
# Note: for a 2x2 table (dof == 1) chi2_contingency applies Yates' continuity
# correction by default, so the statistic is slightly smaller than n * phi**2.
chi2, p, dof, expected = chi2_contingency(ct)
print("\nChi-square Statistic:", chi2)
print("p-value:", p)

# For binary variables, the phi coefficient is the Pearson correlation of the
# two 0/1 indicators (high=1, low=0). Missing labels map to NaN, and
# Series.corr ignores any pair that contains a NaN.
label_to_bin = {"high": 1, "low": 0}
df["severity_bin"] = df["severity"].map(label_to_bin)
df["hydrant_bin"] = df["hydrant_level"].map(label_to_bin)
phi_corr = df["severity_bin"].corr(df["hydrant_bin"])
print("\nPhi Correlation:", phi_corr)

Contingency Table:
hydrant_level   high    low
severity                   
high           14786  12757
low            38314  27871

Chi-square Statistic: 139.9299869961605
p-value: 2.757553807548808e-32

Phi Correlation: -0.03866217290267845
