In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('DADOS_DISTPOL_2000_2020_MSP_CEM_V1.csv')

In [3]:
df.head()

Unnamed: 0,DP_COD,DP_TIP,DELEGACIA,FUR_OU01,FUR_VE01,H_C_OU01,H_C_AC01,HO_DOL01,H_D_AC01,LATR01,...,PO_ARM20,TR_ENT20,ROU_OU20,ROU_BA20,ROU_CA20,ROU_VE20,TEN_HO20,INQ_PO20,EST_T20,ROU_TO20
0,1,1ª DP,001_dp_se,5241,302,0,7,59,0,2,...,8,154,3201,0,53,20,9,1129.0,23,3254
1,2,2ª DP,002_dp_bom_retiro,1278,180,0,15,13,0,0,...,3,17,970,0,7,16,1,1173.0,10,977
2,3,3ª DP,003_dp_campos_elisios,6167,408,0,5,23,0,0,...,17,187,4403,1,10,18,10,886.0,15,4414
3,4,4ª DP,004_dp_consolacao,3298,459,0,3,13,0,1,...,2,31,1649,0,4,9,5,423.0,7,1653
4,5,5ª DP,005_dp_aclimacao,2981,449,0,10,16,0,0,...,3,66,870,0,3,16,2,493.0,8,873


In [4]:
import geopandas as gpd
import pandas as pd

# Ler shapefile
gdf_dp = gpd.read_file("DISTPOL2021_MSP_CEM_V1.shp")

# Ler CSV com dados de crimes
df_crime = pd.read_csv("DADOS_DISTPOL_2000_2020_MSP_CEM_V1.csv", sep=";")

print("=== COLUNAS DO SHAPEFILE ===")
print(list(gdf_dp.columns))
print("\n=== COLUNAS DO CSV DE CRIMES ===")
print(list(df_crime.columns))


=== COLUNAS DO SHAPEFILE ===
['ID', 'AREA', 'NUM_DP', 'NM_DP', 'NM_DP_C', 'REF_DP', 'PO_ENT18', 'TR_ENT18', 'AP_ENT18', 'PO_ARM18', 'ARM_FO18', 'FLAG18', 'IN_FLA18', 'IN_MAN18', 'PE_FLA18', 'PE_MAN18', 'PR_EFE18', 'VEI_RE18', 'INQ_PO18', 'HO_DOL18', 'V_H_DO18', 'H_D_AC18', 'V_H_AC18', 'H_C_AC18', 'H_C_OU18', 'TEN_HO18', 'LE_MOR18', 'L_C_DO18', 'L_C_AC18', 'L_C_OU18', 'LATR18', 'VI_LAT18', 'EST_T18', 'EST18', 'EST_VU18', 'ROU_TO18', 'ROU_OU18', 'ROU_VE18', 'ROU_BA18', 'ROU_CA18', 'FUR_OU18', 'FUR_VE18', 'geometry']

=== COLUNAS DO CSV DE CRIMES ===
['DP_COD,DP_TIP,DELEGACIA,FUR_OU01,FUR_VE01,H_C_OU01,H_C_AC01,HO_DOL01,H_D_AC01,LATR01,L_C_OU01,L_C_AC01,L_C_DO01,LE_MOR01,ARM_FO01,FLAG01,IN_FLA01,IN_MAN01,PE_FLA01,PE_MAN01,PR_EFE01,V_H_DO01,V_H_AC01,VI_LAT01,VEI_RE01,AP_ENT01,PO_ENT01,PO_ARM01,TR_ENT01,ROU_BA01,ROU_CA01,ROU_VE01,TEN_HO01,INQ_PO01,EST_T01,ROU_TO01,FUR_OU02,FUR_VE02,H_C_OU02,H_C_AC02,HO_DOL02,H_D_AC02,LATR02,L_C_OU02,L_C_AC02,L_C_DO02,LE_MOR02,ARM_FO02,FLAG02,IN_FLA02,IN_MAN

In [5]:
import geopandas as gpd
import pandas as pd

# ========= Files in the SAME directory =========
SHP_FILE = "DISTPOL2021_MSP_CEM_V1.shp"
CSV_FILE = "DADOS_DISTPOL_2000_2020_MSP_CEM_V1.csv"

# ========= 1) Read shapefile (preserving the geometry column name) =========
gdf_districts = gpd.read_file(SHP_FILE)

# Preserve the geometry column name BEFORE renaming others
geom_col = gdf_districts.geometry.name  # usually "geometry"

# Rename only non-geometry columns to uppercase / without spaces
new_cols = {}
for c in gdf_districts.columns:
    if c != geom_col:
        new_cols[c] = str(c).strip().upper()
gdf_districts = gdf_districts.rename(columns=new_cols)

# Rename identifiers
gdf_districts = gdf_districts.rename(columns={"NUM_DP": "district_id", "NM_DP": "district_name"})

# Ensure it stays a GeoDataFrame with the original geometry column
gdf_districts = gpd.GeoDataFrame(gdf_districts, geometry=geom_col)

# Set types and CRS
gdf_districts["district_id"] = pd.to_numeric(gdf_districts["district_id"], errors="coerce").astype("Int64")
if gdf_districts.crs is None:
    # if the SHP has no CRS, set to WGS84
    gdf_districts = gdf_districts.set_crs(4326)
else:
    # reproject to WGS84 if necessary
    try:
        gdf_districts = gdf_districts.to_crs(4326)
    except Exception:
        pass  # if already 4326, skip

# ========= 2) Read CSV (detect separator and remove BOM) =========
df_crime = pd.read_csv(CSV_FILE, sep=None, engine="python")
df_crime.columns = (
    df_crime.columns
    .str.replace(r"^\ufeff", "", regex=True)  # remove invisible BOM
    .str.strip()
    .str.upper()
)

# Validate ID column
if "DP_COD" not in df_crime.columns:
    raise ValueError(f"'DP_COD' column not found in CSV. First columns: {list(df_crime.columns)[:12]}")

# ========= 3) Convert from wide to long format =========
# Measurement columns: contain digits in the name (e.g., LATR01 ... LATR20)
crime_cols = [c for c in df_crime.columns if any(ch.isdigit() for ch in c)]
id_cols = [c for c in ["DP_COD", "DELEGACIA"] if c in df_crime.columns]
if not id_cols:
    id_cols = ["DP_COD"]

df_long = df_crime.melt(
    id_vars=id_cols,
    value_vars=crime_cols,
    var_name="crime_year_code",
    value_name="occurrences"
)

# Extract year suffix (two digits) -> 2000+
suffix = df_long["crime_year_code"].str.extract(r"(\d{2})")[0]
df_long = df_long[suffix.notna()].copy()
df_long["year"] = suffix.dropna().astype(int) + 2000

# Convert occurrences to numeric
df_long["occurrences"] = (
    df_long["occurrences"]
    .astype(str)
    .str.replace(r"[^\d\-\.\,]", "", regex=True)  # remove non-numeric noise
    .str.replace(",", ".", regex=False)           # comma decimal -> dot
)
df_long["occurrences"] = pd.to_numeric(df_long["occurrences"], errors="coerce").fillna(0)

# ========= 4) Aggregate total crimes per district/year =========
df_total = (
    df_long.groupby(["DP_COD", "year"], as_index=False)["occurrences"]
    .sum()
    .rename(columns={"DP_COD": "district_id", "occurrences": "crime_total"})
)
df_total["district_id"] = pd.to_numeric(df_total["district_id"], errors="coerce").astype("Int64")

# ========= 5) Merge with shapefile =========
gdf_districts["district_id"] = gdf_districts["district_id"].astype("Int64")
gdf_final = gdf_districts.merge(df_total, on="district_id", how="left")

# ========= 6) Final selection and sorting =========
gdf_final = gdf_final[["district_id", "district_name", "year", "crime_total", geom_col]]
# rename geometry column if it’s not already "geometry"
if geom_col != "geometry":
    gdf_final = gdf_final.rename(columns={geom_col: "geometry"})
    gdf_final = gpd.GeoDataFrame(gdf_final, geometry="geometry", crs=4326)

df = gdf_final.sort_values(["district_id", "year"], na_position="last").reset_index(drop=True)

# ========= 7) Checks =========
print("gdf_final — columns:", list(df.columns))
print("CRS:", df.crs)
print("Shape:", df.shape)
print(df.head(10))


gdf_final — columns: ['district_id', 'district_name', 'year', 'crime_total', 'geometry']
CRS: EPSG:4326
Shape: (1860, 5)
   district_id district_name  year  crime_total  \
0            1            Sé  2001      13930.0   
1            1            Sé  2002      16001.0   
2            1            Sé  2003      14997.0   
3            1            Sé  2004      18496.0   
4            1            Sé  2005      19525.0   
5            1            Sé  2006      18407.0   
6            1            Sé  2007      16945.0   
7            1            Sé  2008      15764.0   
8            1            Sé  2009      15290.0   
9            1            Sé  2010      15325.0   

                                            geometry  
0  POLYGON ((-46.62627 -23.54824, -46.62614 -23.5...  
1  POLYGON ((-46.62627 -23.54824, -46.62614 -23.5...  
2  POLYGON ((-46.62627 -23.54824, -46.62614 -23.5...  
3  POLYGON ((-46.62627 -23.54824, -46.62614 -23.5...  
4  POLYGON ((-46.62627 -23.54824, -46.6261

In [6]:
df.head()

Unnamed: 0,district_id,district_name,year,crime_total,geometry
0,1,Sé,2001,13930.0,"POLYGON ((-46.62627 -23.54824, -46.62614 -23.5..."
1,1,Sé,2002,16001.0,"POLYGON ((-46.62627 -23.54824, -46.62614 -23.5..."
2,1,Sé,2003,14997.0,"POLYGON ((-46.62627 -23.54824, -46.62614 -23.5..."
3,1,Sé,2004,18496.0,"POLYGON ((-46.62627 -23.54824, -46.62614 -23.5..."
4,1,Sé,2005,19525.0,"POLYGON ((-46.62627 -23.54824, -46.62614 -23.5..."


In [7]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(''), '..', 'src'))

import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from datetime import datetime, timedelta

from pyehsa.emerging_hotspot_analysis import EmergingHotspotAnalysis
from pyehsa.ehsa_plotting import EhsaPlotting

np.random.seed(42)
print("Libraries imported successfully")


Libraries imported successfully


In [8]:
# Run EHSA analysis
results = EmergingHotspotAnalysis.emerging_hotspot_analysis(
    df,
    region_id_field='district_name',
    time_period_field='year', 
    value='crime_total',
    k=1,
    nsim=99
)


2025-10-24 02:04:35 - INFO - 🚀 Starting Emerging Hotspot Analysis
2025-10-24 02:04:35 - INFO - 📊 Input DataFrame shape: (1860, 5)
2025-10-24 02:04:35 - INFO - 🎯 Analysis parameters:
2025-10-24 02:04:35 - INFO -    - Region ID field: district_name
2025-10-24 02:04:35 - INFO -    - Time period field: year
2025-10-24 02:04:35 - INFO -    - Value field: crime_total
2025-10-24 02:04:35 - INFO -    - Random seed: 77
2025-10-24 02:04:35 - INFO -    - Time lags (k): 1
2025-10-24 02:04:35 - INFO -    - Simulations (nsim): 99
2025-10-24 02:04:35 - INFO - 📈 Data overview:
2025-10-24 02:04:35 - INFO -    - Total rows: 1,860
2025-10-24 02:04:35 - INFO -    - Unique regions: 93
2025-10-24 02:04:35 - INFO -    - Unique time periods: 20
2025-10-24 02:04:35 - INFO -    - Value range: [1768.000, 21566.000]
2025-10-24 02:04:35 - INFO -    - Missing values in crime_total: 0
2025-10-24 02:04:35 - INFO - 🔧 Step 1: Validating and cleaning input data...
2025-10-24 02:04:35 - INFO - ✅ Data validation completed

Geometries are already Shapely objects, creating GeoDataFrame...
Setting CRS...
Creating complete spacetime cube:
  - 93 locations
  - 20 time periods
  - 1860 total combinations
  - Original data: 1860 rows
  - Complete cube: 1860 rows
  - Missing combinations filled with NAs: 0
Spacetime cube dimensions:
  - 93 locations
  - 20 time periods
  - 1860 total observations


2025-10-24 02:04:48 - INFO - ✅ Gi* statistics calculated in 13.47s using spacetime method
2025-10-24 02:04:48 - INFO -    - Output shape: (1860, 8)
2025-10-24 02:04:48 - INFO -    - Used 99 simulations for p-values
2025-10-24 02:04:48 - INFO - 🎯 Step 5: Performing emerging hotspot classification...
2025-10-24 02:04:48 - INFO - ✅ EHSA classification completed in 0.07s
2025-10-24 02:04:48 - INFO -    - Results shape: (93, 8)
2025-10-24 02:04:48 - INFO - 📋 ANALYSIS RESULTS SUMMARY
2025-10-24 02:04:48 - INFO - ⏱️  Total execution time: 0:00:13.625636
2025-10-24 02:04:48 - INFO - 🏷️  Classification distribution:
2025-10-24 02:04:48 - INFO -    - no pattern detected: 39 regions (41.9%)
2025-10-24 02:04:48 - INFO -    - sporadic coldspot: 25 regions (26.9%)
2025-10-24 02:04:48 - INFO -    - sporadic hotspot: 20 regions (21.5%)
2025-10-24 02:04:48 - INFO -    - consecutive hotspot: 3 regions (3.2%)
2025-10-24 02:04:48 - INFO -    - persistent hotspot: 2 regions (2.2%)
2025-10-24 02:04:48 - INF

Gi* range: [-4.639, 8.236]
Significant observations (p<=0.01): 391/1860
Significant observations (p<=0.05): 710/1860
DEBUG - Sample of borderline cases:
  Obs 0: Gi*=-0.460, p=0.1500, sig=False
  Obs 10: Gi*=-0.212, p=0.2500, sig=False
  Obs 20: Gi*=0.902, p=0.4000, sig=False
  Obs 30: Gi*=1.443, p=0.3000, sig=False
  Obs 40: Gi*=-0.315, p=0.2800, sig=False
DEBUG - Neighbor permutation test:
  Original neighbors: [[87, 32, 62, 83, 71, 25, 5, 0], [24, 12, 43, 86, 66, 73, 1], [76, 55, 10, 29, 2]]
  Permuted neighbors: [[5, 2, 4, 3, 7, 9, 6, 1], [0, 5, 7, 8, 3, 9, 4], [6, 8, 5, 7, 0]]
  Neighbor structure changed: True
Total regions to process: 93
   No pattern #1: A. E. Carvalho, Gi* range [-1.28, 0.43], 0/10 significant
✓ Aclimação: sporadic hotspot
✓ Alto da Mooca: sporadic coldspot
   No pattern #2: Americanópolis, Gi* range [-1.84, 0.07], 0/10 significant
✓ Aricanduva: sporadic coldspot
✓ Artur Alvim: sporadic coldspot
   No pattern #3: Belém, Gi* range [-1.93, 0.39], 0/10 significan

In [9]:
results.head()

Unnamed: 0,district_name,classification,classification_details,mann_kendall_details,spatial_context_summary,location_data,tau,p_value
0,A. E. Carvalho,no pattern detected,{'reason': 'Conditions for specific hotspot/co...,"{'inputs': {'time_periods_count': 20, 'gi_star...","{'neighbors_config': ['Vila Matilde', 'Itaquer...","[{'time_period': '2001', 'value': -0.460237315...",-0.126316,0.455535
1,Aclimação,sporadic hotspot,{'criteria_met': ['Proportion of significant h...,"{'inputs': {'time_periods_count': 20, 'gi_star...","{'neighbors_config': ['Consolação', 'Cambuci',...","[{'time_period': '2001', 'value': 3.5066694538...",-0.326316,0.047805
2,Alto da Mooca,sporadic coldspot,{'criteria_met': ['Proportion of significant c...,"{'inputs': {'time_periods_count': 20, 'gi_star...","{'neighbors_config': ['Vila Alpina', 'Parque d...","[{'time_period': '2001', 'value': -1.411436234...",0.042105,0.820338
3,Americanópolis,no pattern detected,{'reason': 'Conditions for specific hotspot/co...,"{'inputs': {'time_periods_count': 20, 'gi_star...","{'neighbors_config': ['Jardim Míriam', 'Parque...","[{'time_period': '2001', 'value': -0.478051785...",-0.042105,0.820338
4,Aricanduva,sporadic coldspot,{'criteria_met': ['Proportion of significant c...,"{'inputs': {'time_periods_count': 20, 'gi_star...","{'neighbors_config': ['Vila Matilde', 'Teotôni...","[{'time_period': '2001', 'value': -0.401640142...",0.305263,0.064411


In [10]:
results['classification'].value_counts()

classification
no pattern detected      39
sporadic coldspot        25
sporadic hotspot         20
consecutive hotspot       3
persistent hotspot        2
oscilating hotspot        2
persistent coldspot       1
intensifying coldspot     1
Name: count, dtype: int64

In [11]:
# Merge geometry with results  
locations = df[['district_name', 'geometry']].drop_duplicates()
ehsa_df = results.merge(locations, left_on=results.columns[0], right_on='district_name')

# Create EHSA map
ehsa_map = EhsaPlotting.plot_ehsa_map_interactive(
    df=ehsa_df,
    region_id_field="district_name",
    title="São Paulo Emerging Hotspots - Crimes - 2000-2020",
    lib="folium"
)
ehsa_map



  map_center_coords = [gdf.geometry.centroid.y.mean(), gdf.geometry.centroid.x.mean()]


In [12]:
EhsaPlotting.open_visualization_tool()

Opening EHSA Visualization Tool in your default browser...
File location: /var/folders/vw/lchjhtcn06jgcb946y77vpb80000gn/T/ehsa_viz_m590fdzb/ehsa_visualization.html


'/var/folders/vw/lchjhtcn06jgcb946y77vpb80000gn/T/ehsa_viz_m590fdzb/ehsa_visualization.html'

In [13]:
ehsa_df.to_csv('ehsa_df.csv', index=False)