In [1]:
# Crime Hotspot & Safe-Route Recommendation  
# **Author:** Dheer N Raijada  
# **Date:** 21 May 2025  

# **Goal:**  
# 1. Train a crime-hotspot model (KMeans + risk score)  
# 2. Build a `get_safe_route(src, dst)` function that returns:  
#    - the safest route  
#    - its overall risk  
#    - any high-risk hotspots along that path  


In [2]:
!pip install googlemaps
!pip install polyline

Collecting googlemaps
  Downloading googlemaps-4.10.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.10.0-py3-none-any.whl size=40714 sha256=5fb504521dfda4d84f55c3382aadd9c9e7316c5a7e936a911ba55ac6f6ea9bdc
  Stored in directory: /root/.cache/pip/wheels/f1/09/77/3cc2f5659cbc62341b30f806aca2b25e6a26c351daa5b1f49a
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.10.0
Collecting polyline
  Downloading polyline-2.0.2-py3-none-any.whl.metadata (6.4 kB)
Downloading polyline-2.0.2-py3-none-any.whl (6.0 kB)
Installing collected packages: polyline
Successfully installed polyline-2.0.2


In [3]:
# Data handling & visuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Mapping
import folium
from folium.plugins import MarkerCluster

# ML
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import joblib   # for saving/loading models

# Geocoding & Routing (install geopy and googlemaps in Kaggle settings or via pip)
from geopy.geocoders import Nominatim
import googlemaps

# Polyline sampling
import polyline  # pip install polyline
print("done importing")

done importing


In [4]:
DATA_PATH = '/kaggle/input/crime-dataset/cleaned_crime_data_pruned.csv'
df = pd.read_csv(DATA_PATH)

print("Rows,Cols:", df.shape)
df.head()


Rows,Cols: (870067, 6)


Unnamed: 0,date,time,crime_type,latitude,longitude,area
0,2016-12-31 23:59:00,23:59:00,THEFT,41.97629,-87.905227,AIRPORT BUILDING NON-TERMINAL - SECURE AREA
1,2016-12-31 23:58:00,23:58:00,BATTERY,41.688033,-87.623931,RESIDENCE
2,2016-12-31 23:55:00,23:55:00,LIQUOR LAW VIOLATION,41.936885,-87.66477,RESIDENCE
3,2016-12-31 23:55:00,23:55:00,BATTERY,41.886815,-87.625593,HOTEL/MOTEL
4,2016-12-31 23:54:00,23:54:00,CRIMINAL DAMAGE,41.752307,-87.619798,RESIDENCE


In [5]:
# Keep only rows with needed fields
df = df.dropna(subset=['date','time','crime_type','latitude','longitude','area'])
df = df.drop_duplicates()

print("After cleaning:", df.shape)


After cleaning: (870067, 6)


In [6]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['Hour'] = df['date'].dt.hour
df['DayOfWeek'] = df['date'].dt.dayofweek  # Monday=0…Sunday=6
df[['date','Hour','DayOfWeek']].head()


Unnamed: 0,date,Hour,DayOfWeek
0,2016-12-31 23:59:00,23,5
1,2016-12-31 23:58:00,23,5
2,2016-12-31 23:55:00,23,5
3,2016-12-31 23:55:00,23,5
4,2016-12-31 23:54:00,23,5


In [7]:
from sklearn.preprocessing import QuantileTransformer

# --- 1) Extract Hour from `time` column ---
df['Hour'] = pd.to_datetime(df['time'], format='%H:%M:%S', errors='coerce').dt.hour

# --- 2) TimeWeight: 1.5 at night, 1.0 during day ---
df['TimeWeight'] = df['Hour'].apply(lambda h: 1.5 if (h >= 20 or h <= 5) else 1.0)

# --- 3) Updated severity map ---
severity_map = {
    'HOMICIDE': 10, 'CRIM SEXUAL ASSAULT': 9, 'KIDNAPPING': 9,
    'OFFENSE INVOLVING CHILDREN': 8, 'SEX OFFENSE': 8, 'ASSAULT': 8,
    'ROBBERY': 7, 'BATTERY': 7, 'STALKING': 7, 'WEAPONS VIOLATION': 7,
    'ARSON': 6, 'BURGLARY': 6, 'MOTOR VEHICLE THEFT': 5, 'CRIMINAL DAMAGE': 5,
    'DECEPTIVE PRACTICE': 4, 'THEFT': 4, 'NARCOTICS': 4, 'OTHER NARCOTIC VIOLATION': 3,
    'CRIMINAL TRESPASS': 2, 'LIQUOR LAW VIOLATION': 2, 'GAMBLING': 2,
    'PUBLIC PEACE VIOLATION': 2, 'PUBLIC INDECENCY': 2,
    'OBSCENITY': 1, 'PROSTITUTION': 1, 'NON-CRIMINAL': 1, 'NON - CRIMINAL': 1,
    'NON-CRIMINAL (SUBJECT SPECIFIED)': 1, 'OTHER OFFENSE': 1
}

# --- 4) Apply severity ---
df['Severity'] = df['crime_type'].map(severity_map).fillna(0)

# --- 5) Validate mapping ---
still_zero = set(df.loc[df['Severity'] == 0, 'crime_type'].unique())
still_one = set(df.loc[df['Severity'] == 1, 'crime_type'].unique())
print("Mapped to 0 (non-crimes):", still_zero)
print("Mapped to 1 (very minor only):", still_one)

# --- 6) Compute RiskScore ---
df['RiskScore'] = df['Severity'] * df['TimeWeight']

# --- ✅ 7) Quantile-based scaling ---
scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df['RiskScaled'] = scaler.fit_transform(df[['RiskScore']])

# --- 8) Peek at distribution ---
print(df['RiskScaled'].describe())


Mapped to 0 (non-crimes): {'INTIMIDATION', 'INTERFERENCE WITH PUBLIC OFFICER', 'CONCEALED CARRY LICENSE VIOLATION', 'HUMAN TRAFFICKING'}
Mapped to 1 (very minor only): {'NON-CRIMINAL', 'OTHER OFFENSE', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'PROSTITUTION', 'NON - CRIMINAL', 'OBSCENITY'}
count    870067.000000
mean          0.499234
std           0.285614
min           0.000000
25%           0.234234
50%           0.509009
75%           0.746747
max           1.000000
Name: RiskScaled, dtype: float64


In [8]:
from sklearn.cluster import KMeans

# Use more clusters (better granularity)
coords = df[['latitude', 'longitude']]
kmeans = KMeans(n_clusters=50, random_state=42, n_init=10)
kmeans.fit(coords)

# Assign cluster ID to each crime
df['Cluster'] = kmeans.labels_

# Calculate average risk per cluster
cluster_risk = df.groupby('Cluster')['RiskScaled'].mean().to_dict()

# View risk spread
print("Top 5 clusters by risk:", sorted(cluster_risk.items(), key=lambda x: x[1], reverse=True)[:5])
print("Bottom 5 clusters by risk:", sorted(cluster_risk.items(), key=lambda x: x[1])[:5])


Top 5 clusters by risk: [(34, 0.5517111576076679), (18, 0.54305724978827), (36, 0.5427255839739472), (23, 0.5426304715825142), (48, 0.5416536612549901)]
Bottom 5 clusters by risk: [(20, 0.34725771490105917), (40, 0.3842824106138375), (37, 0.43550158361036206), (19, 0.43580433931311124), (4, 0.4360708088044211)]


In [9]:
from geopy.geocoders import Nominatim
import googlemaps
from kaggle_secrets import UserSecretsClient

# ✅ Securely load the API key from Kaggle Secrets
user_secrets = UserSecretsClient()
GMAPS_API_KEY = user_secrets.get_secret("GMAPS_API_KEY")

# Initialize services
geolocator = Nominatim(user_agent="crime_app")
gmaps = googlemaps.Client(key=GMAPS_API_KEY)
print("done")

done


In [10]:
# Use Google Maps Geocoder instead of Nominatim
def geocode_addr(addr):
    result = gmaps.geocode(addr)
    if result:
        loc = result[0]['geometry']['location']
        return (loc['lat'], loc['lng'])
    else:
        raise ValueError(f"Could not geocode address: {addr}")


In [11]:
# Decodes polyline into a list of lat/lng points
import polyline

def sample_polyline(encoded, step_m=200):
    return polyline.decode(encoded)
print("done")

done


In [12]:
# Real Google Maps route fetcher (walking, with alternatives)
def fetch_routes(src, dst):
    """
    Returns a list of dicts:
      - polyline: encoded overview path
      - distance_km
      - duration_min
    """
    # src, dst are tuples: (lat, lng)
    directions = gmaps.directions(src, dst, alternatives=True, mode="driving")
    
    routes = []
    for leg in directions:
        routes.append({
            "polyline": leg['overview_polyline']['points'],
            "distance_km": leg['legs'][0]['distance']['value'] / 1000.0,
            "duration_min": leg['legs'][0]['duration']['value'] / 60.0
        })
    
    return routes


In [13]:
# After clustering
import joblib
joblib.dump(kmeans, 'kmeans_model.pkl')
joblib.dump(cluster_risk, 'cluster_risk_lookup.pkl')
import joblib

kmeans = joblib.load('kmeans_model.pkl')
cluster_risk = joblib.load('cluster_risk_lookup.pkl')

joblib.dump(scaler, 'risk_scaler.pkl')

HIGH_RISK_THRESH = 0.53  # anything above this is considered a hotspot


In [31]:
from collections import defaultdict

def categorize_relative(sorted_risks, score):
    min_r = sorted_risks[0]
    max_r = sorted_risks[-1]
    if np.isclose(max_r, min_r):
        return "Low"
    norm_score = (score - min_r) / (max_r - min_r)
    if norm_score < 0.33:
        return "Low"
    elif norm_score < 0.66:
        return "Medium"
    else:
        return "High"

def get_safe_route(src_input, dst_input):
    # Geocode addresses if strings
    if isinstance(src_input, str):
        src = geocode_addr(src_input)
    else:
        src = src_input
    if isinstance(dst_input, str):
        dst = geocode_addr(dst_input)
    else:
        dst = dst_input

    # Fetch routes (replace this with real Google API fetch if needed)
    routes = fetch_routes(src, dst)
    results = []

    for r in routes:
        waypoints = sample_polyline(r['polyline'])
        risks = []

        for lat, lng in waypoints:
            cid = kmeans.predict(pd.DataFrame([[lat, lng]], columns=['latitude', 'longitude']))[0]
            risk = cluster_risk.get(cid, 0)
            risks.append(risk)

        overall = float(np.mean(risks))
        hotspots = [
            {'lat': lat, 'lng': lng, 'risk': float(risk)}
            for (lat, lng), risk in zip(waypoints, risks) if risk > HIGH_RISK_THRESH
        ]

        results.append({
            "polyline": r['polyline'],
            "eta": r['duration_min'],
            "distance": r['distance_km'],
            "overall_risk": overall,
            "hotspots": hotspots
        })

    # Sort risks and assign relative levels
    risk_values = sorted([r['overall_risk'] for r in results])
    for route in results:
        route['risk_level'] = categorize_relative(risk_values, route['overall_risk'])

    # Select one route for each category (Low, Medium, High)
    routes_by_level = defaultdict(list)
    for route in results:
        routes_by_level[route['risk_level']].append(route)
    
    # Select the route with the lowest risk in each category
    selected_routes = []
    for level in ['Low', 'Medium', 'High']:
        if level in routes_by_level:
            best_route = min(routes_by_level[level], key=lambda r: r['overall_risk'])
            selected_routes.append(best_route)

    # If not enough unique levels, just pick any additional routes
    if len(selected_routes) < 3:
        remaining = [r for r in results if r not in selected_routes]
        remaining_sorted = sorted(remaining, key=lambda r: r['overall_risk'])
        for r in remaining_sorted:
            selected_routes.append(r)
            if len(selected_routes) == 3:
                break

    # Recompute hotspots (optional refresh)
    for route in selected_routes:
        waypoints = sample_polyline(route['polyline'])
        hotspots = [
            {'lat': lat, 'lng': lng, 'risk': float(cluster_risk.get(
                kmeans.predict(pd.DataFrame([[lat, lng]], columns=['latitude', 'longitude']))[0], 0))}
            for lat, lng in waypoints if cluster_risk.get(
                kmeans.predict(pd.DataFrame([[lat, lng]], columns=['latitude', 'longitude']))[0], 0) > HIGH_RISK_THRESH
        ]
        route['hotspots'] = hotspots

    recommended_route = selected_routes[0] if selected_routes else None
    alternative_routes = selected_routes[1:]

    return {
        "recommended_route": recommended_route,
        "alternative_routes": alternative_routes,
        "hotspots": recommended_route['hotspots'] if recommended_route else []
    }


In [43]:
# 🔹 Ask user for source and destination
source = input("Enter source location (e.g., 'MG Road, Bangalore'): ")
destination = input("Enter destination location (e.g., 'Koramangala, Bangalore'): ")

# 🔹 Get safest route
result = get_safe_route(source, destination)
print("\n🧭 ROUTE OPTIONS:")

all_routes = [result['recommended_route']] + result['alternative_routes']
for i, r in enumerate(all_routes):
    label = " (Recommended)" if i == 0 else ""
    print(f"\nOption {i+1}{label}") 
    print(f"Distance     : {r['distance']:.2f} km")
    print(f"ETA          : {r['eta']:.1f} minutes")
    print(f"Risk Level   : {r['risk_level']} ({r['overall_risk']:.2f})")
    print(f"Hotspots     : {len(r['hotspots'])}")

# Extract fields
route = result['recommended_route']
hotspots = result['hotspots']

# 🔹 Print user-friendly route summary
print("\n🧭 SAFE ROUTE SUMMARY")
print(f"From        : {source}")
print(f"To          : {destination}")
print(f"Distance    : {route['distance']:.2f} km")
print(f"ETA         : {route['eta']:.1f} minutes")
print(f"Risk Level  : {route['risk_level']} ({route['overall_risk']:.2f})")
print(f"Hotspots    : {len(hotspots)} found")

# 🔹 Hotspot detail display
if not hotspots:
   print(f"\n📍 The route intersects distinct high-risk zones.")

print("\n🧭 Hotspot Zone Details:")
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="crime_hotspot_reverse")

shown_clusters = set()

for hs in hotspots:
    lat, lng = hs['lat'], hs['lng']
    
    # Predict cluster
    cid = kmeans.predict(pd.DataFrame([[lat, lng]], columns=['latitude', 'longitude']))[0]
    if cid in shown_clusters:
        continue
    shown_clusters.add(cid)

    # Reverse geocode
    try:
        g_result = gmaps.reverse_geocode((lat, lng))
        area = g_result[0]['formatted_address'] if g_result else "Unknown area"

    except:
        area = "Unknown area"

    # Print nicely
    print(f"\n📍 Hotspot Zone: Cluster {cid}")
    print(f"Address    : {area}")
    print(f"Coordinates: Lat {lat:.5f}, Lng {lng:.5f}")


Enter source location (e.g., 'MG Road, Bangalore'):  Gage Park
Enter destination location (e.g., 'Koramangala, Bangalore'):  Paw Paw Township



🧭 ROUTE OPTIONS:

Option 1 (Recommended)
Distance     : 231.94 km
ETA          : 141.8 minutes
Risk Level   : Low (0.54)
Hotspots     : 273

Option 2
Distance     : 210.54 km
ETA          : 133.6 minutes
Risk Level   : High (0.54)
Hotspots     : 274

Option 3
Distance     : 203.17 km
ETA          : 124.3 minutes
Risk Level   : High (0.54)
Hotspots     : 276

🧭 SAFE ROUTE SUMMARY
From        : Gage Park
To          : Paw Paw Township
Distance    : 231.94 km
ETA         : 141.8 minutes
Risk Level  : Low (0.54)
Hotspots    : 273 found

🧭 Hotspot Zone Details:

📍 Hotspot Zone: Cluster 7
Address    : Kedzie & 55th Street, Chicago, IL, USA
Coordinates: Lat 41.79352, Lng -87.70356

📍 Hotspot Zone: Cluster 5
Address    : Garfield & Western Boulevard, Chicago, IL 60636, USA
Coordinates: Lat 41.79345, Lng -87.68280

📍 Hotspot Zone: Cluster 36
Address    : Garfield & Lowe, Chicago, IL, USA
Coordinates: Lat 41.79403, Lng -87.64151

📍 Hotspot Zone: Cluster 17
Address    : 6431 S Wentworth Ave, Chi

In [44]:
import folium
import polyline
import pandas as pd
from folium import LayerControl

# Decode routes
all_routes = [result['recommended_route']] + result['alternative_routes']
route_labels = ["✅ Recommended Route"] + [f"❗ Alternative Route {i+1}" for i in range(len(result['alternative_routes']))]

# Start map at first coordinate of recommended route
start_coords = polyline.decode(all_routes[0]['polyline'])[0]
m = folium.Map(location=start_coords, zoom_start=14)

# Function to color route lines
def get_color(risk_level):
    return {'Low': 'green', 'Medium': 'orange', 'High': 'red'}.get(risk_level, 'blue')

# Add each route
for i, route in enumerate(all_routes):
    coords = polyline.decode(route['polyline'])
    color = get_color(route['risk_level'])

    fg = folium.FeatureGroup(name=f"{route_labels[i]} (Risk: {route['risk_level']})")

    # Add route with route-level popup
    popup_html = f"""
    <b>🛣️ Route Details</b><br>
    Source: {source}<br>
    Destination: {destination}<br>
    Distance: {route['distance']:.2f} km<br>
    ETA: {route['eta']:.1f} minutes<br>
    Overall Risk: {route['overall_risk']:.2f}<br>
    Risk Level: <b>{route['risk_level']}</b><br>
    Hotspots: {len(route['hotspots'])}
    """

    folium.PolyLine(
        coords,
        color=color,
        weight=5,
        popup=folium.Popup(popup_html, max_width=300)
    ).add_to(fg)

    # Add source marker
    folium.Marker(
        location=coords[0],
        popup=folium.Popup(f"<b>📍 Source</b><br>{source}", max_width=250),
        icon=folium.Icon(color="blue", icon="play")
    ).add_to(fg)

    # Add destination marker
    folium.Marker(
        location=coords[-1],
        popup=folium.Popup(
            f"<b>🏁 Destination</b><br>{destination}<br>"
            f"Lat: {coords[-1][0]:.4f}, Lng: {coords[-1][1]:.4f}<br>"
            f"ETA: {route['eta']:.1f} min<br>Distance: {route['distance']:.2f} km",
            max_width=300),
        icon=folium.Icon(color="red", icon="flag")
    ).add_to(fg)

    # Limit hotspots per route (fewer on recommended)
    shown_clusters = set()
    hotspot_shown = 0
    MAX_HOTSPOTS = 2 if i == 0 else 5  # ✅ 2 for recommended, 5 for alternatives

    for hs in route['hotspots']:
        if hotspot_shown >= MAX_HOTSPOTS:
            break

        lat, lng = hs['lat'], hs['lng']
        point = pd.DataFrame([[lat, lng]], columns=['latitude', 'longitude'])
        cid = kmeans.predict(point)[0]

        if cid in shown_clusters:
            continue

        shown_clusters.add(cid)
        hotspot_shown += 1

        # ✅ Get top crime types in this cluster
        cluster_crimes = df[df['Cluster'] == cid]['crime_type'].value_counts().head(3)
        crime_list = "<br>".join([f"• {crime} ({count})" for crime, count in cluster_crimes.items()])

        # Style based on route type
        if i == 0:
            marker_color = 'blue'
            halo_color = 'blue'
            fill_opacity = 0.4
            halo_opacity = 0.15
        else:
            marker_color = 'red'
            halo_color = 'red'
            fill_opacity = 0.9
            halo_opacity = 0.25

        # 📍 Hotspot marker with crime types
        popup_text = f"""
        <b>🔥 High-Risk Zone</b><br>
        Cluster ID: {cid}<br>
        Risk Score: {hs['risk']:.2f}<br>
        <b>Top Crimes:</b><br>{crime_list}
        """

        folium.CircleMarker(
            location=(lat, lng),
            radius=6,
            color=marker_color,
            fill=True,
            fill_opacity=fill_opacity,
            popup=folium.Popup(popup_text, max_width=300)
        ).add_to(fg)

        # 🟥 200m Halo
        folium.Circle(
            location=(lat, lng),
            radius=200,
            color=halo_color,
            fill=True,
            fill_opacity=halo_opacity,
            stroke=False
        ).add_to(fg)

    fg.add_to(m)

# Add route toggle control
folium.LayerControl(collapsed=False).add_to(m)

# Display the final map
m
