In [18]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import requests
import zipfile
import os

In [19]:
# Load your Dataset A with lat/lon
df_a = pd.read_csv(r"C:\Users\kayle\Desktop\DTSC-3601\Crime_Data_from_2020_to_Present(2)(1).csv")  # or whatever your source is

#Create geometry column from lat/lon - USING THE CORRECT COLUMN NAMES
geometry = [Point(xy) for xy in zip(df_a['LON'], df_a['LAT'])]

# Convert to GeoDataFrame
gdf_a = gpd.GeoDataFrame(df_a, geometry=geometry, crs="EPSG:4326")

print(f"Dataset A records: {len(gdf_a)}")
print("\nFirst few rows with geometry:")
print(gdf_a[['LAT', 'LON', 'geometry']].head())

# Check for any missing coordinates
missing_lat = gdf_a['LAT'].isna().sum()
missing_lon = gdf_a['LON'].isna().sum()
print(f"\nMissing lat values: {missing_lat}")
print(f"Missing lon values: {missing_lon}")

# Remove rows with missing coordinates if any exist
if missing_lat > 0 or missing_lon > 0:
    gdf_a = gdf_a.dropna(subset=['LAT', 'LON'])
    print(f"Records after removing missing coordinates: {len(gdf_a)}")

Dataset A records: 1004991

First few rows with geometry:
       LAT       LON                   geometry
0  34.2124 -118.4092  POINT (-118.4092 34.2124)
1  34.1993 -118.4203  POINT (-118.4203 34.1993)
2  34.1847 -118.4509  POINT (-118.4509 34.1847)
3  34.0339 -118.3747  POINT (-118.3747 34.0339)
4  33.9813 -118.4350   POINT (-118.435 33.9813)

Missing lat values: 0
Missing lon values: 0


In [20]:
def download_zipcode_boundaries():
    """Download just the ZIP code boundaries"""
    zip_url = "https://www2.census.gov/geo/tiger/TIGER2022/ZCTA520/tl_2022_us_zcta520.zip"
    filename = "tl_2022_us_zcta520.zip"
    extract_path = "./zipcode_boundaries"
    
    # Create directory if it doesn't exist
    os.makedirs(extract_path, exist_ok=True)
    
    # Download the file
    response = requests.get(zip_url, stream=True)
    
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        # Extract the zip file
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        
        print("ZIP code boundaries downloaded and extracted successfully!")
        return extract_path
    else:
        print(f"Failed to download. Status code: {response.status_code}")
        return None

# Download ZIP code boundaries
zip_path = download_zipcode_boundaries()

if zip_path:
    # Load the ZIP code boundaries
    gdf_zcta = gpd.read_file(f"{zip_path}/tl_2022_us_zcta520.shp")
else:
    # If download fails, try loading from a manually downloaded file
    manual_path = r"C:\Users\kayle\Desktop\DTSC-3601\zipcode_boundaries"
    if os.path.exists(f"{manual_path}/tl_2022_us_zcta520.shp"):
        gdf_zcta = gpd.read_file(f"{manual_path}/tl_2022_us_zcta520.shp")
    else:
        print("Please manually download the ZIP code file and place it in zipcode_boundaries/ folder")

ZIP code boundaries downloaded and extracted successfully!


In [21]:
# Perform spatial join with ZIP codes
print("Performing spatial join with ZIP codes...")

# Ensure same coordinate reference system
gdf_zcta = gdf_zcta.to_crs(gdf_a.crs)

# Perform the spatial join
gdf_merged = gpd.sjoin(gdf_a, gdf_zcta, how='left', predicate='within')

# Extract the ZIP code
gdf_merged['zip_code'] = gdf_merged['ZCTA5CE20']

print(f"Spatial join completed!")
print(f"Records matched to ZIP codes: {gdf_merged['zip_code'].notna().sum()}")
print(f"Success rate: {gdf_merged['zip_code'].notna().sum() / len(gdf_merged):.2%}")

# Show some statistics
print(f"\nZIP code coverage:")
zip_coverage = gdf_merged['zip_code'].notna().sum() / len(gdf_merged) * 100
print(f"{zip_coverage:.1f}% of crime records matched to ZIP codes")

Performing spatial join with ZIP codes...
Spatial join completed!
Records matched to ZIP codes: 1002345
Success rate: 99.74%

ZIP code coverage:
99.7% of crime records matched to ZIP codes


In [22]:
# Select final columns - keep all original columns plus the new zip_code
final_columns = [col for col in gdf_merged.columns if col != 'geometry' and not col.startswith('index_')]
final_columns.append('zip_code')  # Ensure zip_code is included

# Remove any duplicate columns
final_columns = list(dict.fromkeys(final_columns))

# Create final dataframe
df_final = gdf_merged[final_columns]

# Save to CSV
output_path = r"C:\Users\kayle\Desktop\DTSC-3601\crime_data_with_zipcodes.csv"
df_final.to_csv(output_path, index=False)

print(f"\nMerged dataset saved to: {output_path}")
print(f"Final columns: {list(df_final.columns)}")
print(f"Total records: {len(df_final)}")
print(f"Records with ZIP codes: {df_final['zip_code'].notna().sum()}")

# Show sample of results
print("\nSample of merged data (first 10 rows):")
sample_cols = ['LAT', 'LON', 'zip_code']
# Add any other important columns from your original data
for col in ['police_district', 'area_name', 'crime_type']:  # adjust to your actual column names
    if col in df_final.columns:
        sample_cols.append(col)

print(df_final[sample_cols].head(10))


Merged dataset saved to: C:\Users\kayle\Desktop\DTSC-3601\crime_data_with_zipcodes.csv
Final columns: ['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON', 'ZCTA5CE20', 'GEOID20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'zip_code']
Total records: 1004991
Records with ZIP codes: 1002345

Sample of merged data (first 10 rows):
       LAT       LON zip_code
0  34.2124 -118.4092    91605
1  34.1993 -118.4203    91605
2  34.1847 -118.4509    91411
3  34.0339 -118.3747    90034
4  33.9813 -118.4350    90292
5  34.0830 -118.1678    90032
6  34.0100 -118.2900    90037
7  34.1107 -118.2589    90039
8  34.2763 -118.5210    91344
9  34.1493 -118.5886    91364


In [23]:
import pandas as pd

# Load both datasets
crime_with_zip = pd.read_csv(r"C:\Users\kayle\Desktop\DTSC-3601\Project\crime_data_with_zipcodes.csv")
sexual_crimes = pd.read_csv(r"C:\Users\kayle\Downloads\sexual_crimes_cleaned(1).csv")

# Check the structure of both datasets first
print("Crime with zip columns:", crime_with_zip.columns.tolist())
print("Sexual crimes columns:", sexual_crimes.columns.tolist())
print("\nCrime with zip shape:", crime_with_zip.shape)
print("Sexual crimes shape:", sexual_crimes.shape)

# Check if DR_NO is unique in both datasets
print("\nDR_NO unique in crime_with_zip:", crime_with_zip['DR_NO'].nunique() == len(crime_with_zip))
print("DR_NO unique in sexual_crimes:", sexual_crimes['DR_NO'].nunique() == len(sexual_crimes))

# Perform the left join to add zip_code to sexual_crimes dataset
# We only select the DR_NO and zip_code columns from the crime_with_zip dataset
sexual_crimes_with_zip = sexual_crimes.merge(
    crime_with_zip[['DR_NO', 'zip_code']],  # Only bring these two columns
    on='DR_NO', 
    how='left'
)

# Check the result
print("\nAfter merge shape:", sexual_crimes_with_zip.shape)
print("Missing zip codes:", sexual_crimes_with_zip['zip_code'].isnull().sum())

# Save the updated dataset
sexual_crimes_with_zip.to_csv(r'C:\Users\kayle\Downloads\sexual_crimes_with_zip.csv', index=False)

Crime with zip columns: ['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON', 'ZCTA5CE20', 'GEOID20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'zip_code']
Sexual crimes columns: ['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'Vict Descent Full', 'Vict Age Group', 'Time_Formatted', 'Hour', 'Time_Category', 'Date_Rptd', 'DATE_OCC', 'Year', 'Month', 'DayOfWeek', 'Reporting_Delay', 'Delay_Category']

Crime with zip shape: (100