# Preprocess accessibility_coordinates.csv

**Imputation strategy:** Group median by `issue_type` + spatial bin
- For rows with missing severity: use median severity of same issue_type in same spatial bin
- Fallback: if bin has no data, use median by issue_type only
- Severity clipped to 1â€“4

In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('/Users/devarshee/Desktop/Datathon/accessibility_coordinates.csv')
print(f"Shape: {df.shape}")
print(f"Missing severity: {df['severity'].isna().sum()}")
df.head()

Shape: (81973, 6)
Missing severity: 2251


Unnamed: 0,longitude,latitude,issue_type,severity,X,Y
0,-122.298981,47.594616,SurfaceProblem,4.0,1278807.0,220339.305108
1,-122.301071,47.593357,SurfaceProblem,3.0,1278282.0,219890.033975
2,-122.301079,47.596844,SurfaceProblem,4.0,1278304.0,221161.640591
3,-122.301071,47.5965,SurfaceProblem,4.0,1278304.0,221036.395318
4,-122.306274,47.59993,NoCurbRamp,4.0,1277044.0,222311.62328


In [2]:
# Create spatial bins (~1km grid: 0.01 degrees)
# Smaller bin = finer granularity, larger = more data per bin (less fallback)
BIN_SIZE = 0.01
df['lat_bin'] = (df['latitude'] // BIN_SIZE) * BIN_SIZE
df['lon_bin'] = (df['longitude'] // BIN_SIZE) * BIN_SIZE
df[['latitude', 'lat_bin', 'longitude', 'lon_bin']].head(10)

Unnamed: 0,latitude,lat_bin,longitude,lon_bin
0,47.594616,47.59,-122.298981,-122.3
1,47.593357,47.59,-122.301071,-122.31
2,47.596844,47.59,-122.301079,-122.31
3,47.5965,47.59,-122.301071,-122.31
4,47.59993,47.59,-122.306274,-122.31
5,47.600765,47.6,-122.310371,-122.32
6,47.595486,47.59,-122.300201,-122.31
7,47.600323,47.6,-122.300064,-122.31
8,47.600834,47.6,-122.30098,-122.31
9,47.601597,47.6,-122.298859,-122.3


In [3]:
# Compute median severity by (issue_type, lat_bin, lon_bin)
# Only use rows WITH severity to compute the median
median_by_spatial = df.groupby(['issue_type', 'lat_bin', 'lon_bin'])['severity'].transform(
    lambda x: x.dropna().median() if x.notna().any() else np.nan
)

# Fallback: median by issue_type only (when bin has no data)
median_by_issue_type = df.groupby('issue_type')['severity'].transform('median')

# Apply: first try spatial bin, then fallback to issue_type median
df['severity_imputed'] = df['severity'].fillna(median_by_spatial).fillna(median_by_issue_type)

# Replace original severity with imputed (keep imputed for rows that were missing)
df['severity'] = df['severity_imputed']
df = df.drop(columns=['severity_imputed'])

print(f"Missing severity after imputation: {df['severity'].isna().sum()}")

Missing severity after imputation: 253


In [4]:
# Clip severity to valid range (1-4) - handles outliers like 5.0
df['severity'] = df['severity'].clip(1, 4)
print("Severity distribution after imputation:")
print(df['severity'].value_counts().sort_index())

Severity distribution after imputation:
severity
1.0    17945
1.5       11
2.0    14899
2.5       18
3.0    23955
3.5       26
4.0    24866
Name: count, dtype: int64


In [None]:
### Add population demographics

In [None]:
# Load Access dataset to get neighborhood for each (lon, lat)
access = pd.read_csv('/Users/devarshee/Desktop/Datathon/Data/Access_to_Everyday_Life_Dataset.csv')
access_coords = access[['geometry/coordinates/0', 'geometry/coordinates/1', 'properties/neighborhood']].copy()
access_coords.columns = ['longitude', 'latitude', 'neighborhood']

# Merge on coordinates (round to 6 decimals for matching)
df['lon_round'] = df['longitude'].round(6)
df['lat_round'] = df['latitude'].round(6)
access_coords['lon_round'] = access_coords['longitude'].round(6)
access_coords['lat_round'] = access_coords['latitude'].round(6)

df = df.merge(access_coords[['lon_round', 'lat_round', 'neighborhood']], 
              on=['lon_round', 'lat_round'], how='left')
df = df.drop(columns=['lon_round', 'lat_round'])
print(f"Rows with neighborhood: {df['neighborhood'].notna().sum()}")

In [None]:
# Load demographics and select key columns
demo = pd.read_csv('/Users/devarshee/Desktop/Datathon/Data/demographics_basic_age_sex_Neighborhoods.csv')
demo_sub = demo[['Neighborhood Name', 'Total', 'Children_under_18', 'Working_Age_Adults_18_64', 
                 'Older_Adults_65_over', 'Median Age Total']].copy()
demo_sub.columns = ['demo_neighborhood', 'population_total', 'children_under_18', 
                    'working_age_adults', 'older_adults_65_over', 'median_age']

# Map Access neighborhood to Demographics neighborhood (partial match)
# Access uses names like "Atlantic", "Madrona"; Demographics uses "Alki/Admiral", "Madrona/Leschi"
def map_neighborhood(access_neigh):
    if pd.isna(access_neigh):
        return None
    access_neigh = str(access_neigh).strip()
    for demo_name in demo_sub['demo_neighborhood'].unique():
        if access_neigh in demo_name or demo_name in access_neigh:
            return demo_name
        # Handle compound names: "Madrona" -> "Madrona/Leschi"
        if access_neigh in demo_name.split('/'):
            return demo_name
    return None

df['demo_neighborhood'] = df['neighborhood'].apply(map_neighborhood)
match_rate = df['demo_neighborhood'].notna().mean() * 100
print(f"Matched to demographics: {match_rate:.1f}%")

### Save preprocessed data

In [5]:
# Drop spatial bin columns before saving (used only for imputation)
df_out = df.drop(columns=['lat_bin', 'lon_bin'])

# Save preprocessed data
output_path = '/Users/devarshee/Desktop/Datathon/accessibility_coordinates_preprocessed.csv'
df_out.to_csv(output_path, index=False)
print(f"Saved preprocessed data to {output_path}")
df_out.head()

Saved preprocessed data to /Users/devarshee/Desktop/Datathon/accessibility_coordinates_preprocessed.csv


Unnamed: 0,longitude,latitude,issue_type,severity,X,Y
0,-122.298981,47.594616,SurfaceProblem,4.0,1278807.0,220339.305108
1,-122.301071,47.593357,SurfaceProblem,3.0,1278282.0,219890.033975
2,-122.301079,47.596844,SurfaceProblem,4.0,1278304.0,221161.640591
3,-122.301071,47.5965,SurfaceProblem,4.0,1278304.0,221036.395318
4,-122.306274,47.59993,NoCurbRamp,4.0,1277044.0,222311.62328
