In [18]:
# Import necessary libraries
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import time
import difflib
import re

# --- 1. Data Loading and Preprocessing ---

# Fetch the two NYC Open Data CSV extracts (each request is capped at 10,000 rows)
url1 = "https://data.cityofnewyork.us/resource/qnem-b8re.csv?$limit=10000"
url2 = "https://data.cityofnewyork.us/resource/enfh-gkve.csv?$limit=10000"

# Download in order: url1 -> df1, url2 -> df2
df1, df2 = (pd.read_csv(source) for source in (url1, url2))

# Left-join on the shared property identifier so every row of df1 is kept
df = df1.merge(df2, how="left", on="gispropnum")

# Dataset columns that flag whether a facility supports a given sport.
# The order is significant: it drives both matching order and display order.
sports_list = [
    "adult_baseball",
    "adult_football",
    "adult_softball",
    "basketball",
    "bocce",
    "cricket",
    "flagfootball",
    "frisbee",
    "handball",
    "hockey",
    "kickball",
    "lacrosse",
    "ll_baseb_12andunder",
    "ll_baseb_13andolder",
    "ll_softball",
    "netball",
    "nonregulation_soccer",
    "regulation_soccer",
    "rugby",
    "t_ball",
    "tennis",
    "track_and_field",
    "volleyball",
    "wheelchairfootball",
    "youth_football",
    "pickleball",
]

# Keep active facilities only, trimmed to the columns the finder actually uses
_active_rows = df['featurestatus'] == 'Active'
_kept_columns = [
    'borough_y', 'zipcode_y', 'address', 'gispropnum', 'name311', 'multipolygon_y',
] + sports_list

df = (
    df.loc[_active_rows, _kept_columns]
    # Replace the merge-suffixed / dataset-specific names with plain ones
    .rename(columns={
        'borough_y': 'borough',
        'zipcode_y': 'zipcode',
        'name311': 'name',
        'multipolygon_y': 'multipolygon',
    })
    # Expand single-letter borough codes to full names (unmapped codes become NaN)
    .assign(borough=lambda frame: frame['borough'].map({
        'B': 'Brooklyn', 'Q': 'Queens', 'R': 'Staten Island',
        'M': 'Manhattan', 'X': 'Bronx'
    }))
    # Keep one row per (property id, facility name) pair
    .drop_duplicates(subset=['gispropnum', 'name'])
)


# --- 2. (Performance Fix) Pre-calculate coordinates from 'multipolygon' ---

# One decimal number as it may appear in WKT: optional sign, integer or
# fractional digits, optional exponent.
_WKT_NUMBER = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

# First whitespace-separated "x y" pair. The look-behind stops a match from
# starting in the middle of a token (e.g. inside the exponent of "1.5e-3").
_WKT_COORD_PAIR = re.compile(
    r'(?<![\w.+-])(' + _WKT_NUMBER + r')\s+(' + _WKT_NUMBER + r')'
)


def get_first_coord_from_wkt(wkt_str):
    """
    Extracts the first coordinate pair from a WKT string using regex.

    WKT stores each vertex as "x y", i.e. longitude first, so the two numbers
    are swapped before being returned.

    Args:
        wkt_str: WKT geometry text. Any non-string value (None, NaN, ...) is
            treated as missing.

    Returns:
        A (latitude, longitude) tuple of floats, or (None, None) when the
        input is not a string or contains no coordinate pair.
    """
    if not isinstance(wkt_str, str):
        return None, None

    match = _WKT_COORD_PAIR.search(wkt_str)
    if match is None:
        return None, None

    longitude, latitude = (float(number) for number in match.groups())
    return latitude, longitude

print("Pre-calculating coordinates for all facilities...")
# Parse each geometry once and build both columns in a single DataFrame.
# Wrapping every row's result in its own pd.Series (the previous approach)
# is far slower and does not yield two columns when the frame is empty.
_first_coords = [get_first_coord_from_wkt(wkt) for wkt in df['multipolygon']]
df[['latitude', 'longitude']] = pd.DataFrame(
    _first_coords,
    index=df.index,  # same index as df, so the assignment lines up row for row
    columns=['latitude', 'longitude'],
    dtype=float,  # missing coordinates (None) become NaN
)
print("Coordinates calculated successfully.")


# --- 3. Helper Functions ---

# Create a single geolocator instance to reuse across searches.
# An explicit timeout is set because geopy's default is short and requests to
# the public Nominatim service can exceed it, surfacing as geocoding errors.
geolocator = Nominatim(user_agent="nyc_sports_finder_unified", timeout=10)

def match_sport_columns(sport: str, all_sports: list) -> list[str]:
    """
    Matches user input sport to the available sport columns using fuzzy matching.
    """
    if not isinstance(sport, str) or not sport.strip(): return []
    norm_sport = sport.lower().strip().replace(' ', '_').replace('-', '_')
    
    matches = [s for s in all_sports if norm_sport in s or s in norm_sport]
    if matches:
        return matches
    
    return difflib.get_close_matches(norm_sport, all_sports, n=5, cutoff=0.6)

# Lower-cased string forms that count as "yes". '1.0' covers flag columns that
# pandas parsed as floats (for example a 1/0 column containing missing values).
_TRUTHY_TOKENS = ['1', '1.0', 'true', 'yes', 'y', 't']


def is_truthy_series(series: pd.Series) -> pd.Series:
    """
    Flags which entries of a pandas Series represent a truthy value.

    Each value is converted to text, trimmed and lower-cased, then compared
    against the accepted tokens: '1', '1.0', 'true', 'yes', 'y', 't'. This
    covers booleans, integer and float 1, and common yes/no spellings.
    Missing values are never truthy.

    Args:
        series: Column of flag values of any dtype.

    Returns:
        A boolean Series with the same index as ``series``.
    """
    normalized = series.astype(str).str.strip().str.lower()
    # Mask missing values explicitly instead of filling them with a string,
    # which would mix strings into numeric/boolean columns.
    return normalized.isin(_TRUTHY_TOKENS) & series.notna()


# --- 4. Main Search Function (Updated and Unified Logic) ---

def find_nearby_facilities(location_input, sport, df=df, top_n=5):
    """
    Return the facilities offering ``sport`` that lie closest to a location.

    Args:
        location_input: Address or zipcode as text; it is geocoded with
            ", New York, NY, USA" appended.
        sport: Free-text sport name, resolved via ``match_sport_columns``.
        df: Facilities table; must provide 'name', 'address', 'latitude' and
            'longitude' columns plus the sport flag columns.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame of up to ``top_n`` rows sorted by the added
        'distance_miles' column, or None when an input is invalid, the sport
        or location cannot be resolved, or geocoding raises. A message
        explaining the None is printed in each case.
    """
    # --- Reject blank or non-string inputs up front ---
    location_ok = isinstance(location_input, str) and bool(location_input.strip())
    if not location_ok:
        print("Please enter a valid location (zipcode or address).")
        return None
    sport_ok = isinstance(sport, str) and bool(sport.strip())
    if not sport_ok:
        print("Please enter a valid sport.")
        return None

    # --- Usable rows: complete records, one per (name, address) ---
    required_fields = ['name', 'address', 'latitude', 'longitude']
    usable = df.dropna(subset=required_fields)
    usable = usable.drop_duplicates(subset=['name', 'address'])

    # --- Resolve the sport to its flag columns ---
    sport_columns = match_sport_columns(sport, sports_list)
    if not sport_columns:
        print(f"Sorry, '{sport}' facilities could not be found.")
        return None

    # A facility qualifies if any matched column that exists in the table is truthy
    offers_sport = pd.Series(False, index=usable.index)
    for column in (c for c in sport_columns if c in usable.columns):
        offers_sport = offers_sport | is_truthy_series(usable[column])

    sport_facilities = usable[offers_sport].copy()
    if sport_facilities.empty:
        print(f"No facilities found for '{sport}'.")
        return None

    # --- Turn the user's address or zipcode into coordinates ---
    print(f"Finding coordinates for your location: '{location_input}'...")
    try:
        # The city/state/country suffix narrows the lookup to New York City
        geocoded = geolocator.geocode(f"{location_input}, New York, NY, USA")
        if not geocoded:
            print(f"Could not find the location '{location_input}'. Please try again.")
            return None
        user_coords = (geocoded.latitude, geocoded.longitude)
    except Exception as e:
        print(f"An error occurred during geocoding: {e}")
        return None

    # --- Geodesic distance from the user to every candidate facility ---
    print("Calculating distances from your location to all facilities...")
    facility_points = zip(sport_facilities['latitude'], sport_facilities['longitude'])
    sport_facilities['distance_miles'] = [
        geodesic(user_coords, (lat, lon)).miles for lat, lon in facility_points
    ]

    # Nearest first, then keep the leading top_n rows
    ranked = sport_facilities.sort_values('distance_miles')
    result = ranked.head(top_n)

    print(f"\nFound the {len(result)} closest '{sport}' facilities to '{location_input}':")
    return result


# --- 5. Main execution block ---

if __name__ == "__main__":
    print("🏀 NYC Sports Facility Finder 🎾")

    # Show a sample of the supported sports, leaving out the league-specific
    # ('ll_') and 'adult' variants to keep the list short
    print("\nAvailable sports include:")
    general_sports = [
        s for s in sports_list if not ('ll_' in s or 'adult' in s)
    ]
    display_sports = sorted(s.replace('_', ' ').title() for s in general_sports)
    print(", ".join(display_sports[:15]) + ", etc.")

    print("-" * 30)

    # Collect the two search inputs from the user
    location = input("Enter your location (address or 5-digit zipcode): ").strip()
    sport = input("Enter a sport: ").strip()

    # One search function handles both addresses and zipcodes
    results_df = find_nearby_facilities(location, sport, df=df, top_n=5)

    has_results = isinstance(results_df, pd.DataFrame) and not results_df.empty
    if not has_results:
        print("\nNo matching facilities were found.")
    else:
        print("\n--- Search Results ---")

        # Human-readable distance, two decimal places
        results_df['distance'] = results_df['distance_miles'].map('{:.2f} miles'.format)

        # Print only the display columns that are actually present
        display_cols = ['name', 'address', 'zipcode', 'borough', 'distance']
        final_cols = [col for col in display_cols if col in results_df.columns]

        print(results_df[final_cols].to_string(index=False))

Pre-calculating coordinates for all facilities...
Coordinates calculated successfully.
🏀 NYC Sports Facility Finder 🎾

Available sports include:
Basketball, Bocce, Cricket, Flagfootball, Frisbee, Handball, Hockey, Kickball, Lacrosse, Netball, Nonregulation Soccer, Pickleball, Regulation Soccer, Rugby, T Ball, etc.
------------------------------
Finding coordinates for your location: '11201'...
Calculating distances from your location to all facilities...

Found the 5 closest 'basketball' facilities to '11201':

--- Search Results ---
              name               address      zipcode  borough   distance
   Adam Yauch Park     46 COLUMBIA PLACE        11201 Brooklyn 0.36 miles
       Boerum Park     364 WARREN STREET        11201 Brooklyn 0.55 miles
 Oxport Playground 34 NORTH PORTLAND AVE        11205 Brooklyn 0.87 miles
  Fort Greene Park   100 WASHINGTON PARK 11201, 11205 Brooklyn 0.93 miles
Edmonds Playground    319 CARLTON AVENUE        11205 Brooklyn 1.10 miles
