Predict median house values in Californian districts, given a number of features from these districts.

In [20]:
import pandas as pd

# Load the raw California housing data. Forward slashes keep this relative
# path valid on Windows, Linux and macOS alike.
housing = pd.read_csv("../datasets/housing/housing.csv")

In [21]:
# Preview the first five rows to see the columns and some typical values
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [22]:
# Column dtypes and non-null counts (shows which columns have missing values)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [23]:
# Number of rows in each category of the ocean_proximity column
housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


Use Nominatim reverse geocoding to fetch the county/district name for a given latitude/longitude.

In [25]:
import requests
import time
import pickle
import os

# Load the geocoding cache if it exists.
# Forward slashes only: a backslash before "m" is an invalid escape sequence
# in a normal string literal, and "/" is accepted as a separator on every OS.
cache_file = "../model_assets/district_cache.pkl"
if os.path.exists(cache_file):
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # a cache file that this notebook wrote itself.
    with open(cache_file, "rb") as f:
        cache = pickle.load(f)
else:
    cache = {}

# Function to call Nominatim reverse geocoding API
def get_district(lat, lon, timeout=10):
    """Return the county/district name for a coordinate via Nominatim reverse geocoding.

    Results are memoised in the module-level ``cache`` dict, keyed by the
    coordinate rounded to two decimals, and written to ``cache_file`` after
    every resolved lookup so an interrupted run can resume where it stopped.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point to look up.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).

    Returns
    -------
    str or None
        The ``county`` (or, failing that, the ``state_district``) in the
        Nominatim address; ``None`` when the address has neither or when the
        request failed.

    Notes
    -----
    Failed requests (network error, timeout, non-200 status, unparsable body)
    are reported with ``print`` and are NOT cached, so a later call retries
    them instead of returning a permanently stored ``None``.
    """
    # Coordinates are rounded to two decimals; all coords in the raw dataset
    # have at most two decimals, so this only normalises the cache key.
    key = (round(lat, 2), round(lon, 2))

    if key in cache:
        return cache[key]

    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        "lat": lat,
        "lon": lon,
        "format": "json",
        "zoom": 10,  # 10 usually returns district/county level. Lower = broader (e.g., country), higher = more specific (e.g., house).
        "addressdetails": 1,  # 1 returns a dictionary of address components (like city, state, district).
    }
    headers = {
        "User-Agent": "darshilvekaria11@gmail.com"  # Required per Nominatim policy
    }

    district = None
    resolved = False  # True only when the service actually answered the query
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        if response.status_code == 200:
            address = response.json().get("address", {})
            district = address.get("county") or address.get("state_district")
            resolved = True
        else:
            print(f"Reverse geocoding for {key} returned HTTP {response.status_code}")
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        print(f"Reverse geocoding for {key} failed: {exc}")

    if resolved:
        cache[key] = district

        # Save the updated cache: write to a temporary file and swap it in, so
        # an interruption mid-write cannot leave a truncated cache file behind.
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        tmp_file = f"{cache_file}.tmp"
        with open(tmp_file, "wb") as f:
            pickle.dump(cache, f)
        os.replace(tmp_file, cache_file)

    time.sleep(1.0)  # Nominatim policy: 1 request/sec max (also after a failed attempt)
    return district

# Add a new column with the district name of every row.
# The counter comes from enumerate rather than the index label, so progress
# is reported correctly even if `housing` does not have a default RangeIndex.
n_rows = len(housing)
districts = []
for row_num, (lat, lon) in enumerate(zip(housing["latitude"], housing["longitude"]), start=1):
    district = get_district(lat, lon)
    districts.append(district)

    # Print progress every 100 rows (and on the final row)
    if row_num % 100 == 0 or row_num == n_rows:
        print(f"[{row_num}/{n_rows}] Coordinates: ({lat}, {lon}) -> District: {district}")


housing["district"] = districts

# Rows without a district are left out of the export; report how many so the
# loss is visible instead of silent.
has_district = housing["district"].notna()
print(f"Excluding {int((~has_district).sum())} of {n_rows} rows without a district from the export")

# Save the updated CSV (forward slashes work on Windows, Linux and macOS)
housing.loc[has_district].to_csv("../datasets/housing/housing_with_districts.csv", index=False)

[100/20640] Coordinates: (37.82, -122.26) -> District: Alameda County
[200/20640] Coordinates: (32.8, -116.94) -> District: San Diego County
[300/20640] Coordinates: (33.63, -117.92) -> District: Orange County
[400/20640] Coordinates: (37.38, -122.05) -> District: Santa Clara County
[500/20640] Coordinates: (37.56, -122.31) -> District: San Mateo County
[600/20640] Coordinates: (36.91, -121.76) -> District: Santa Cruz County
[700/20640] Coordinates: (33.97, -118.16) -> District: Los Angeles County
[800/20640] Coordinates: (34.05, -117.27) -> District: San Bernardino County
[900/20640] Coordinates: (34.07, -118.45) -> District: Los Angeles County
[1000/20640] Coordinates: (34.15, -118.13) -> District: Los Angeles County
[1100/20640] Coordinates: (37.63, -122.08) -> District: Alameda County
[1200/20640] Coordinates: (40.57, -122.4) -> District: Shasta County
[1300/20640] Coordinates: (33.73, -117.93) -> District: Orange County
[1400/20640] Coordinates: (33.91, -118.17) -> District: Los A

Alternative: get city-level data through reverse geocoding. Not used, because the raw data is prepared at the district level.

In [26]:
# import requests
# import time
# import pickle
# import os

# # Load cache if it exists
# cache_file = "district_cache.pkl"
# if os.path.exists(cache_file):
#     with open(cache_file, "rb") as f:
#         cache = pickle.load(f)
# else:
#     cache = {}

# # Function to call Nominatim reverse geocoding API
# def get_district(lat, lon):
#     key = (round(lat, 2), round(lon, 2))  # Reduce precision to group nearby coords # all records are with 2 decimals

#     if key in cache:
#         return cache[key]

#     zoom_levels = [10, 11, 12]  # Try in order, # 10 usually returns a city, district/county level. Lower = broader (e.g., country), higher = more specific (e.g., house).

#     for zoom in zoom_levels:
#         url = "https://nominatim.openstreetmap.org/reverse"
#         params = {
#             "lat": lat,
#             "lon": lon,
#             "format": "json",
#             "zoom": zoom, 
#             "addressdetails": 1 # 1 returns a dictionary of address components (like city, state, district).
#         }
#         headers = {
#             "User-Agent": "darshilvekaria11@gmail.com"  # Required per Nominatim policy
#         }

#         try:
#             response = requests.get(url, params=params, headers=headers)
#             if response.status_code == 200:
#                 data = response.json()
#                 # print(data)
#                 district = data.get("address", {}).get("city")
#                 if district:
#                     cache[key] = district
#                     # Save updated cache
#                     with open(cache_file, "wb") as f:
#                         pickle.dump(cache, f)
#                     time.sleep(0.5)  # Respect Nominatim's 1 request/sec policy
#                     return district
#         except Exception as e:
#             print(f"Error during request: {e}")
#         time.sleep(0.5)  # Always sleep between retries to respect rate limit

#     return None  # If all zoom levels fail

# # housing_null = housing.loc[housing['district'].isna()]
# # print(housing_null)

# # Add a new column with district name
# cities = []
# for idx, row in housing.iterrows():
#     city = get_district(row["latitude"], row["longitude"])
#     cities.append(city)

#     # Print progress every 100 rows
#     # if (idx + 1) % 100 == 0 or (idx + 1) == len(housing):
#     print(f"[{idx + 1}/{len(housing)}] Coordinates: ({row['latitude']}, {row['longitude']}) -> city: {city}")

# housing["city"] = cities

# # Save the updated CSV
# housing.to_csv("housing_with_cities.csv", index=False)