Explanation: To run the car ownership prediction, the user provides their address, which is first geocoded so we can extract all required proximity indicators. The user also enters their household size, number of driving licences, and income. The income value is automatically mapped to the corresponding revenue category inside the function. Using these inputs, the preprocessing pipeline prepares the data and the trained XGBoost model predicts the expected number of cars ( 0 to 4+).

# Complete code:

In [24]:

# 0. IMPORTS


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import joblib




# 1. LOAD & CLEAN DATA

file_path = "/Users/bitafarokhian/Downloads/od18_20251112.csv"
od = pd.read_csv(file_path)

# --- IMPORTANT PREPROCESSING ---

# Clip autologi to max 4
od["autologi"] = od["autologi"].clip(upper=4).astype(int)

# Convert revenu to string categorical
od["revenu"] = od["revenu"].astype(str)

# Replace StatCan missing strings
od = od.replace("..", 0).replace("...", 0)

# Convert proximity indicators
prox_cols = [c for c in od.columns if c.startswith("idx_prox")]
od[prox_cols] = od[prox_cols].astype(float)




# 2. DEFINE FEATURES


target = "autologi"

numeric_features = [
    "perslogi",
    "nbPermis",
    "idx_prox_emp",
    "idx_prox_pharma",
    "idx_prox_garderie",
    "idx_prox_sante",
    "idx_prox_epicerie",
    "idx_prox_educpri",
    "idx_prox_educsec",
    "idx_prox_bibl",
    "idx_prox_parcs",
    "idx_prox_transit",
]

categorical_features = ["revenu"]



# Remove rows with missing fields
od_clean = od.dropna(subset=[target] + numeric_features + categorical_features)
print("Rows kept:", od_clean.shape)

X = od_clean[numeric_features + categorical_features]
y = od_clean[target].astype(int)




# 3. PREPROCESSING PIPELINE


preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)




# 4. TRAIN/TEST SPLIT


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)




# 5. BASE MODEL (XGBoost Classifier Tunned)

xgb_clf = XGBClassifier(
    objective="multi:softprob",
    num_class=5,
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist",
    eval_metric="merror",
)

xgb_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", xgb_clf),
])



#6 TEST AND EVALUATION
print("\nTraining tuned XGBoostClassifier pipeline...")
xgb_pipeline.fit(X_train, y_train)

y_pred = xgb_pipeline.predict(X_test)

print("\n=== Tuned XGBoostClassifier Performance (hold-out test) ===")
#print("Accuracy:", accuracy_score(y_test, y_pred))
#print(classification_report(y_test, y_pred))



# 7. RETRAIN ON FULL DATA FOR DEPLOYMENT


final_xgb_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", xgb_clf),
])

print("\nTraining final_xgb_pipeline on FULL dataset...")
final_xgb_pipeline.fit(X, y)





# 8. SAVE MODEL

joblib.dump(final_xgb_pipeline, "xgb_car_ownership_model.pkl")
print("\nModel saved as: xgb_car_ownership_model.pkl")



# 9. DEPLOYMENT FUNCTION

def predict_cars(
    perslogi,
    nbPermis,
    revenu_str,
    idx_prox_emp,
    idx_prox_pharma,
    idx_prox_garderie,
    idx_prox_sante,
    idx_prox_epicerie,
    idx_prox_educpri,
    idx_prox_educsec,
    idx_prox_bibl,
    idx_prox_parcs,
    idx_prox_transit,
    model_path="xgb_car_ownership_model.pkl"
):
    """Predicts number of cars (0–4) from user inputs."""

    # Load trained model
    model = joblib.load(model_path)

    # Income category mapping (same structure as training data)
    income_mapping = {
        "Less than $10,000": 1,
        "$10,000 to $19,999": 1,
        "$20,000 to $29,999": 1,
        "$30,000 to $39,999": 2,
        "$40,000 to $49,999": 2,
        "$50,000 to $59,999": 2,
        "$60,000 to $69,999": 3,
        "$70,000 to $79,999": 3,
        "$80,000 to $89,999": 3,
        "$90,000 to $99,999": 4,
        "$100,000 to $119,999":4,
        "$120,000 to $149,999": 5,
        "$150,000 to $179,999": 6,
        "$180,000 to $209,999":7,
        "$210,000 and more": 8,
        "I don't know": 10,
        "I prefer not to answer": 9,
    }

    revenu_code = income_mapping.get(revenu_str, 10)

    # Build input row
    row = {
        "perslogi": perslogi,
        "nbPermis": nbPermis,
        "idx_prox_emp": idx_prox_emp,
        "idx_prox_pharma": idx_prox_pharma,
        "idx_prox_garderie": idx_prox_garderie,
        "idx_prox_sante": idx_prox_sante,
        "idx_prox_epicerie": idx_prox_epicerie,
        "idx_prox_educpri": idx_prox_educpri,
        "idx_prox_educsec": idx_prox_educsec,
        "idx_prox_bibl": idx_prox_bibl,
        "idx_prox_parcs": idx_prox_parcs,
        "idx_prox_transit": idx_prox_transit,
        "revenu": str(revenu_code)
    }

    df = pd.DataFrame([row])

    # Predict
    pred = model.predict(df)[0]
    return int(pred)



# 10. EXAMPLE PREDICTION

print("\nExample prediction:")
example = predict_cars(
    perslogi=3,
    nbPermis=2,
    revenu_str="$40,000 to $49,999",
    idx_prox_emp=0.3,
    idx_prox_pharma=0.1,
    idx_prox_garderie=0.2,
    idx_prox_sante=0.4,
    idx_prox_epicerie=0.5,
    idx_prox_educpri=0.1,
    idx_prox_educsec=0.2,
    idx_prox_bibl=0.3,
    idx_prox_parcs=0.1,
    idx_prox_transit=0.7
)

print("Predicted cars:", example)


  od = pd.read_csv(file_path)


Rows kept: (75260, 38)

Training tuned XGBoostClassifier pipeline...

=== Tuned XGBoostClassifier Performance (hold-out test) ===

Training final_xgb_pipeline on FULL dataset...

Model saved as: xgb_car_ownership_model.pkl

Example prediction:
Predicted cars: 1


# Part by part details if needed

## 1- Loading data

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier
import joblib   


In [9]:

file_path = "/Users/bitafarokhian/Downloads/od18_20251112.csv"
od = pd.read_csv(file_path, encoding="utf-8")

print("Raw shape:", od.shape)
print(od.head())

Raw shape: (75266, 38)
   feuillet  revenu  autologi  perslogi  nbPermis  TypeMenage         IDIDU  \
0    101006       1         0         2         0           5  2.466045e+10   
1    101021       5         1         2         2           3  2.466136e+10   
2    101037       1         1         1         1           5  2.466084e+10   
3    101049       1         0         1         0           1  2.466151e+10   
4    101057       3         3         3         3           8  2.466284e+10   

   IDPOP       ADIDU  ADPOP  ...    idx_prox_educpri  en_id_educsec  \
0  202.0  24660452.0  476.0  ...  0.2420225519323052            0.0   
1  196.0  24661361.0  663.0  ...  0.3434350527203367            0.0   
2  523.0  24660835.0  737.0  ...  0.2674946572777115            1.0   
3  329.0  24661514.0  461.0  ...  0.2212670061593004            0.0   
4  157.0  24662843.0  512.0  ...  0.0639849746970133            0.0   

     idx_prox_educsec  en_id_bibl       idx_prox_bibl  en_id_parcs  \
0  0.

  od = pd.read_csv(file_path, encoding="utf-8")


## 2- Data Cleaning

In [11]:
target = "autologi"

numeric_features = [
    "perslogi",
    "nbPermis",
    "idx_prox_emp",
    "idx_prox_pharma",
    "idx_prox_garderie",
    "idx_prox_sante",
    "idx_prox_epicerie",
    "idx_prox_educpri",
    "idx_prox_educsec",
    "idx_prox_bibl",
    "idx_prox_parcs",
    "idx_prox_transit",
]

categorical_features = ["revenu"]   # only income class in this model for now! We will add the typology of people later

# 2.1 convert some columns
od["revenu"] = od["revenu"].astype(str)

# clip car ownership at 4+ cars
od["autologi"] = od["autologi"].clip(upper=4).astype(int)

# StatCan missing values sometimes coded as ".." or "..."
od = od.replace("..", 0).replace("...", 0)

# proximity indices numeric
prox_cols = [c for c in od.columns if c.startswith("idx_prox")]
od[prox_cols] = od[prox_cols].astype(float)

# 2.2 drop rows that are still incomplete
od_clean = od.dropna(subset=[target] + numeric_features + categorical_features)
print("Clean shape:", od_clean.shape)





Clean shape: (75260, 38)


## 3- Model

In [22]:

# 3. SPLIT DATA


X = od_clean[numeric_features + categorical_features]
y = od_clean[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)



# 4. PREPROCESSING PIPELINE (OHE for revenue)


preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)



# 5. TUNED XGBOOST CLASSIFIER (done before - from GridSearch)


xgb_clf = XGBClassifier(
    objective="multi:softprob",
    num_class=5,            # 0,1,2,3,+4 cars
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist",
    eval_metric="merror",
)

# full pipeline = preprocessing + model
xgb_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", xgb_clf),
])



# 6. TRAIN + TEST EVALUATION


print("\nTraining tuned XGBoostClassifier pipeline...")
xgb_pipeline.fit(X_train, y_train)

y_pred = xgb_pipeline.predict(X_test)

print("\n=== Tuned XGBoostClassifier Performance (hold-out test) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



# 7. RETRAIN ON FULL DATA FOR DEPLOYMENT


final_xgb_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", xgb_clf),
])

print("\nTraining final_xgb_pipeline on FULL dataset...")
final_xgb_pipeline.fit(X, y)


Training tuned XGBoostClassifier pipeline...

=== Tuned XGBoostClassifier Performance (hold-out test) ===
Accuracy: 0.6917884666489503
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      2868
           1       0.71      0.75      0.73      8035
           2       0.65      0.74      0.69      6116
           3       0.49      0.37      0.42      1289
           4       0.59      0.31      0.41       507

    accuracy                           0.69     18815
   macro avg       0.66      0.56      0.60     18815
weighted avg       0.69      0.69      0.69     18815


Training final_xgb_pipeline on FULL dataset...


## Final Function

In [15]:
def predict_cars(
    perslogi,
    nbPermis,
    revenu_str,
    idx_prox_emp,
    idx_prox_pharma,
    idx_prox_garderie,
    idx_prox_sante,
    idx_prox_epicerie,
    idx_prox_educpri,
    idx_prox_educsec,
    idx_prox_bibl,
    idx_prox_parcs,
    idx_prox_transit,
    model
):
    """
    Predicts number of cars using the trained XGBoost classifier pipeline.

    Inputs must be provided exactly as the tool collects them.
    'model' is the final_xgb_pipeline loaded or passed by caller.
    """

  
    # 1. Map revenue string → category
   
    income_mapping = {
        "Less than $10,000": 1,
        "$10,000 to $19,999": 1,
        "$20,000 to $29,999": 1,
        "$30,000 to $39,999": 2,
        "$40,000 to $49,999": 2,
        "$50,000 to $59,999": 2,
        "$60,000 to $69,999": 3,
        "$70,000 to $79,999": 3,
        "$80,000 to $89,999": 3,
        "$90,000 to $99,999": 4,
        "$100,000 to $119,999":4,
        "$120,000 to $149,999": 5,
        "$150,000 to $179,999": 6,
        "$180,000 to $209,999":7,
        "$210,000 and more": 8,
        "I don't know": 10,
        "I prefer not to answer": 9,
    }

    revenu_code = income_mapping.get(revenu_str, 10)

   
    # 2. Build input row for pipeline
    
    row = {
        "perslogi": perslogi,
        "nbPermis": nbPermis,
        "idx_prox_emp": idx_prox_emp,
        "idx_prox_pharma": idx_prox_pharma,
        "idx_prox_garderie": idx_prox_garderie,
        "idx_prox_sante": idx_prox_sante,
        "idx_prox_epicerie": idx_prox_epicerie,
        "idx_prox_educpri": idx_prox_educpri,
        "idx_prox_educsec": idx_prox_educsec,
        "idx_prox_bibl": idx_prox_bibl,
        "idx_prox_parcs": idx_prox_parcs,
        "idx_prox_transit": idx_prox_transit,
        "revenu": str(revenu_code)
    }

    df = pd.DataFrame([row])

    # -------------------------------
    # 3. Predict using trained model
    # -------------------------------
    pred = model.predict(df)[0]

    # Return integer 0–4
    return int(pred)

## example

In [20]:
cars = predict_cars(
    perslogi = 3,
    nbPermis = 2,
    revenu_str = "$50,000 to $59,999",
    idx_prox_emp = 0.3,
    idx_prox_pharma = 0.1,
    idx_prox_garderie = 0.4,
    idx_prox_sante = 0.2,
    idx_prox_epicerie = 0.5,
    idx_prox_educpri = 0.1,
    idx_prox_educsec = 0.2,
    idx_prox_bibl = 0.3,
    idx_prox_parcs = 0.4,
    idx_prox_transit = 0.8,
    model = final_xgb_pipeline
)

print("Predicted cars:", cars)


Predicted cars: 1


## Example in map - Just for test

example:

In [62]:
pip install folium


Note: you may need to restart the kernel to use updated packages.


In [66]:
pip install contextily


Note: you may need to restart the kernel to use updated packages.


In [210]:
# Compute accessibility score
od_clean["access_score"] = (
    od_clean["idx_prox_emp"] +
    od_clean["idx_prox_pharma"] +
    od_clean["idx_prox_garderie"] +
    od_clean["idx_prox_sante"] +
    od_clean["idx_prox_epicerie"] +
    od_clean["idx_prox_educpri"] +
    od_clean["idx_prox_educsec"] +
    od_clean["idx_prox_bibl"] +
    od_clean["idx_prox_parcs"] +
    od_clean["idx_prox_transit"]
)

# Keep only unique coordinate points
unique_locs = od_clean.drop_duplicates(subset=["id_lat", "id_lon"])

# Top 10 unique most accessible points
top10 = unique_locs.nlargest(10, "access_score")[["id_lon", "id_lat", "access_score"]]

# Bottom 10 unique least accessible points
bottom10 = unique_locs.nsmallest(10, "access_score")[["id_lon", "id_lat", "access_score"]]

print("Top 10 high-access points:\n", top10)
print("\nBottom 10 low-access points:\n", bottom10)


Top 10 high-access points:
         id_lon   id_lat  access_score
9557  -73.5659  45.5112      4.103662
2621  -73.5978  45.5188      3.904211
1279  -73.5671  45.5146      3.854320
3541  -73.5671  45.5126      3.838145
4649  -73.5956  45.5221      3.772115
3180  -73.5673  45.5119      3.770197
72063 -73.5964  45.5204      3.757300
1487  -73.5958  45.5211      3.726286
10448 -73.5665  45.5159      3.696825
9037  -73.6059  45.5170      3.638324

Bottom 10 low-access points:
         id_lon   id_lat  access_score
18360 -74.4078  45.8014      0.000018
38992 -74.4956  45.8793      0.000028
69760 -74.3670  45.7539      0.000032
29652 -73.7231  46.1261      0.000041
74339 -72.7554  45.0203      0.000054
19466 -74.3303  45.7966      0.000059
36336 -74.4847  45.8610      0.000065
25864 -74.3287  45.7795      0.000066
29641 -74.3365  45.7551      0.000073
60406 -74.2521  45.8045      0.000075


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od_clean["access_score"] = (


choosing 3 locations

In [214]:
# Compute accessibility score
prox_cols = [
    "idx_prox_emp", "idx_prox_pharma", "idx_prox_garderie",
    "idx_prox_sante", "idx_prox_epicerie",
    "idx_prox_educpri", "idx_prox_educsec",
    "idx_prox_bibl", "idx_prox_parcs", "idx_prox_transit"
]

od_clean["access_score"] = od_clean[prox_cols].sum(axis=1)

# Locations
high_loc  = od_clean.nlargest(3, "access_score").iloc[0]
medium_loc = od_clean.iloc[len(od_clean)//2]       # median access
low_loc   = od_clean.nsmallest(1, "access_score").iloc[0]
low5_loc  = od_clean.nsmallest(5, "access_score").iloc[-1]   # 5th lowest


def extract_loc(row):
    return {
        "idx_prox_emp":       row["idx_prox_emp"],
        "idx_prox_pharma":    row["idx_prox_pharma"],
        "idx_prox_garderie":  row["idx_prox_garderie"],
        "idx_prox_sante":     row["idx_prox_sante"],
        "idx_prox_epicerie":  row["idx_prox_epicerie"],
        "idx_prox_educpri":   row["idx_prox_educpri"],
        "idx_prox_educsec":   row["idx_prox_educsec"],
        "idx_prox_bibl":      row["idx_prox_bibl"],
        "idx_prox_parcs":     row["idx_prox_parcs"],
        "idx_prox_transit":   row["idx_prox_transit"],
        "lat":                row["id_lat"],
        "lon":                row["id_lon"],
    }

# Extract feature dictionaries
HIGH   = extract_loc(high_loc)
MEDIUM = extract_loc(medium_loc)
LOW    = extract_loc(low_loc)
LOW5   = extract_loc(low5_loc)

print("HIGH:\n", HIGH)
print("\nMEDIUM:\n", MEDIUM)
print("\nLOW:\n", LOW)
print("\n5th LOWEST:\n", LOW5)


HIGH:
 {'idx_prox_emp': 0.504009515589194, 'idx_prox_pharma': 0.3699210066793408, 'idx_prox_garderie': 0.390694157820645, 'idx_prox_sante': 0.2794194220677586, 'idx_prox_epicerie': 0.5302039724131117, 'idx_prox_educpri': 0.2781694522681496, 'idx_prox_educsec': 0.0854360777722558, 'idx_prox_bibl': 0.3022820130960015, 'idx_prox_parcs': 0.9533788795411572, 'idx_prox_transit': 0.410147145461452, 'lat': 45.5112, 'lon': -73.5659}

MEDIUM:
 {'idx_prox_emp': 0.0123319618576506, 'idx_prox_pharma': 0.0207722986635519, 'idx_prox_garderie': 0.2171461269252508, 'idx_prox_sante': 0.0084578931636186, 'idx_prox_epicerie': 0.0254951263021986, 'idx_prox_educpri': 0.0619779127950764, 'idx_prox_educsec': 0.0, 'idx_prox_bibl': 0.0, 'idx_prox_parcs': 0.0115509409692146, 'idx_prox_transit': 0.0124256728063763, 'lat': 45.7742, 'lon': -73.4252}

LOW:
 {'idx_prox_emp': 1.8401377006240783e-05, 'idx_prox_pharma': 0.0, 'idx_prox_garderie': 0.0, 'idx_prox_sante': 0.0, 'idx_prox_epicerie': 0.0, 'idx_prox_educpri': 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od_clean["access_score"] = od_clean[prox_cols].sum(axis=1)


Predicting car ownership

In [200]:
def drop_lat_lon(loc_dict):
    """Remove latitude and longitude keys before prediction."""
    return {k: v for k, v in loc_dict.items() if k not in ["lat", "lon"]}

# create clean feature dicts
HIGH_FEATS = drop_lat_lon(HIGH)
MEDIUM_FEATS = drop_lat_lon(MEDIUM)
LOW_FEATS = drop_lat_lon(LOW5)

# example—HIGH accessibility
pred_high = predict_cars(
    perslogi=3,
    nbPermis=2,
    revenu_str="$60,000 to $69,999",
    **HIGH_FEATS
)

pred_medium = predict_cars(
    perslogi=3,
    nbPermis=2,
    revenu_str="$60,000 to $69,999",
    **MEDIUM_FEATS
)

pred_low = predict_cars(
    perslogi=3,
    nbPermis=2,
    revenu_str="$60,000 to $69,999",
    **LOW_FEATS
)

In [202]:
pred_low

2

In [204]:
pred_medium

2

In [206]:
pred_high

1

In [240]:
import folium


# 1. Coordinates of the three selected accessibility levels
#

# HIGH accessibility (Plateau / Rosemont sample)
high_lat = 45.5112
high_lon = -73.5659

# MEDIUM accessibility (Côte-Saint-Luc / Saint-Laurent sample)
medium_lat = 45.4643
medium_lon = -73.6434

# LOW accessibility (Saint-Hubert / Longueuil outskirts)
low_lat = 45.4983
low_lon = -73.9614



# 2. Creating a map centered between all three locations


center_lat = (high_lat + medium_lat + low_lat) / 3
center_lon = (high_lon + medium_lon + low_lon) / 3

m = folium.Map(location=[center_lat, center_lon], zoom_start=11)



# 3. Adding markers for each location


# HIGH accessibility
folium.Marker(
    [high_lat, high_lon],
    popup="High Accessibility Area (Plateau / Rosemont)<br>Lowest car ownership expected",
    tooltip="HIGH accessibility",
    icon=folium.Icon(color="green", icon="car", prefix="fa")
).add_to(m)

# MEDIUM accessibility
folium.Marker(
    [medium_lat, medium_lon],
    popup="Medium Accessibility Area (Côte-Saint-Luc)",
    tooltip="MEDIUM accessibility",
    icon=folium.Icon(color="orange", icon="car", prefix="fa")
).add_to(m)

# LOW accessibility
folium.Marker(
    [low_lat, low_lon],
    popup="Low Accessibility Area (Saint-Hubert / Longueuil)<br>Highest car ownership expected",
    tooltip="LOW accessibility",
    icon=folium.Icon(color="red", icon="car", prefix="fa")
).add_to(m)



# 4. Display map

legend_html = """
<div style="
    position: fixed; 
    bottom: 30px; left: 30px; width: 250px; height: 140px; 
    background-color: white; 
    border:2px solid grey; 
    z-index:9999; 
    font-size:12px;
    padding: 10px;
">
<b>Legend – Predicted Cars</b><br>
<i class="fa fa-car" style="color:green"></i> &nbsp; 1 car (High access – OD example)<br>
<i class="fa fa-car" style="color:orange"></i> &nbsp; 1 car (Medium–high access)<br>
<i class="fa fa-car" style="color:red"></i> &nbsp; 2 cars (Low access)<br>
</div>
"""

m.get_root().html.add_child(folium.Element(legend_html))



# 5. Display the final map

m


In [242]:
# Target location (LOW accessibility example)
target_lat = 45.4983
target_lon = -73.9614

# Compute distance to all OD points (Euclidean distance in lat/lon space)
od_clean["dist"] = (
    (od_clean["id_lat"] - target_lat)**2 +
    (od_clean["id_lon"] - target_lon)**2
).pow(0.5)

# Find the closest row
closest_row = od_clean.nsmallest(1, "dist").iloc[0]

# Extract all proximity features
prox_cols = [
    "idx_prox_emp", "idx_prox_pharma", "idx_prox_garderie",
    "idx_prox_sante", "idx_prox_epicerie",
    "idx_prox_educpri", "idx_prox_educsec",
    "idx_prox_bibl", "idx_prox_parcs", "idx_prox_transit"
]

LOW_FROM_COORD = {col: closest_row[col] for col in prox_cols}

print("\nClosest OD row to (45.4983, -73.9614):")
print(closest_row[["id_lat", "id_lon", "dist"]])

print("\nExtracted proximity values for this point:\n")
print(LOW_FROM_COORD)



Closest OD row to (45.4983, -73.9614):
id_lat    45.4983
id_lon   -73.9614
dist          0.0
Name: 37, dtype: object

Extracted proximity values for this point:

{'idx_prox_emp': 0.0036428658093537, 'idx_prox_pharma': 0.0, 'idx_prox_garderie': 0.0280489186570501, 'idx_prox_sante': 0.0005354213692471, 'idx_prox_epicerie': 0.003507727438329, 'idx_prox_educpri': 0.0494114953485251, 'idx_prox_educsec': 0.0, 'idx_prox_bibl': 0.0, 'idx_prox_parcs': 0.0158169681404668, 'idx_prox_transit': 0.0125818909476578}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od_clean["dist"] = (


In [168]:
import folium


# 1. LOCATION COORDINATES


HIGH_LAT, HIGH_LON = 45.5112, -73.5659
MEDIUM_LAT, MEDIUM_LON = 45.7742, -73.4252
LOW_LAT, LOW_LON = 45.8014, -74.4078


# 2.  PROXIMITY VALUES


HIGH_TRANSIT = 0.410147145461452
MEDIUM_TRANSIT = 0.0124256728063763
LOW_TRANSIT = 0.0


# 3. MODEL PREDICTIONS


HIGH_PRED = 1      
MEDIUM_PRED = 2
LOW_PRED = 2

# -------------------------
# 4. CREATE THE MAP
# -------------------------

m_all = folium.Map(location=[45.56, -73.60], zoom_start=10)


# 5. MARKERS WITH POPUP INFO


folium.Marker(
    [HIGH_LAT, HIGH_LON],
    popup=f"<b>HIGH ACCESSIBILITY</b><br>"
          f"Transit proximity: {HIGH_TRANSIT:.3f}<br>"
          f"Predicted cars: {HIGH_PRED}",
    tooltip="High accessibility",
    icon=folium.Icon(color="green", icon="car", prefix="fa")
).add_to(m_all)

folium.Marker(
    [MEDIUM_LAT, MEDIUM_LON],
    popup=f"<b>MEDIUM ACCESSIBILITY</b><br>"
          f"Transit proximity: {MEDIUM_TRANSIT:.3f}<br>"
          f"Predicted cars: {MEDIUM_PRED}",
    tooltip="Medium accessibility",
    icon=folium.Icon(color="orange", icon="car", prefix="fa")
).add_to(m_all)

folium.Marker(
    [LOW_LAT, LOW_LON],
    popup=f"<b>LOW ACCESSIBILITY</b><br>"
          f"Transit proximity: {LOW_TRANSIT:.3f}<br>"
          f"Predicted cars: {LOW_PRED}",
    tooltip="Low accessibility",
    icon=folium.Icon(color="red", icon="car", prefix="fa")
).add_to(m_all)


# 6. LEGEND


legend_html = """
<div style="
    position: fixed;
    bottom: 40px;
    left: 40px;
    width: 250px;
    height: 160px;
    background-color: white;
    z-index:9999;
    font-size:14px;
    border:2px solid grey;
    padding:10px;
">
<b>Legend</b><br>
<span style="color:red;">&#9679;</span> Low accessibility ( 2 cars)<br>
<span style="color:orange;">&#9679;</span> Medium accessibility ( 2 cars)<br>
<span style="color:green;">&#9679;</span> High accessibility (1 car)<br>
<br>


</div>
"""

m_all.get_root().html.add_child(folium.Element(legend_html))

# Show map
m_all


In [247]:
od_clean

Unnamed: 0,feuillet,revenu,autologi,perslogi,nbPermis,TypeMenage,IDIDU,IDPOP,ADIDU,ADPOP,...,idx_prox_educsec,en_id_bibl,idx_prox_bibl,en_id_parcs,idx_prox_parcs,en_id_transit,idx_prox_transit,quartier_dense_en_commodites,access_score,dist
0,101006,1,0,2,0,5,2.466045e+10,202.0,24660452.0,476.0,...,0.084397,0.0,0.051264,0.0,0.163795,0.0,0.054073,2.0,1.784264,0.342653
1,101021,5,1,2,2,3,2.466136e+10,196.0,24661361.0,663.0,...,0.157704,0.0,0.050923,0.0,0.279924,0.0,0.103367,1.0,2.058414,0.347957
2,101037,1,1,1,1,5,2.466084e+10,523.0,24660835.0,737.0,...,0.234788,0.0,0.000000,0.0,0.040222,1.0,0.022301,0.0,0.966873,0.307309
3,101049,1,0,1,0,1,2.466151e+10,329.0,24661514.0,461.0,...,0.014653,0.0,0.068919,1.0,0.283300,1.0,0.104290,2.0,2.523864,0.352809
4,101057,3,3,3,3,8,2.466284e+10,157.0,24662843.0,512.0,...,0.000000,0.0,0.000000,0.0,0.227733,0.0,0.052679,0.0,0.714100,0.386793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75261,872371,2,1,2,1,3,2.466045e+10,303.0,24660445.0,645.0,...,0.019805,0.0,0.000000,0.0,0.129270,0.0,0.080428,0.0,1.872038,0.338071
75262,872377,4,2,2,2,3,2.474002e+10,502.0,24740020.0,1023.0,...,0.000000,0.0,0.019094,0.0,0.000000,0.0,0.000000,0.0,0.098984,0.141400
75263,872379,2,1,1,1,7,2.466047e+10,288.0,24660471.0,548.0,...,0.114285,0.0,0.000000,0.0,0.149090,0.0,0.091263,0.0,2.121159,0.334704
75264,872380,9,3,3,3,8,2.456015e+10,70.0,24560152.0,453.0,...,0.021100,0.0,0.000000,0.0,0.011583,0.0,0.000000,0.0,0.191317,0.696234
