# Strategic Store Placement – Feature Engineering Notebook

### This notebook generates the enhanced dataset used for K-Means clustering and recommendation scoring.

## 1. Load Dataset

In [7]:
import pandas as pd, os
import chardet

# Location of the raw business export. Defined once so that encoding detection
# and the actual load always read the same file.
raw_csv_path = '../backend/rawbusinessdata.csv'

# Detect the encoding from the raw bytes. The context manager guarantees the
# file handle is closed even if the read fails (the bare open(...).read() form
# leaves the handle open until garbage collection).
with open(raw_csv_path, 'rb') as raw_file:
    rawdata = raw_file.read()
result = chardet.detect(rawdata)
print(result)

# Load the CSV using the detected encoding and preview the first rows.
df = pd.read_csv(raw_csv_path, encoding=result['encoding'])
df.head()


{'encoding': 'utf-8', 'confidence': 0.938125, 'language': ''}


Unnamed: 0,business_id,business_name,general_category,latitude,longitude,street,zone_type,status
0,1,SANTA MARIA 888 Hardware & CONSTRUCTION SUPPLIES,Merchandise / Trading,14.83326,120.95468,Gulod St.,Commercial,Active
1,2,VK Food & Beverages,Food & Beverages,14.83335,120.95478,Gulod St.,Commercial,Active
2,3,Reymalyn Loading Station,Retail,14.83373,120.95497,Gulod St.,Commercial,Active
3,4,Nanay Mercy & KMS Carwash,Services,14.8336,120.95501,Gulod St.,Commercial,Active
4,5,BIGOTE'S MAMIHAN,Restaurant,14.83397,120.95488,Gulod St.,Commercial,Active


## 2. Inspect Structure

In [8]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   business_id       100 non-null    int64  
 1   business_name     100 non-null    object 
 2   general_category  100 non-null    object 
 3   latitude          100 non-null    float64
 4   longitude         100 non-null    float64
 5   street            100 non-null    object 
 6   zone_type         100 non-null    object 
 7   status            100 non-null    object 
dtypes: float64(2), int64(1), object(5)
memory usage: 6.4+ KB


## 3. Build BallTree for Geographic Distance
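The tree uses the haversine metric, which measures great-circle distance between latitude/longitude points. It expects coordinates in radians, ordered as (latitude, longitude).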

In [9]:
import numpy as np
from sklearn.neighbors import BallTree

# The haversine metric works on angles, so convert the (latitude, longitude)
# pairs from degrees to radians before indexing them.
lat_lon_degrees = df[['latitude', 'longitude']].to_numpy()
coords = np.deg2rad(lat_lon_degrees)

# Spatial index over every business, used for the radius queries that follow.
tree = BallTree(coords, metric='haversine')
coords[:5]

array([[0.25888923, 2.11105741],
       [0.2588908 , 2.11105916],
       [0.25889743, 2.11106247],
       [0.25889516, 2.11106317],
       [0.25890162, 2.1110609 ]])

## 4. Define Multi-Radius Windows

In [10]:
# Earth radius in meters; dividing a ground distance by it yields the
# corresponding angle in radians.
earth_radius_m = 6371000

# Search windows in meters, keyed by the label used in the feature column names.
radii_meters = dict([("50m", 50), ("100m", 100), ("200m", 200)])

# The same windows expressed as angles (radians).
radii_radians = dict(
    (label, meters / earth_radius_m) for label, meters in radii_meters.items()
)
radii_radians

{'50m': 7.848061528802386e-06,
 '100m': 1.5696123057604773e-05,
 '200m': 3.1392246115209545e-05}

## 5. Compute Multi-Radius Business & Competitor Densities

In [12]:
# Category of every business as a plain array. The tree returns row positions
# (not index labels), so all lookups below are done positionally; this keeps the
# result correct even if `df` does not carry a default 0..n-1 index.
categories = df['general_category'].to_numpy()

for label, rad in radii_radians.items():
    # One batched query per radius: neighbours[i] holds the row positions of
    # every business within `rad` of business i, including business i itself.
    neighbours = tree.query_radius(coords, r=rad)

    # Number of other businesses in the window (subtract 1 for the business itself).
    df[f'business_density_{label}'] = [len(idx) - 1 for idx in neighbours]

    # Number of other businesses in the window sharing this business's category
    # (again subtracting 1 for the business itself).
    df[f'competitor_density_{label}'] = [
        int((categories[idx] == categories[i]).sum()) - 1
        for i, idx in enumerate(neighbours)
    ]

df.head()

Unnamed: 0,business_id,business_name,general_category,latitude,longitude,street,zone_type,status,business_density_50m,competitor_density_50m,business_density_100m,competitor_density_100m,business_density_200m,competitor_density_200m
0,1,SANTA MARIA 888 Hardware & CONSTRUCTION SUPPLIES,Merchandise / Trading,14.83326,120.95468,Gulod St.,Commercial,Active,2,0,5,0,9,0
1,2,VK Food & Beverages,Food & Beverages,14.83335,120.95478,Gulod St.,Commercial,Active,4,0,5,0,9,0
2,3,Reymalyn Loading Station,Retail,14.83373,120.95497,Gulod St.,Commercial,Active,4,1,5,1,12,4
3,4,Nanay Mercy & KMS Carwash,Services,14.8336,120.95501,Gulod St.,Commercial,Active,4,0,5,0,12,1
4,5,BIGOTE'S MAMIHAN,Restaurant,14.83397,120.95488,Gulod St.,Commercial,Active,2,0,9,1,13,1


## 6. Encode Zone Type

In [13]:
# Convert the zone labels to a categorical and store its integer codes as the
# encoded feature.
zone_categories = df['zone_type'].astype('category')
df['zone_encoded'] = zone_categories.cat.codes

# Show the original label next to its code for a quick visual check.
df[['zone_type', 'zone_encoded']].head()

Unnamed: 0,zone_type,zone_encoded
0,Commercial,0
1,Commercial,0
2,Commercial,0
3,Commercial,0
4,Commercial,0


## 7. Save Enhanced Dataset


In [14]:
# Write the feature-enriched frame to the backend folder, omitting the index column.
enhanced_csv_path = '../backend/enhanced_businessdata.csv'
df.to_csv(enhanced_csv_path, index=False)
'enhanced_businessdata.csv saved.'

'enhanced_businessdata.csv saved.'