In [9]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Load the condo listings from the CSV export.
df = pd.read_csv('datasets/condo - Sheet3 (1).csv')

# Columns carried through the analysis.
features = [
    'Developer',
    'Building',
    'Unit Type',
    'Size',
    'Location',
    'Price',  # Assumed to be needed for the price-based anomaly check
]

# Keep only rows where every selected column is present.
# The explicit .copy() makes df_cleaned an independent DataFrame, so columns
# assigned to it later do not trigger pandas' SettingWithCopyWarning (the
# result of df[features].dropna() can still be tracked as derived from df).
df_cleaned = df[features].dropna().copy()

# Column groups routed to each preprocessing branch.
categorical_columns = ['Developer', 'Building', 'Unit Type', 'Location']
numeric_columns = ['Size']  # 'Size' is assumed to be numeric

# One-hot encode the categorical columns and standardize the numeric one.
# Columns not listed here (e.g. 'Price') are dropped by ColumnTransformer's
# default remainder='drop', so they do not influence the clustering.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns),
        ('num', StandardScaler(), numeric_columns),
    ]
)

# Chain preprocessing and K-Means (3 clusters, fixed seed) into one estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('kmeans', KMeans(n_clusters=3, random_state=42)),
    ]
)

# Fit the pipeline and get a cluster label for every row.
cluster_labels = pipeline.fit_predict(df_cleaned)

# Compute each row's distance to every cluster center.
# The fitted steps are read from the pipeline itself rather than from the
# module-level `preprocessor` variable: that variable is only guaranteed to be
# the fitted object when the Pipeline does not clone its steps (i.e. when
# `memory` is unset). This is done before adding result columns so the frame
# passed to transform has the same columns as the one used for fitting.
X_scaled = pipeline.named_steps['preprocessor'].transform(df_cleaned)  # Encoded/scaled features
distances = pipeline.named_steps['kmeans'].transform(X_scaled)  # One column per cluster center

# Store the results; the smallest distance is the distance to the assigned center.
df_cleaned['cluster'] = cluster_labels
df_cleaned['distance_to_center'] = np.min(distances, axis=1)

# Flag listings that are both unusually far from their cluster center and
# priced below the overall mean price.
threshold_distance = np.percentile(df_cleaned['distance_to_center'], 95)  # 95th percentile cutoff
is_far_from_center = df_cleaned['distance_to_center'] > threshold_distance
is_below_mean_price = df_cleaned['Price'] < df_cleaned['Price'].mean()
df_cleaned['too_good_to_be_true'] = is_far_from_center & is_below_mean_price

# Show only the flagged rows.
flagged = df_cleaned[df_cleaned['too_good_to_be_true']]
print(flagged)


  Developer          Building Unit Type  Size        Location      Price  \
4      SMDC  Bloom Residences       1BR    39  Paranaque City  6700000.0   

   cluster  distance_to_center  too_good_to_be_true  
4        1            1.730226                 True  
