In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [None]:
# Open TODO notes from the author:
# - make the code consistent
# - identify the areas that show up in the results
# - find photos of those areas
# - interpret the results

path = './'
filename = 'joined_nufus.csv'

# Read the joined dataset and divide `area` by one million
# (presumably m^2 -> km^2 -- TODO confirm the source units).
df = pd.read_csv(path + filename).assign(
    area=lambda frame: frame['area'] / 1000000
)
df.head()

In [None]:
# We know the data contains nulls; they come from AFAD marking the neighbourhood incorrectly.
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Histogram of neighbourhood populations.
# Each distinct (neighbourhood name, population) pair is counted once.
neighbourhood_population = df[['mahalle_adi', 'mahalle_nüfus']].drop_duplicates()
sns.histplot(data=neighbourhood_population, x="mahalle_nüfus", kde=True)

In [None]:
# Summary statistics of the dataset columns (pandas `describe` defaults).
df.describe()

In [None]:
# Stacked bar chart of building counts per district, split by distance band to the EAZ.
# Caveat: population pushes the raw building count up, so these totals still
# need to be normalised before districts can be compared fairly.
column_labels = {
    'ilce_adi': 'District',
    'num_of_buildings_between_0_15': '0_15m',
    'num_of_buildings_between_15_30': '15_30m',
    'num_of_buildings_between_30_50': '30_50m',
}
df_building_counts = df[list(column_labels)].rename(columns=column_labels)

grouped = df_building_counts.groupby('District')[['0_15m', '15_30m', '30_50m']].sum()

# Order the districts by their overall building count, largest first.
# The helper column exists only for sorting and is removed again.
grouped = (
    grouped
    .assign(total=grouped.sum(axis=1))
    .sort_values(by='total', ascending=False)
    .drop(columns='total')
)

# Draw one stacked bar per district.
grouped.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='viridis')

plt.title("Stacked Bar Chart of Building Counts by Proximity to EAZs by District")
plt.xlabel("District")
plt.ylabel("Number of Buildings")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.legend(title="Proximity to EAZ")
plt.show()

In [None]:
# Stacked bar chart of building counts divided by population, per district.
# The y-axis values are not meaningful on their own; the chart is only meant for
# comparing the districts with each other.
# ADALAR is excluded: its low population and small area distorted the picture.
proximity_cols = ['0_15m', '15_30m', '30_50m']

df_building_counts = df[[
    'ilce_adi',
    'mahalle_adi',
    'num_of_buildings_between_0_15',
    'num_of_buildings_between_15_30',
    'num_of_buildings_between_30_50',
    'mahalle_nüfus',
]].rename(columns={
    'num_of_buildings_between_0_15': '0_15m',
    'num_of_buildings_between_15_30': '15_30m',
    'num_of_buildings_between_30_50': '30_50m',
    'mahalle_nüfus': 'neg_pop',
    'ilce_adi': 'District',
})
df_building_counts = df_building_counts[df_building_counts['District'] != 'ADALAR']

building_totals = df_building_counts.groupby('District')[proximity_cols].sum()

# The population value can repeat on several rows of the same neighbourhood, so
# summing the raw column would count that neighbourhood once per row and inflate
# the denominator. Count each (district, neighbourhood, population) combination
# once instead. If every neighbourhood has a single row this is a no-op.
district_population = (
    df_building_counts[['District', 'mahalle_adi', 'neg_pop']]
    .drop_duplicates()
    .groupby('District')['neg_pop']
    .sum()
)
grouped = building_totals.join(district_population)

grouped['normalized_15'] = grouped['0_15m'] / grouped['neg_pop']
grouped['normalized_30'] = grouped['15_30m'] / grouped['neg_pop']
grouped['normalized_50'] = grouped['30_50m'] / grouped['neg_pop']

# Sort by the total normalised count (largest first). `assign` returns a new
# frame, so no column is written into a slice of `grouped` (the original
# assignment could raise SettingWithCopyWarning).
normalized_view = grouped[['normalized_15', 'normalized_30', 'normalized_50']]
grouped_normalized = (
    normalized_view
    .assign(total=normalized_view.sum(axis=1))
    .sort_values(by='total', ascending=False)
    .drop(columns='total')
)

# Plot stacked bar chart
grouped_normalized.plot(
    kind='bar',
    stacked=True,
    figsize=(14, 7),
    colormap='viridis'
)

# Labels describe the normalised quantity, not the raw building count.
plt.title("Building Counts per Capita by Proximity to EAZs by District (ADALAR excluded)")
plt.xlabel("District")
plt.ylabel("Buildings per Resident")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.legend(title="Proximity to EAZ")
plt.show()

In [None]:
# Drop `fid` so it is not used as a feature when the model is fitted, then
# discard every row that still contains a missing value.
# errors='ignore' keeps the cell re-runnable: on a second run `fid` is already
# gone and a plain drop would raise KeyError.
df = (
    df
    .drop(columns=['fid'], errors='ignore')
    .dropna(axis='index', how='any')
)

In [None]:
def elbow_method(df, max_k=15,scaler=StandardScaler):
    """Plot K-Means inertia against the number of clusters (elbow plot).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Every numeric column except one named ``cluster`` is used.
    max_k : int, default 15
        Largest number of clusters to try; k runs from 1 to ``max_k`` inclusive.
    scaler : scaler class or scaler instance, default ``StandardScaler``
        Object providing ``fit_transform``. A class is instantiated with no
        arguments; an instance is used as given (and is fitted by this call).

    Returns
    -------
    None
        The function only draws the plot.
    """
    # The default is the class itself; calling fit_transform on the class fails
    # because no instance is bound, so build one first.
    if isinstance(scaler, type):
        scaler = scaler()

    numeric_cols = [col for col in df.select_dtypes(include='number').columns if col != 'cluster']
    X_scaled = scaler.fit_transform(df[numeric_cols])

    k_values = range(1, max_k + 1)
    inertia = []
    for k in k_values:
        # Fixed random_state so repeated runs give the same curve.
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_scaled)
        inertia.append(kmeans.inertia_)

    plt.figure(figsize=(8, 5))
    plt.plot(k_values, inertia, marker='o')
    plt.xticks(k_values)
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia (WSS)')
    plt.title(f'Elbow Method for Optimal k with {scaler}')
    plt.grid(True)
    plt.show()


In [None]:
# Draw the elbow curve for k = 1..15 on standard-scaled features.
elbow_method(
    df,
    max_k=15,
    scaler=StandardScaler(),
)

In [None]:
def cluster_and_plot(
    df: pd.DataFrame,
    cluster_num: int = 3,
    plot: bool = True,
    scaler=None,
    reduction_method: str = "pca"  # Options: 'pca', 'umap', None
):
    """Scale the numeric columns, run K-Means, print quality scores and plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, the caller's frame is not modified.
    cluster_num : int, default 3
        Number of K-Means clusters.
    plot : bool, default True
        Whether to draw a 2-D scatter plot of the clusters.
    scaler : object with ``fit_transform``, optional
        Scaler applied to the numeric columns. ``None`` (default) creates a
        fresh ``StandardScaler`` for this call.
    reduction_method : {'pca', 'umap', None}, default 'pca'
        Projection used for plotting. Any other value plots the scaled data
        directly, which is only possible with exactly two numeric columns.

    Returns
    -------
    tuple
        ``(df_with_cluster_column, fitted_kmeans, fitted_scaler, numeric_cols)``.
    """
    # A scaler instance used as a default argument is created once and shared by
    # every call, so a later call would re-fit the scaler an earlier call
    # returned. Create a new one per call instead.
    if scaler is None:
        scaler = StandardScaler()

    df = df.copy()

    # Select numeric columns (excluding pre-existing cluster)
    numeric_cols = [col for col in df.select_dtypes(include='number').columns if col != 'cluster']
    X_scaled = scaler.fit_transform(df[numeric_cols])

    # Fit KMeans
    kmeans = KMeans(n_clusters=cluster_num, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    df['cluster'] = labels

    # Evaluation
    print("📊 Evaluation Results")
    n_labels = len(np.unique(labels))
    if 2 <= n_labels < len(X_scaled):
        silhouette = silhouette_score(X_scaled, labels)
        calinski = calinski_harabasz_score(X_scaled, labels)
        davies = davies_bouldin_score(X_scaled, labels)
        print(f"Silhouette Score:       {silhouette:.3f}")
        print(f"Calinski-Harabasz Score: {calinski:.1f}")
        print(f"Davies-Bouldin Score:    {davies:.2f}")
    else:
        # The three scores require between 2 and n_samples - 1 distinct labels;
        # report that instead of letting scikit-learn raise.
        print(f"Scores are undefined for {n_labels} distinct cluster label(s).")
    print("\n🔢 Cluster Sizes:")
    print(df['cluster'].value_counts())

    # Plotting
    if plot:
        if reduction_method == "pca":
            X_plot = PCA(n_components=2).fit_transform(X_scaled)
            x_label, y_label = "PCA 1", "PCA 2"
        elif reduction_method == "umap":
            X_plot = UMAP(n_components=2, random_state=42).fit_transform(X_scaled)
            x_label, y_label = "UMAP 1", "UMAP 2"
        else:
            n_features = X_scaled.shape[1]
            if n_features > 2:
                print("❗ Cannot plot high-dimensional data without dimension reduction.")
                return df, kmeans, scaler, numeric_cols
            if n_features < 2:
                # A 2-D scatter needs two axes; indexing the second column
                # would raise IndexError.
                print("❗ Need two numeric features to plot without dimension reduction.")
                return df, kmeans, scaler, numeric_cols
            X_plot = X_scaled
            x_label, y_label = numeric_cols[0], numeric_cols[1]

        plt.figure(figsize=(8, 6))
        colors = plt.cm.viridis(np.linspace(0, 1, cluster_num))
        for cluster_id, color in zip(range(cluster_num), colors):
            cluster_points = X_plot[labels == cluster_id]
            plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                        s=50, color=color, label=f"Cluster {cluster_id}")

        plt.title(f"K-Means Clustering (k={cluster_num}) - {reduction_method.upper() if reduction_method else 'No Reduction'}")
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(title="Clusters")
        plt.grid(True)
        plt.show()

    return df, kmeans, scaler, numeric_cols


In [None]:
# Cluster into four groups on standard-scaled features and show them in a 2-D UMAP projection.
df_clustered, kmeans, scaler, numeric_cols = cluster_and_plot(
    df,
    cluster_num=4,
    plot=True,
    scaler=StandardScaler(),
    reduction_method='umap',
)

In [None]:
# Show every row when a DataFrame is displayed (no truncation); this applies to all later cells.
pd.set_option('display.max_rows', None)

In [None]:
# Rows that were assigned to cluster 3.
df_clustered.loc[df_clustered['cluster'].eq(3)]


In [None]:
# Render floats with five decimal places in all subsequent pandas output.
pd.set_option('display.float_format', '{:.5f}'.format)


In [None]:
# Undo the scaling on the K-Means centres so they can be read on the original feature scale.
centres_scaled = kmeans.cluster_centers_
centroids_unscaled = pd.DataFrame(
    data=scaler.inverse_transform(centres_scaled),
    columns=numeric_cols,
)
centroids_unscaled.head()


In [None]:
# Per-cluster median of every clustering feature.
by_cluster = df_clustered.groupby('cluster')
by_cluster[numeric_cols].median()


In [None]:
# Per-cluster mean of every clustering feature.
by_cluster = df_clustered.groupby('cluster')
by_cluster[numeric_cols].mean()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_feature_distributions(df, features, cluster_col='cluster', num_cols=2):
    """Draw one box plot per feature, grouped by cluster, on a subplot grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``cluster_col`` and every column in ``features``.
    features : iterable of str
        Columns to plot; nothing is drawn when it is empty.
    cluster_col : str, default 'cluster'
        Column used for the x-axis grouping.
    num_cols : int, default 2
        Number of subplot columns in the grid.

    Raises
    ------
    ValueError
        If ``num_cols`` is smaller than 1.
    """
    features = list(features)
    if not features:
        # Zero features would request a zero-height figure, which matplotlib rejects.
        return
    if num_cols < 1:
        raise ValueError("num_cols must be at least 1")

    # Ceiling division so a partially filled last row is still allocated.
    num_rows = -(-len(features) // num_cols)

    # 6 x 4 inches per panel (12 inches wide for the default two columns).
    plt.figure(figsize=(6 * num_cols, 4 * num_rows))

    for idx, feature in enumerate(features):
        plt.subplot(num_rows, num_cols, idx + 1)
        sns.boxplot(data=df, x=cluster_col, y=feature, palette='viridis')
        plt.title(f"{feature} by Cluster")
        plt.xlabel("Cluster")
        plt.ylabel(feature)
        plt.grid(True, axis='y')

    plt.tight_layout()
    plt.show()


In [None]:
# Box plots of every clustering feature, split by cluster.
plot_feature_distributions(df=df_clustered, features=numeric_cols)

In [None]:
# Per-cluster feature means. The labels live in `df_clustered`: `cluster_and_plot`
# works on a copy, so `df` itself never receives a `cluster` column and grouping
# `df` by it raises KeyError.
df_clustered.groupby('cluster')[numeric_cols].mean()


In [None]:
# Save the clustered dataset next to the input file.
output_file = path + 'df_clustered.csv'
df_clustered.to_csv(output_file)