In [None]:
import numpy as np
import pandas as pd

# Load the credit card CSV; every later cell works from this `df` binding.
df = pd.read_csv("data/credit_card.csv")

# A notebook cell only auto-renders its final expression, so the row preview
# must be displayed explicitly or it is silently discarded.
display(df.head())

# Summary statistics, rendered as the cell's output.
df.describe()


In [None]:
# Remove the identifier column, then discard every row that has a missing value.
# errors="ignore" keeps the cell re-runnable: `df` is rebound in place, so a second
# execution would otherwise raise KeyError because CUST_ID is already gone.
df = df.drop(columns="CUST_ID", errors="ignore").dropna()

# Rows x columns remaining after cleaning.
df.shape

In [None]:
from sklearn.preprocessing import StandardScaler 

# Standardise the feature columns before distance-based clustering.
# Later cells append "Cluster" and "Cluster_3" label columns to `df`; they are
# excluded here so that re-running this cell never feeds cluster labels back in
# as features (on a fresh top-to-bottom run neither column exists yet, so
# errors="ignore" makes the drop a no-op).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(
    df.drop(columns=["Cluster", "Cluster_3"], errors="ignore")
)

# Peek at one standardised row (positional index 5) as a sanity check.
X_scaled[5]

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for k = 1..10 and record each model's inertia (WCSS).
# One shared range drives both the fits and the x-axis so they cannot drift apart.
k_values = range(1, 11)

# Each model is built inline instead of being bound to `kmeans`, so re-running
# this cell cannot replace the fitted model that a later cell reads centroids from.
wcss = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, wcss, marker='o')
ax.set_title("Elbow Method")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("WCSS")
plt.show()




In [None]:
from sklearn.metrics import silhouette_score
# Silhouette analysis for k = 2..10 (the score needs at least two clusters).
# A loop-specific name is used for the labels and the model is left unbound, so
# re-running this cell cannot overwrite the `kmeans` / `labels` of the final model.
for k in range(2, 11):
    candidate_labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
    score = silhouette_score(X_scaled, candidate_labels)
    print(f"K={k}, Silhouette Score={score:.4f}")

In [None]:
# Two-cluster model on the standardised features; `kmeans` is read again
# further down when the centroids are inspected.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_scaled)
labels = kmeans.labels_

# Attach each row's cluster assignment to the cleaned frame and preview it.
df["Cluster"] = labels
df.head(n=10)

In [None]:
# Per-cluster feature means for the K=2 segmentation.
# "Cluster_3" is itself a label column added by a later cell; dropping it (when
# present) stops a re-run from averaging label ids as if they were a feature.
df.drop(columns="Cluster_3", errors="ignore").groupby("Cluster").mean()

In [None]:
from sklearn.decomposition import PCA
# Reduce the standardised features to two principal components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Shapes before and after the projection.
for matrix in (X_scaled, X_pca):
    print(matrix.shape)


In [None]:
# Scatter of the two principal components, coloured by the K=2 cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df["Cluster"],
    cmap="viridis",
    alpha=0.6,
)
ax.set(
    xlabel="Principal Component 1",
    ylabel="Principal Component 2",
    title="Customer Segmentation (PCA Projection)",
)
fig.colorbar(points, ax=ax, label="Cluster")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Three-cluster variant of the segmentation, stored next to the K=2 labels.
kmeans_3 = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
labels_3 = kmeans_3.labels_
df["Cluster_3"] = labels_3

print("Silhouette K=3:", silhouette_score(X_scaled, labels_3))

# Same PCA projection as before, now coloured by the K=3 assignment.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df["Cluster_3"],
    cmap="viridis",
    alpha=0.6,
)
ax.set(title="Customer Segmentation (K=3)", xlabel="PC1", ylabel="PC2")
fig.colorbar(points, ax=ax)
plt.show()



In [None]:
# Cluster centres of whichever model `kmeans` is currently bound to.
# NOTE(review): on a top-to-bottom run that is the n_clusters=2 model fitted on
# X_scaled, but earlier loops also assign to `kmeans`, so out-of-order
# execution would change what this returns.
centroids_scaled = kmeans.cluster_centers_
# Display the centres (still in standardised units).
centroids_scaled



In [None]:
# Undo the standardisation so the centroids are expressed in original feature units.
centroids_original = scaler.inverse_transform(centroids_scaled)

# Take column names from the scaler itself: they are exactly the columns it was
# fitted on. The positional slice df.columns[:-2] is only right while exactly two
# cluster-label columns trail the features, so it is kept purely as a fallback
# for scikit-learn versions that do not record feature names.
feature_names = getattr(scaler, "feature_names_in_", df.columns[:-2])
centroids_df = pd.DataFrame(centroids_original, columns=feature_names)
centroids_df


In [None]:
# Loadings of each original feature on the two principal components.
# PCA was fitted on the scaler's output, so the scaler's recorded feature names
# are the matching column labels. df.columns[:-2] depends on exactly two
# cluster-label columns having been appended to `df`, so it is kept only as a
# fallback for scikit-learn versions that do not record feature names.
feature_names = getattr(scaler, "feature_names_in_", df.columns[:-2])
pca_components = pd.DataFrame(pca.components_, columns=feature_names)
pca_components

In [None]:
# Cluster directly in the 2-D PCA space and score the result in that same space.
kmeans_pca = KMeans(n_clusters=2, random_state=42).fit(X_pca)
labels_pca = kmeans_pca.labels_

silhouette_pca = silhouette_score(X_pca, labels_pca)
print("Silhouette Score (PCA space):", silhouette_pca)



In [None]:
# Scatter of the PCA projection, coloured by the labels fitted in PCA space.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=labels_pca,
    cmap="viridis",
    alpha=0.6,
)
ax.set(title="Clustering After PCA (K=2)", xlabel="PC1", ylabel="PC2")
fig.colorbar(points, ax=ax)
plt.show()
