In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("debtData.csv")
print(df)

     CustomerID  Age  EducationLevel  YearsEmployed  Income  CardDebt  \
0             1   41               2              6      19     0.124   
1             2   47               1             26     100     4.582   
2             3   33               2             10      57     6.111   
3             4   29               2              4      19     0.681   
4             5   47               1             31     253     9.308   
..          ...  ...             ...            ...     ...       ...   
845         846   27               1              5      26     0.548   
846         847   28               2              7      34     0.359   
847         848   25               4              0      18     2.802   
848         849   32               1             12      28     0.116   
849         850   52               1             16      64     1.866   

     OtherDebt  Defaulted  DebtIncomeRatio  
0        1.073        0.0              6.3  
1        8.218        0.0        

In [None]:
#Select columns for clustering
features = ['Age', 'CustomerID', 'Age', 'EducationLevel', 'YearsEmployed', 'Income', 'CardDebt', 'OtherDebt', 'Defaulted', 'DebtIncomeRatio']
X = df[features]

In [4]:
#Scaling the columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
#WCSS and Silhouette Score Analysis
wcss = []
silhouette_scores = []
k_range = range(2, 6)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)
    score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append(score)

# Plot WCSS (Elbow method)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(k_range, wcss, marker='o')
plt.title('WCSS - Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.grid(True)

# Plot Silhouette Scores
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouette_scores, marker='o', color='green')
plt.title('Silhouette Scores')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)

plt.tight_layout()
plt.show()

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [6]:
# Step 5: Choose optimal k (e.g., from plot, let's say k=2)
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)


ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Step 6: Analyze Clusters
print("Cluster Centers:\n", kmeans.cluster_centers_)
print("\nCluster Summary:\n", df.groupby('Cluster').mean())

In [None]:
# Step 7: Visualize clusters (Income vs DebtIncomeRatio)
sns.scatterplot(data=df, x='Income', y='DebtIncomeRatio', hue='Cluster', palette='Set1')
plt.title('Customer Segmentation')
plt.show()