In [None]:
import pandas as pd
from scipy import stats
import numpy as np

from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Read the dataset from a CSV file (relative path, so it is resolved against
# the current working directory).
data = pd.read_csv(filepath_or_buffer="Credit_card_dataset.csv")

# A bare last expression makes the notebook render the frame.
data

In [53]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CUST_ID            8950 non-null   object 
 1   BALANCE_FREQUENCY  8950 non-null   float64
 2   PURCHASES          8950 non-null   float64
 3   PAYMENTS           8950 non-null   float64
 4   CREDIT_LIMIT       8949 non-null   float64
 5   CASH_ADVANCE       8950 non-null   float64
dtypes: float64(5), object(1)
memory usage: 419.7+ KB
None


In [55]:
# Descriptive statistics produced by describe(), printed as plain text.
numeric_summary = data.describe()
print(numeric_summary)

       BALANCE_FREQUENCY     PURCHASES      PAYMENTS  CREDIT_LIMIT  \
count        8950.000000   8950.000000   8950.000000   8949.000000   
mean            0.877271   1003.204834   1733.143852   4494.449450   
std             0.236904   2136.634782   2895.063757   3638.815725   
min             0.000000      0.000000      0.000000     50.000000   
25%             0.888889     39.635000    383.276166   1600.000000   
50%             1.000000    361.280000    856.901546   3000.000000   
75%             1.000000   1110.130000   1901.134317   6500.000000   
max             1.000000  49039.570000  50721.483360  30000.000000   

       CASH_ADVANCE  
count   8950.000000  
mean     978.871112  
std     2097.163877  
min        0.000000  
25%        0.000000  
50%        0.000000  
75%     1113.821139  
max    47137.211760  


In [57]:
# Count the missing values in each column.
print(data.isna().sum())

CUST_ID              0
BALANCE_FREQUENCY    0
PURCHASES            0
PAYMENTS             0
CREDIT_LIMIT         1
CASH_ADVANCE         0
dtype: int64


In [59]:
# Handle missing values by mean imputation.
# Step 1: pull the "mean" entry for every described column out of the
# describe() summary (transposed so that the statistics become columns).
mean_values = data.describe().T["mean"]
print(mean_values)

BALANCE_FREQUENCY       0.877271
PURCHASES            1003.204834
PAYMENTS             1733.143852
CREDIT_LIMIT         4494.449450
CASH_ADVANCE          978.871112
Name: mean, dtype: float64


In [61]:
# 2. Fill missing values with the per-column means.
# fillna() returns a new frame, which is bound back to `data` instead of
# mutating the existing object with inplace=True. Columns that have no entry
# in mean_values are left untouched.
data = data.fillna(mean_values)

In [63]:
# 3. Re-count the missing values per column to confirm the fill worked.
print(data.isna().sum())

CUST_ID              0
BALANCE_FREQUENCY    0
PURCHASES            0
PAYMENTS             0
CREDIT_LIMIT         0
CASH_ADVANCE         0
dtype: int64


In [65]:
# Encode the categorical column: replace each CUST_ID value with its integer
# category code.
# NOTE(review): CUST_ID looks like a per-customer identifier rather than a
# feature -- confirm it should be encoded at all. Once it holds integer codes
# it counts as a numeric column for any numeric-only selection.
data["CUST_ID"] = pd.Categorical(data["CUST_ID"]).codes

In [67]:
# Remove duplicates: keep only the rows that do not repeat an earlier row
# (the first occurrence of each row is retained).
data = data[~data.duplicated()]

In [69]:
# Handle outliers: keep only the rows whose z-score is below 3 in absolute
# value in every numeric column.
numeric_cols = data.select_dtypes(include=[np.number])
within_3_sigma = (np.abs(stats.zscore(numeric_cols)) < 3).all(axis=1)
# .copy() gives an independent frame, so columns assigned to `data` later are
# written to this frame and not to a slice of the unfiltered one.
data = data[within_3_sigma].copy()

# Display the cleaned data. info() prints its own report and returns None,
# so it is called directly rather than through print().
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8314 entries, 0 to 8949
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CUST_ID            8314 non-null   int16  
 1   BALANCE_FREQUENCY  8314 non-null   float64
 2   PURCHASES          8314 non-null   float64
 3   PAYMENTS           8314 non-null   float64
 4   CREDIT_LIMIT       8314 non-null   float64
 5   CASH_ADVANCE       8314 non-null   float64
dtypes: float64(5), int16(1)
memory usage: 406.0 KB
None


In [71]:
# Feature matrix: all rows, restricted to the two columns used for clustering.
X = data.loc[:, ["PURCHASES", "CREDIT_LIMIT"]]

In [73]:
# Standardize the features: fit the scaler on X first, then apply it to the
# same data to obtain the scaled array.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [75]:
# Hierarchical clustering: build the linkage matrix from the scaled features
# using the "ward" method.
linked = linkage(X_scaled, "ward")

In [None]:
# Draw the dendrogram on an explicit 10x7 axes.
# NOTE(review): no truncate_mode is passed, so every observation becomes its
# own leaf; with thousands of rows the x-axis labels will overlap -- confirm
# the full tree is really what should be shown.
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(
    linked,
    orientation="top",
    distance_sort="descending",
    show_leaf_counts=True,
    ax=ax,
)
ax.set_title("Hierarchical Clustering Dendrogram")
ax.set_xlabel("Customers")
ax.set_ylabel("Distance")
plt.show()

In [None]:
# K-means with 3 clusters; random_state is pinned to 42.
# The fitted model's labels_ are stored as a new "Cluster" column.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
data["Cluster"] = kmeans.labels_

In [None]:
# Scatter of the two clustering features (unscaled), one colour per cluster.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(
    data["PURCHASES"],
    data["CREDIT_LIMIT"],
    c=data["Cluster"],
    cmap="viridis",
)
ax.set_title("K-means Clustering")
ax.set_xlabel("Purchases")
ax.set_ylabel("Credit Limit")
plt.show()

In [None]:
# Elbow Method: fit K-means for k = 1..10 and record each model's inertia_
# (the value plotted as WCSS).
wcss = []
for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_scaled)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve: WCSS against the number of clusters.
# The x values are derived from the length of wcss (k starts at 1).
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(range(1, len(wcss) + 1), wcss)
ax.set_title("Elbow Method")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("WCSS")
plt.show()

In [None]:
# K-means with the chosen k (example: k=3); labels go into "Best_Cluster".
# NOTE(review): k is hard-coded here, not derived from the elbow curve, and
# the settings match the earlier k=3 fit -- presumably the labels equal the
# "Cluster" column; confirm this repeat is intended.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
data["Best_Cluster"] = kmeans.labels_

In [None]:
# Scatter of the two clustering features, coloured by the "Best_Cluster" label.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(
    data["PURCHASES"],
    data["CREDIT_LIMIT"],
    c=data["Best_Cluster"],
    cmap="viridis",
)
ax.set_title("K-means Clustering with Best k")
ax.set_xlabel("Purchases")
ax.set_ylabel("Credit Limit")
plt.show()