<a href="https://www.kaggle.com/code/georgyniorosales/credit-card-clustering?scriptVersionId=145022924" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the credit-card dataset from the Kaggle input directory and preview it.
csv_path = '/kaggle/input/ccdata/CC GENERAL.csv'
dataframe = pd.read_csv(csv_path)
dataframe.head()

# Pre-processing data

Remove columns that carry little information for clustering:
- `CUST_ID`
- `TENURE`

In [None]:
# Drop the identifier and tenure columns, which are not used as clustering features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
dataframe = dataframe.drop(columns=['CUST_ID', 'TENURE'], errors='ignore')
dataframe.head()

# Find missing data

In [None]:
# Per-column count of missing (NaN) entries.
missing = dataframe.isnull().sum()
print(missing)

# Replace missing values with the column median

In [None]:
# Impute every NaN with the median of its own column, then re-count the
# missing entries to confirm that none remain.
column_medians = dataframe.median()
dataframe.fillna(value=column_medians, inplace=True)
missing_2 = dataframe.isna().sum()
print(missing_2)

# Normalize data

In [None]:
from sklearn.preprocessing import Normalizer

# Note: scikit-learn's Normalizer rescales each row (sample) to unit norm;
# it does not standardise the columns.
normalizer = Normalizer()
values = normalizer.fit_transform(dataframe.values)
print(values)

# Clustering

In [None]:
from sklearn.cluster import KMeans

# Fix the random seed: k-means starts from randomly chosen centroids, so without
# it the cluster numbering (and every score below) changes from run to run, which
# makes the per-cluster interpretation at the end of the notebook unreliable.
# Any fixed seed works; the written interpretation should be checked against it.
kmeans = KMeans(n_clusters=5, n_init=10, max_iter=300, random_state=42)
y_pred = kmeans.fit_predict(values)


# Validation
* Silhouette coefficient
* Davies-Bouldin index
* Calinski-Harabasz index

In [None]:
from sklearn import metrics

# Cluster assignment of every sample, as stored on the fitted model.
labels = kmeans.labels_

# The silhouette coefficient ranges from -1 to 1.
silhouette = metrics.silhouette_score(X=values, labels=labels, metric='euclidean')
print(silhouette)



In [None]:
# Davies-Bouldin index: values closer to 0 are better.
dbs = metrics.davies_bouldin_score(X=values, labels=labels)
print(dbs)

In [None]:
# Calinski-Harabasz index for the same partition.
calinski = metrics.calinski_harabasz_score(X=values, labels=labels)
print(calinski)

# Relative Validation

- Vary the clustering parameters (here, the number of clusters) and compare the scores to find the best configuration

In [None]:
def cluster_algorithm(n_clusters, dataset, random_state=42):
    """Fit k-means on ``dataset`` and score the resulting partition.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    dataset : array-like
        Samples to cluster; the same data is used to compute the scores.
    random_state : int or None, default 42
        Seed forwarded to ``KMeans`` so that repeated calls with the same
        arguments give the same scores. Pass ``None`` for the previous
        unseeded (run-to-run varying) behaviour.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, calinski_harabasz)`` for the fitted
        partition.
    """
    model = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        max_iter=300,
        random_state=random_state,
    )
    assigned = model.fit_predict(dataset)
    s = metrics.silhouette_score(dataset, assigned, metric='euclidean')
    dbs = metrics.davies_bouldin_score(dataset, assigned)
    calinski = metrics.calinski_harabasz_score(dataset, assigned)
    return s, dbs, calinski

In [None]:
# Score the clustering for every cluster count from 3 to 20 inclusive.
for n_groups in range(3, 21):
    sil_k, db_k, ch_k = cluster_algorithm(n_groups, values)
    print(f'- {n_groups}- clusters --> silhouette: {sil_k} Davis-Bouldin: {db_k} Calinski: {ch_k}')

In [None]:
# Non-null entries per column.
non_null_counts = dataframe.count()
non_null_counts

In [None]:
import numpy as np

# Random baseline: cluster uniform noise shaped exactly like the real data and
# compare its scores with those of the real data.
# The shape is taken from `values` instead of being hard-coded, and the generator
# is seeded so the baseline numbers are reproducible.
rng = np.random.default_rng(42)
random_data = rng.random(values.shape)
s, dbs, calinski = cluster_algorithm(6, random_data)
print(s, dbs, calinski)
print(cluster_algorithm(6, values))

In [None]:
# Stability check: split the data into three consecutive parts and score the
# same clustering configuration on each part separately.
set1, set2, set3 = np.array_split(values, 3)
split_scores = [cluster_algorithm(6, part) for part in (set1, set2, set3)]
(s1, dbs1, calinski1), (s2, dbs2, calinski2), (s3, dbs3, calinski3) = split_scores

for score_row in split_scores:
    print(*score_row)

In [None]:
import matplotlib.pyplot as plt

# x holds PURCHASES (amount spent) and y holds PAYMENTS (amount paid); the two
# axis labels were previously swapped.
plt.scatter(dataframe['PURCHASES'], dataframe['PAYMENTS'], c=labels, s=5, cmap='rainbow')
plt.xlabel('Valor total gasto')
plt.ylabel('Valor total pago')

In [None]:
import seaborn as sns

# Attach each customer's cluster id so it can drive the colour of the plots.
dataframe['cluster'] = labels

# Pairwise scatter plots of every column, coloured by cluster.
sns.pairplot(data=dataframe, hue='cluster')

# See the client values

In [None]:
# Summary statistics of every column, computed separately for each cluster.
cluster_groups = dataframe.groupby('cluster')
cluster_groups.describe()

In [None]:
# Coordinates of the fitted cluster centres (one row per cluster, one column
# per feature), taken from the k-means model fitted above.
centroids = kmeans.cluster_centers_
print(centroids)

In [None]:
# For each feature, print the variance of its coordinate across the cluster
# centroids. zip() stops at the shorter sequence, so the trailing 'cluster'
# column of the dataframe (which has no centroid coordinate) is not visited.
for feature_name, centroid_column in zip(dataframe.columns.values, centroids.T):
    print(feature_name, "\n{:.4f}".format(centroid_column.var()))

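## Searching for client behavior

Variance of selected features across the cluster centroids (taken from the output above):

```
BALANCE 0.0224
PURCHASES 0.0197
CASH_ADVANCE 0.0225
CREDIT_LIMIT 0.0360
PAYMENTS 0.0280
MINIMUM_PAYMENTS 0.0541 <- relates to the minimum payment required to keep using the credit card, so it is discarded
```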


In [None]:
# Per-cluster mean of the selected behaviour features, plus the number of
# clients that fall in each cluster.
behavior_columns = ['BALANCE', 'PURCHASES', 'CASH_ADVANCE', 'CREDIT_LIMIT', 'PAYMENTS']
grouped_behavior = dataframe.groupby('cluster')[behavior_columns]
n_clients = grouped_behavior.size()
description = grouped_behavior.mean()
description['n_clients'] = n_clients
print(description)

In [None]:
# Summary statistics of PRC_FULL_PAYMENT, computed separately for each cluster.
full_payment_by_cluster = dataframe.groupby('cluster')['PRC_FULL_PAYMENT']
full_payment_by_cluster.describe()

# Análise dos clientes por faixa de clusters

* Cluster 0: Clientes com os maiores valores gastos. Menor valor em compras. Valor considerável em saques. Péssimos pagadores. Boa quantidade de clientes.
* Cluster 1: Clientes com os menores valores gastos. Menor valor em saques. Clientes com maior limite de crédito. Bons pagadores. Maior número de clientes.
* Cluster 2: Clientes com menor limite de crédito. Não são bons pagadores. Menor número de clientes.
* Cluster 3: Maior valor em compras. Melhores pagadores.
* Cluster 4: Maior valor em saques. Pagam às vezes.