In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Allow up to 100 rows to be rendered when a frame or series is displayed.
pd.set_option('display.max_rows', 100)

In [None]:
# Read CC_GENERAL.csv (path is relative to the working directory) into `df`.
df = pd.read_csv('CC_GENERAL.csv')

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the numeric columns.
df.describe()

In [None]:
# Ratio of standard deviation to mean for every column reported by describe(),
# plotted as one bar per column.
summary_stats = df.describe()
std_mean = summary_stats.loc['std'] / summary_stats.loc['mean']
px.bar(
    x=std_mean.index,
    y=std_mean,
    color=std_mean.index,
    labels={'x':'parameter', 'y':'value'},
    title='relative distribution',
    template='seaborn',
)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill missing MINIMUM_PAYMENTS and CREDIT_LIMIT values with each column's median.
median_fill = {
    'MINIMUM_PAYMENTS': df['MINIMUM_PAYMENTS'].median(),
    'CREDIT_LIMIT': df['CREDIT_LIMIT'].median(),
}
df = df.fillna(value=median_fill)

In [None]:
# Re-count missing values per column after the median fill.
df.isna().sum()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# PURCHASES divided by TENURE (TENURE presumably counts months — confirm).
# Column-wise division replaces the Python-level row-by-row apply.
df['monthly_avg_purchase'] = df['PURCHASES'] / df['TENURE']

In [None]:
# CASH_ADVANCE divided by TENURE (TENURE presumably counts months — confirm).
# Column-wise division replaces the Python-level row-by-row apply.
df['monthly_cash_advance'] = df['CASH_ADVANCE'] / df['TENURE']

In [None]:
# BALANCE as a fraction of CREDIT_LIMIT (a utilisation ratio, despite the
# column name). Column-wise division replaces the row-by-row apply.
df['credit_score'] = df['BALANCE'] / df['CREDIT_LIMIT']

In [None]:
# PAYMENTS relative to MINIMUM_PAYMENTS. Column-wise division replaces the
# row-by-row apply; a zero denominator yields inf/NaN instead of raising.
df['payment_minpay'] = df['PAYMENTS'] / df['MINIMUM_PAYMENTS']

In [None]:
# Preview the frame with the four derived ratio columns added.
df.head()

In [None]:
def purchase_type(x):
    """Classify a record by which of its purchase amounts are positive.

    Parameters
    ----------
    x : mapping-like (for example a DataFrame row)
        Must provide 'ONEOFF_PURCHASES' and 'INSTALLMENTS_PURCHASES'.

    Returns
    -------
    str or None
        'none', 'oneoff_only', 'installment_only' or 'both'. None is
        returned when either amount is neither zero nor positive
        (for example negative or NaN).
    """
    oneoff = x['ONEOFF_PURCHASES']
    installments = x['INSTALLMENTS_PURCHASES']

    if oneoff == 0:
        if installments == 0:
            return 'none'
        if installments > 0:
            return 'installment_only'
    elif oneoff > 0:
        if installments == 0:
            return 'oneoff_only'
        if installments > 0:
            return 'both'
    # No category matched (negative or NaN amount).
    return None
# Label every row with its purchase category and store it as a new column.
purchase_type_labels = df.apply(purchase_type, axis='columns')
df['purchase_type'] = purchase_type_labels

In [None]:
# Number of rows in each purchase_type category.
df['purchase_type'].value_counts()

In [None]:
# Preview the frame including the new purchase_type column.
df.head()

In [None]:
# Mean payment_minpay for each purchase type. Selecting the column before
# aggregating avoids a Python lambda per group and the pandas deprecation
# warning about groupby.apply operating on the grouping column.
payment_minpay_purchase_type = df.groupby('purchase_type')['payment_minpay'].mean()
px.bar(x=payment_minpay_purchase_type, y=payment_minpay_purchase_type.index,
       color=payment_minpay_purchase_type.index, template='seaborn', orientation='h',
       labels={'x': 'mean payment_minpay', 'y': 'purchase_type', 'color': 'purchase_type'},
       title='Mean payment_minpay by purchase type')

In [None]:
# Mean monthly_cash_advance for each purchase type. Selecting the column before
# aggregating avoids a Python lambda per group and the pandas deprecation
# warning about groupby.apply operating on the grouping column.
cash_advance_purchase_type = df.groupby('purchase_type')['monthly_cash_advance'].mean()
px.bar(x=cash_advance_purchase_type, y=cash_advance_purchase_type.index,
       color=cash_advance_purchase_type.index, template='seaborn', orientation='h',
       labels={'x': 'mean monthly_cash_advance', 'y': 'purchase_type', 'color': 'purchase_type'},
       title='Mean monthly_cash_advance by purchase type')

In [None]:
# Mean credit_score (BALANCE / CREDIT_LIMIT) for each purchase type. Selecting
# the column before aggregating avoids a Python lambda per group and the pandas
# deprecation warning about groupby.apply operating on the grouping column.
credit_score_purchase_type = df.groupby('purchase_type')['credit_score'].mean()
px.bar(x=credit_score_purchase_type, y=credit_score_purchase_type.index,
       color=credit_score_purchase_type.index, template='seaborn', orientation='h',
       labels={'x': 'mean credit_score', 'y': 'purchase_type', 'color': 'purchase_type'},
       title='Mean credit_score by purchase type')

In [None]:
# Correlate numeric columns only: purchase_type holds strings (and CUST_ID
# presumably does too), and pandas >= 2.0 raises on non-numeric columns instead
# of silently dropping them. The matrix is computed once and reused for the
# axis labels rather than being recomputed three times.
corr_matrix = df.select_dtypes(include='number').corr()
px.imshow(corr_matrix, x=corr_matrix.columns, y=corr_matrix.columns, template='seaborn')

In [None]:
# Preview the frame before building the modelling features.
df.head()

In [None]:
# log(1 + x) of every remaining column after dropping the identifier and the
# categorical label. np.log1p is applied to the whole frame at once, replacing
# the element-wise applymap (deprecated since pandas 2.1) and giving better
# precision for values close to zero.
df_log = np.log1p(df.drop(columns=['CUST_ID', 'purchase_type']))

In [None]:
# One indicator column per purchase_type category. Requesting float output
# keeps the dtype the same across pandas versions (newer releases default to
# bool), so the combined feature matrix stays purely numeric.
df_one_hot = pd.get_dummies(df['purchase_type'], dtype=float)

In [None]:
# Place the log-transformed columns and the indicator columns side by side.
df_machine_learning = pd.concat(objs=(df_log, df_one_hot), axis='columns')

In [None]:
# Preview the combined feature frame.
df_machine_learning.head()

In [None]:
# Standardise every feature column, then wrap the resulting array back into a
# DataFrame that carries the original column names.
scale = StandardScaler().fit(df_machine_learning)
df_scaled = pd.DataFrame(
    data=scale.transform(df_machine_learning),
    columns=df_machine_learning.columns,
)

In [None]:
# Preview the standardised features.
df_scaled.head()

In [None]:
# Fit a PCA that keeps every component (n_components=None is the default) so
# the explained-variance profile of the scaled features can be inspected.
pca = PCA(n_components=None)
pca.fit(df_scaled)

In [None]:
# Cumulative explained-variance ratio against the number of components kept.
# The x axis starts at 1: the first cumulative value is the variance explained
# by one component, so a 0-based axis would understate every count by one.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
px.line(y=cumulative_variance, x=np.arange(1, len(cumulative_variance) + 1),
        labels={'x': 'number of components', 'y': 'cumulative explained variance ratio'})

In [None]:
# Keep the first 8 principal components. A fixed random_state makes the fit
# reproducible in case the 'auto' solver picks a randomized SVD (which solver
# is chosen depends on the scikit-learn version and the data shape).
pca_8 = PCA(n_components=8, random_state=42)
pca_8.fit(df_scaled)

In [None]:
# Project the scaled features onto the fitted components. Column names are
# derived from the fitted model so they stay in sync if n_components changes.
df_pca = pd.DataFrame(pca_8.transform(df_scaled),
                      columns=[f'pca_{i}' for i in range(pca_8.n_components_)])

In [None]:
# Display the projected frame (pandas truncates long frames when rendering).
df_pca

In [None]:
# Sweep the number of clusters and record, for each k, the within-cluster sum
# of squares (inertia) and the mean silhouette score on the PCA features.
# random_state makes the sweep reproducible; n_init is pinned because its
# default differs between scikit-learn releases.
cluster_range = range(2, 20)
inertia = []
silhouette_score_list = []
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(df_pca)
    inertia.append(kmeans.inertia_)
    silhouette_score_list.append(silhouette_score(df_pca, kmeans.labels_))

In [None]:
# Elbow plot. The x values are derived from the number of recorded results
# (the sweep starts at k=2) instead of repeating the sweep bounds by hand.
px.line(y=inertia, x=list(range(2, 2 + len(inertia))),
        labels={'x': 'number of clusters', 'y': 'inertia'},
        title='KMeans inertia by number of clusters').update_traces(mode='markers+lines')

In [None]:
# Silhouette score per k. The x values are derived from the number of recorded
# results (the sweep starts at k=2) instead of repeating the bounds by hand.
px.line(y=silhouette_score_list, x=list(range(2, 2 + len(silhouette_score_list))),
        labels={'x': 'number of clusters', 'y': 'silhouette score'},
        title='Silhouette score by number of clusters').update_traces(mode='markers+lines')

In [None]:
# Final 5-cluster model. Without a fixed random_state the cluster numbering
# (and possibly the assignment) changes on every run, which would invalidate
# any per-cluster interpretation written against earlier output. n_init is
# pinned because its default differs between scikit-learn releases.
kmean = KMeans(n_clusters=5, n_init=10, random_state=42)
kmean.fit(df_pca)

In [None]:
# Work on a copy so df_pca itself keeps only the component columns.
df_clustered = df_pca.copy()

In [None]:
# Attach the cluster label assigned to each row by the fitted KMeans model.
df_clustered['class'] = kmean.labels_

In [None]:
# Preview the components together with their cluster label.
df_clustered.head()

In [None]:
# Pairwise scatter plots of the components, coloured by cluster. The labels
# are cast to strings so plotly treats them as discrete categories instead of
# mapping the integer ids onto a continuous colour scale.
px.scatter_matrix(df_pca, color=kmean.labels_.astype(str), template='seaborn')

In [None]:
# Copy of the original feature frame with the cluster label of each row
# attached as a 'class' column.
df_cluster2 = df.assign(**{'class': kmean.labels_})

In [None]:
# Preview the original columns together with the cluster label.
df_cluster2.head()

In [None]:
# Per-cluster mean of the four derived ratio features. Selecting the columns
# before aggregating avoids a Python lambda per group and the pandas
# deprecation warning about groupby.apply operating on the grouping column.
selected_col = ['monthly_avg_purchase', 'monthly_cash_advance',
                'credit_score', 'payment_minpay']
temp = df_cluster2.groupby('class')[selected_col].mean()

In [None]:
# Reshape to long form: one row per (class, variable) pair with its value.
# Note: this rebinds `temp`, so the cell is meant to run once after the
# aggregation cell.
temp = pd.melt(temp.reset_index(), id_vars='class')
temp

In [None]:
# One facet per feature, two facets per row, with bars grouped by cluster.
cluster_profile_fig = px.bar(
    temp,
    x='class',
    y='value',
    color='variable',
    barmode='group',
    facet_col='variable',
    facet_col_wrap=2,
    template='seaborn',
)
# Let every facet use its own axis range instead of sharing one.
cluster_profile_fig.update_yaxes(matches=None)
cluster_profile_fig.update_xaxes(matches=None)
cluster_profile_fig