## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering 
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering 
from sklearn.cluster import DBSCAN
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans

In [None]:
# Load the credit-card dataset from the CSV file in the working directory
# and display the resulting frame.
card = pd.read_csv('CC GENERAL.csv')
card

In [None]:
# Number of missing values in each column of `card`.
card.isna().sum()

In [None]:
# Missing-value flags for the first five MINIMUM_PAYMENTS entries.
card['MINIMUM_PAYMENTS'].head().isna()

In [None]:
# Impute missing MINIMUM_PAYMENTS with the numeric 0.
# Filling with the string '0' would turn the column into a mixed float/str
# `object` column, which is then dropped or mishandled by numeric operations
# (correlation, distribution plots, scaling). Assigning the result back also
# avoids `inplace=True` on a column selection, which does not reliably write
# through to `card` under pandas copy-on-write.
card['MINIMUM_PAYMENTS'] = card['MINIMUM_PAYMENTS'].fillna(0)

In [None]:
# Re-check the per-column missing-value counts.
card.isna().sum()

In [None]:
# Boolean flag per row: is CREDIT_LIMIT missing?
card['CREDIT_LIMIT'].isna()

In [None]:
# Show the rows whose CREDIT_LIMIT is missing.
null_values = card['CREDIT_LIMIT'].isna()
card.loc[null_values]

In [None]:
# Impute missing CREDIT_LIMIT with the numeric 0.
# Filling with the string '0' would turn the column into a mixed float/str
# `object` column, which is then dropped or mishandled by numeric operations
# (correlation, distribution plots, scaling). Assigning the result back also
# avoids `inplace=True` on a column selection, which does not reliably write
# through to `card` under pandas copy-on-write.
card['CREDIT_LIMIT'] = card['CREDIT_LIMIT'].fillna(0)

In [None]:
# Working frame for the analysis: `card` without the CUST_ID column.
df = card.drop(columns=['CUST_ID'])
df

## Data Visualization

In [None]:
# Plot the distribution of every column of `df` in its own wide figure.
# `sns.displot` is a figure-level function that always opens its own figure,
# so calling it after `plt.figure(figsize=...)` left an empty 20x3 figure per
# column and the requested size was never applied. The axes-level
# `sns.histplot` (the plot `displot` draws by default) is drawn on the axes
# created here, so the figure size is honoured.
for column in df.columns:
    fig, ax = plt.subplots(figsize=(20, 3))
    sns.histplot(df[column], ax=ax)
    ax.set_title(column)
    plt.show()
    plt.close(fig)  # release the figure so open figures do not accumulate in the loop

1. It is visible from the above trends that most people have low credit limits, up to 7500.
2. Many variables, like Purchases and OneOffPurchases, follow the same trend as credit limit and hence might be correlated, which needs to be checked.
3. Purchase amounts are quite low for many people.
4. The most preferred method of purchasing is installments.
5. Most of the card holders hold the card for at least 12 months.

## Determining the correlation between the variables

In [None]:
# Pearson correlation matrix of the numeric columns of `card`.
# `card` still contains CUST_ID (only `df` has it dropped); pandas >= 2.0 no
# longer silently skips non-numeric columns in `DataFrame.corr` and raises
# instead, so restrict the frame to numeric dtypes explicitly.
card.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the Spearman rank correlations of `df`,
# rounded to two decimals.
spearman_corr = df.corr(method='spearman').round(2)
plt.figure(figsize=(15, 15))
sns.heatmap(spearman_corr, annot=True, cmap='YlGnBu')
plt.show()

From the correlation heatmap it is clear that:
1. Some variables, like PURCHASES_TRX, PURCHASES_FREQUENCY, PURCHASES and CASH_ADVANCE, are strongly correlated with many other variables, while others, like CASH_ADVANCE_TRX and PRC_FULL_PAYMENT, are only weakly correlated with the rest.
2. As the credit limit increases, the balance also increases, which is indicated by a strong positive correlation.
3. Purchases, one-off purchases and installment purchases are all related, as shown by their strong positive correlations.

Now, we need to see the plots to check what exactly is the relationship between these variables

In [None]:
# Pairwise plots of the columns of `df`, to inspect the relationships behind
# the correlations above.
sns.pairplot(df)
plt.show()

## Training and Test Dataset Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows of `df` as a test set; random_state=42 fixes the
# split so it is the same on every run.
train_df, test_df = train_test_split(df,test_size=0.2,random_state=42)

In [None]:
# Display the training split.
train_df

## Normalizing the values

In [None]:
# Two-stage feature transformation: min-max scaling followed by a power
# transform. Both transformers are fitted on the training split only and then
# applied unchanged to the test split, so no test-set statistics leak into
# the fit.
# NOTE(review): `train_df` / `test_df` are overwritten with the transformed
# output (presumably NumPy arrays rather than DataFrames -- confirm), so
# re-running this cell transforms already-transformed data. Restart the
# kernel and re-run the split before executing it again.
mm = MinMaxScaler()
train_df = mm.fit_transform(train_df)
test_df = mm.transform(test_df)
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
train_df = pt.fit_transform(train_df)
test_df = pt.transform(test_df)

## Clustering

In [None]:
# Positional slice of `card`: all rows, columns at positions 1 through 5.
# NOTE(review): position 0 is presumably CUST_ID, so this selects the first
# five feature columns -- confirm against the CSV header. These are raw,
# unscaled values.
x=card.iloc[:,1:6]

In [None]:
# Baseline K-Means with 5 clusters on the feature slice `x`.
# random_state is fixed so the centroid initialisation -- and therefore the
# resulting clustering -- is reproducible across runs, consistent with the
# seeded KMeans models used for the BALANCE/PURCHASES plots in this notebook.
km = KMeans(n_clusters=5, random_state=42)
km.fit(x)

In [None]:
# Refit the existing estimator on the transformed training data and assign
# every training row to its cluster.
y_pred = km.fit(train_df).predict(train_df)

In [None]:
# Re-wrap the transformed training data with the column names of `df` and
# attach the predicted cluster of each row as a `clusters` column.
cluster_df = pd.DataFrame(train_df, columns=df.columns).assign(clusters=y_pred)
cluster_df.head(10)

In [None]:
# Refit `km` on the feature slice `x` and keep the cluster label of each row.
# NOTE(review): this replaces whatever `km` was previously fitted on, so the
# estimator no longer corresponds to any earlier predictions made with it.
identified_clusters = km.fit_predict(x)
identified_clusters

In [None]:
# Copy of `df` with the cluster label of every row in a new `Cluster` column.
data_with_clusters = df.assign(Cluster=identified_clusters)
data_with_clusters

In [None]:
# Two-column array (BALANCE, PURCHASES) taken from the clustered training frame.
X = cluster_df.loc[:, ['BALANCE', 'PURCHASES']].to_numpy()

In [None]:
# Seeded 6-cluster K-Means on the BALANCE/PURCHASES pairs; fit the model and
# label every point with its cluster.
km = KMeans(n_clusters=6, init='k-means++', max_iter=300, random_state=42)
y_balance_pred = km.fit(X).predict(X)

In [None]:
# Scatter plot of the six BALANCE/PURCHASES clusters and their centroids.
# Labels are generated from the cluster index: the hand-written version
# labelled clusters 5 and 6 as 'Cluster 4'. A legend and axis labels are added
# because the `label=` arguments are otherwise never displayed.
cluster_colors = ['#c43735', '#243233', '#1b6116', '#fcba03', '#543d54', '#612116']
for cluster_idx, color in enumerate(cluster_colors):
    members = X[y_balance_pred == cluster_idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_idx + 1}')

plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=300, c='cyan', label='Centroids')
# X is built from the transformed training data, so both axes are in scaled units.
plt.xlabel('BALANCE (scaled)')
plt.ylabel('PURCHASES (scaled)')
plt.legend()
plt.show()

## Scaled data

In [None]:
from sklearn import preprocessing
# Standardise all columns of `df` with sklearn's `preprocessing.scale` and
# display the result. This uses the full data set, not the train/test split.
x_scaled = preprocessing.scale(df)
x_scaled

In [None]:
# 5-cluster K-Means on the standardised features.
# random_state is fixed so the clustering is reproducible across runs,
# consistent with the seeded KMeans models used elsewhere in this notebook.
km_scaled = KMeans(n_clusters=5, random_state=42)
km_scaled.fit(x_scaled)

In [None]:
# Within-cluster sum of squares (K-Means inertia) for each candidate cluster
# count, used to draw the elbow curve.
# `cl_num` is the exclusive upper bound of the range below: with cl_num = 10
# the cluster counts 1 through 9 are evaluated. The value is arbitrary.
cl_num = 10
wcss = []
for i in range(1, cl_num):
    # Fixed seed so the curve, and the elbows read off it, are reproducible.
    km = KMeans(n_clusters=i, random_state=42)
    km.fit(x_scaled)
    wcss.append(km.inertia_)
wcss

In [None]:
# Elbow curve: WCSS against the number of clusters that produced it.
number_clusters = range(1, cl_num)
elbow_fig, elbow_ax = plt.subplots(figsize=(15, 15))
elbow_ax.plot(number_clusters, wcss)
elbow_ax.set_title('The Elbow Method')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Within-cluster Sum of Squares')

From the elbow curve, we can see that the graph elbows when the number of clusters is 2, 3, 4 and 8.
Hence, we need to analyze how the scatter plots behave when the data is divided into 2, 3, 4 and 8 clusters respectively.

## 2 clusters

In [None]:
# 2-cluster K-Means on the standardised features.
# random_state is fixed so the clustering is reproducible across runs,
# consistent with the seeded KMeans models used elsewhere in this notebook.
km_2 = KMeans(n_clusters=2, random_state=42)
km_2.fit(x_scaled)

In [None]:
# Copy of the feature slice `x` with the 2-cluster label of each row.
# The original cell was corrupted by a paste: a fragment of the
# KMeans/fit/predict cell was inserted in the middle of `km_2.fit_predict`,
# which made the cell a syntax error. It is restored to the same pattern used
# for the 3-, 4- and 8-cluster sections.
clusters_2 = x.copy()
clusters_2['cluster_pred'] = km_2.fit_predict(x_scaled)

In [None]:
# Two-column array (BALANCE, PURCHASES) for the 2-cluster plot.
X_2 = cluster_df.loc[:, ['BALANCE', 'PURCHASES']].to_numpy()

In [None]:
# Seeded 2-cluster K-Means on the BALANCE/PURCHASES pairs; fit the model and
# label every point with its cluster.
km = KMeans(n_clusters=2, init='k-means++', max_iter=300, random_state=42)
y_2_balance_pred = km.fit(X_2).predict(X_2)

In [None]:
# Scatter plot of the two BALANCE/PURCHASES clusters and their centroids.
# A legend and axis labels are added: the `label=` arguments passed to
# `plt.scatter` are never displayed without `plt.legend()`.
cluster_colors = ['#c43735', '#243233']
for cluster_idx, color in enumerate(cluster_colors):
    members = X_2[y_2_balance_pred == cluster_idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_idx + 1}')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=300, c='cyan', label='Centroids')
# X_2 is built from the transformed training data, so both axes are in scaled units.
plt.xlabel('BALANCE (scaled)')
plt.ylabel('PURCHASES (scaled)')
plt.legend()
plt.show()

## 3 clusters

In [None]:
# 3-cluster K-Means on the standardised features.
# random_state is fixed so the clustering is reproducible across runs,
# consistent with the seeded KMeans models used elsewhere in this notebook.
km_3 = KMeans(n_clusters=3, random_state=42)
km_3.fit(x_scaled)

In [None]:
# Copy of the feature slice `x` with the 3-cluster label of each row
# (the model is refitted on the standardised features to obtain the labels).
clusters_3 = x.assign(cluster_pred=km_3.fit_predict(x_scaled))

In [None]:
# Two-column array (BALANCE, PURCHASES) for the 3-cluster plot.
X_3 = cluster_df.loc[:, ['BALANCE', 'PURCHASES']].to_numpy()

In [None]:
# Seeded 3-cluster K-Means on the BALANCE/PURCHASES pairs; fit the model and
# label every point with its cluster.
km = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=42)
y_3_balance_pred = km.fit(X_3).predict(X_3)

In [None]:
# Scatter plot of the three BALANCE/PURCHASES clusters and their centroids.
# A legend and axis labels are added: the `label=` arguments passed to
# `plt.scatter` are never displayed without `plt.legend()`.
cluster_colors = ['#c43735', '#243233', '#1b6116']
for cluster_idx, color in enumerate(cluster_colors):
    members = X_3[y_3_balance_pred == cluster_idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_idx + 1}')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=300, c='cyan', label='Centroids')
# X_3 is built from the transformed training data, so both axes are in scaled units.
plt.xlabel('BALANCE (scaled)')
plt.ylabel('PURCHASES (scaled)')
plt.legend()
plt.show()

## 4 clusters

In [None]:
# 4-cluster K-Means on the standardised features.
# random_state is fixed so the clustering is reproducible across runs,
# consistent with the seeded KMeans models used elsewhere in this notebook.
km_4 = KMeans(n_clusters=4, random_state=42)
km_4.fit(x_scaled)

In [None]:
# Copy of the feature slice `x` with the 4-cluster label of each row
# (the model is refitted on the standardised features to obtain the labels).
clusters_4 = x.assign(cluster_pred=km_4.fit_predict(x_scaled))

In [None]:
# Two-column array (BALANCE, PURCHASES) for the 4-cluster plot.
X_4 = cluster_df.loc[:, ['BALANCE', 'PURCHASES']].to_numpy()

In [None]:
# Seeded 4-cluster K-Means on the BALANCE/PURCHASES pairs; fit the model and
# label every point with its cluster.
km = KMeans(n_clusters=4, init='k-means++', max_iter=300, random_state=42)
y_4_balance_pred = km.fit(X_4).predict(X_4)

In [None]:
# Scatter plot of the four BALANCE/PURCHASES clusters and their centroids.
# A legend and axis labels are added: the `label=` arguments passed to
# `plt.scatter` are never displayed without `plt.legend()`.
cluster_colors = ['#c43735', '#243233', '#1b6116', '#fcba03']
for cluster_idx, color in enumerate(cluster_colors):
    members = X_4[y_4_balance_pred == cluster_idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_idx + 1}')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=300, c='cyan', label='Centroids')
# X_4 is built from the transformed training data, so both axes are in scaled units.
plt.xlabel('BALANCE (scaled)')
plt.ylabel('PURCHASES (scaled)')
plt.legend()
plt.show()

## 8 clusters

In [None]:
# 8-cluster K-Means on the standardised features.
# random_state is fixed so the clustering is reproducible across runs,
# consistent with the seeded KMeans models used elsewhere in this notebook.
km_8 = KMeans(n_clusters=8, random_state=42)
km_8.fit(x_scaled)

In [None]:
# Copy of the feature slice `x` with the 8-cluster label of each row
# (the model is refitted on the standardised features to obtain the labels).
clusters_8 = x.assign(cluster_pred=km_8.fit_predict(x_scaled))

In [None]:
# Two-column array (BALANCE, PURCHASES) for the 8-cluster plot.
X_8 = cluster_df.loc[:, ['BALANCE', 'PURCHASES']].to_numpy()

In [None]:
# Seeded 8-cluster K-Means on the BALANCE/PURCHASES pairs; fit the model and
# label every point with its cluster.
km = KMeans(n_clusters=8, init='k-means++', max_iter=300, random_state=42)
y_8_balance_pred = km.fit(X_8).predict(X_8)

In [None]:
# Scatter plot of the eight BALANCE/PURCHASES clusters and their centroids.
# Labels are generated from the cluster index: the hand-written version
# labelled clusters 5-8 as 'Cluster 1'-'Cluster 4' again. A legend and axis
# labels are added because the `label=` arguments are otherwise never displayed.
cluster_colors = [
    '#c43735', '#243233', '#1b6116', '#fcba03',
    '#bae8c6', '#3287a1', '#523646', '#381113',
]
for cluster_idx, color in enumerate(cluster_colors):
    members = X_8[y_8_balance_pred == cluster_idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_idx + 1}')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=300, c='cyan', label='Centroids')
# X_8 is built from the transformed training data, so both axes are in scaled units.
plt.xlabel('BALANCE (scaled)')
plt.ylabel('PURCHASES (scaled)')
plt.legend()
plt.show()