# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from feat.feature_builder import FeatureHandler
import datetime

from sklearn.preprocessing import StandardScaler


In [None]:
# Parse the customer table once and derive both views from the same frame
# instead of reading the CSV from disk twice.
raw_customers = pd.read_csv("/workspaces/hnb/data/customer_data.csv")

# All customers without the "Loyalty Points" column, complete rows only.
customers = raw_customers.drop("Loyalty Points", axis=1).dropna()

# Only customers whose "Loyalty Card" flag equals 1, complete rows only
# ("Loyalty Points" is kept in this view).
customers2 = raw_customers[raw_customers["Loyalty Card"] == 1].dropna()

trans = pd.read_csv("/workspaces/hnb/data/transactions_data.csv")
# Vectorised parse of the "YYYY-MM-DD" strings; replaces a per-row
# datetime.strptime loop and produces the same datetime64 column.
trans["Date"] = pd.to_datetime(trans["Date"], format="%Y-%m-%d")

# Transactions joined with their customer's attributes, with the identifier,
# date and "Incomplete Transaction" columns removed.
complete_df = trans.merge(customers, on="Customer ID").drop(
    ["Transaction ID", "Customer ID", "Date", "Incomplete Transaction"],
    axis=1,
)


The following chunk looks for variance in the data and checks for possible synthetic data

In [None]:
# Heuristic split of the customer rows: a row counts as "real" when at least
# one of its values occurs exactly once in that value's column, and as "fake"
# (possibly synthetic) when every one of its values is repeated elsewhere.
test = customers.values[:, 1:]  # every column except the first

# unique_count[i, j] is set to 1 when row i holds a value seen only once in
# column j.
unique_count = np.zeros_like(test)

n_columns = test.shape[1]
for col in range(n_columns):
    _, first_pos, occurrences = np.unique(
        test[:, col], return_index=True, return_counts=True
    )
    # Rows holding the values that appear a single time in this column.
    one_off_rows = first_pos[occurrences == 1]
    unique_count[one_off_rows, col] += 1

# Number of one-off values found in each row.
one_offs_per_row = np.sum(unique_count, axis=1)
real_samples = np.argwhere(one_offs_per_row > 0)[:, 0]
synth_samples = np.argwhere(one_offs_per_row == 0)[:, 0]

print("real", len(real_samples))
print("fake", len(synth_samples))

# Categorical variables to numerical

In [None]:
# Keep the identifier column aside before the frame is re-encoded.
ids = customers["Customer ID"]

# One-hot encode every column except the first (presumably the identifier
# saved above - confirm against the CSV layout); indicators are floats.
columns_to_encode = customers.iloc[:, 1:]
customers = pd.get_dummies(columns_to_encode, dtype=float)


In [None]:
# Hand the encoded customer frame to the project's FeatureHandler and let it
# run its categorical-to-numerical step (implementation not visible here).
vh = FeatureHandler(customers)
vh.categorical_to_numerical()

# Standardise the handler's frame; `df` is the scaled matrix used by the
# clustering and PCA cells.
scaler = StandardScaler()
handler_frame = vh.df
df = scaler.fit_transform(handler_frame)

# Clustering

Look for possible clusters and assess their purity using the silhouette score

In [None]:

# Baseline: split the standardised features into two k-means clusters.
kmeans = KMeans(init='k-means++', n_clusters=2)
kmeans.fit(df)

# Silhouette score of that partition, measured with Manhattan distances.
two_cluster_score = silhouette_score(df, kmeans.labels_, metric='manhattan')
print(two_cluster_score)

In [None]:
# Project the standardised features (`df`) onto their first three principal
# components.
# NOTE: no StandardScaler is created in this cell. PCA only needs `df`, and
# rebinding `scaler` to a fresh, unfitted instance would throw away the scaler
# that was fitted when `df` was produced.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(df)

# Bar chart of the explained-variance ratio of each component.
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_, color='black')
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.xticks(features)

# One row per sample, one column per component; the bare expression on the
# last line makes the notebook display the frame.
PCA_components = pd.DataFrame(principalComponents)
PCA_components

In [None]:
# Cluster the standardised features into four groups and tag every PCA row
# with the cluster it was assigned to.
model = KMeans(n_clusters=4)
clusters = model.fit_predict(df)
PCA_components["label"] = clusters

# 3-D scatter of the first three components, one colour per cluster id.
fig = plt.figure(figsize=(21, 10))
ax = fig.add_subplot(111, projection='3d')
for cluster_id, colour in enumerate(('blue', 'red', 'green', 'orange')):
    in_cluster = PCA_components.label == cluster_id
    ax.scatter(
        PCA_components.iloc[:, 0][in_cluster],
        PCA_components.iloc[:, 1][in_cluster],
        PCA_components.iloc[:, 2][in_cluster],
        c=colour,
        s=60,
    )

# Fixed camera position: 30 degrees elevation, 185 degrees azimuth.
ax.view_init(30, 185)
plt.show()

In [None]:
import plotly.express as px

# Interactive version of the cluster plot: name the three component columns
# plus the label column, then colour the points by label.
pc_axes = {"x": "pc1", "y": "pc2", "z": "pc3"}
PCA_components.columns = [*pc_axes.values(), "labels"]
fig = px.scatter_3d(PCA_components, color='labels', **pc_axes)
fig.show()

In [None]:
# Per-customer statistics of the "Incomplete Transaction" column, all derived
# from a single groupby on "Customer ID".
incomplete_by_customer = trans.loc[
    :, ["Customer ID", "Incomplete Transaction"]
].groupby("Customer ID")

a = incomplete_by_customer.sum()    # sum of the column per customer
b = incomplete_by_customer.count()  # number of non-null entries per customer
c = a / b                           # ratio of the two, per customer

In [None]:
# Re-attach the identifiers that were set aside before one-hot encoding.
customers["Customer ID"] = ids

# Join the ratio (c), the sum (a) and the count (b) onto the customer frame,
# in that order. All three carry a column named "Incomplete Transaction", so
# pandas disambiguates the clashing names with its default merge suffixes.
cc = customers
for per_customer_stat in (c, a, b):
    cc = cc.merge(per_customer_stat, on="Customer ID")
cc = cc.drop("Household Income", axis=1)

# Z-score the age column (np.std here is the population standard deviation).
age = cc["Age"]
cc["Age"] = (age - np.mean(age)) / np.std(age)
cc

In [None]:
# Second PCA, this time on the merged customer table with its last four
# columns left out.
pca_input = cc.iloc[:, :-4]
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(pca_input)

# Bar chart of the explained-variance ratio of each component.
features = range(pca.n_components_)
variance_share = pca.explained_variance_ratio_
plt.bar(features, variance_share, color='black')
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.xticks(features)

# Component scores as a frame; the bare expression displays it in the notebook.
PCA_components = pd.DataFrame(principalComponents)
PCA_components

In [None]:
import plotly.express as px

# Label every PCA row with its customer's incomplete-transaction ratio.
# `c` is indexed by "Customer ID" while `PCA_components` has a positional
# index, so assigning `c` directly would align on mismatched indexes and
# attach the ratios to the wrong rows (or produce NaN). Instead, look the
# ratio up by the Customer ID of each `cc` row - the PCA rows are in the same
# order as `cc` - and assign the values positionally.
ratio_per_row = c["Incomplete Transaction"].reindex(cc["Customer ID"]).to_numpy()
PCA_components["labels"] = ratio_per_row
PCA_components.columns = ["pc1", "pc2", "pc3", "labels"]

# Colour the points by the ratio so the labels computed above are visible.
fig = px.scatter_3d(PCA_components, x='pc1', y='pc2', z='pc3', color='labels')
fig.show()

# Autoencoders for dimensionality reduction

In [None]:
from keras.layers import Input, Dense
from keras.models import Model

In [None]:
# Width of the autoencoder's bottleneck layer, i.e. the number of encoded
# features produced per input row.
encoding_dim = 3

In [None]:
# The network is fed columns 4 up to (but excluding) the last one of `trans`.
n_inputs = trans.iloc[:, 4:-1].shape[1]
input_dim = Input(shape=(n_inputs,))

# Encoder layers: 10 -> 8 -> 6 -> encoding_dim units, ReLU throughout.
hidden = input_dim
for width in (10, 8, 6):
    hidden = Dense(width, activation='relu')(hidden)
encoded13 = Dense(encoding_dim, activation='relu')(hidden)

# Decoder layers: 6 -> 8 -> 10 ReLU units, then a linear layer that restores
# the input width.
hidden = encoded13
for width in (6, 8, 10):
    hidden = Dense(width, activation='relu')(hidden)
decoded13 = Dense(n_inputs, activation='linear')(hidden)

# Combine encoder and decoder layers into one model trained to reproduce its
# own input under a mean-squared-error loss.
autoencoder = Model(inputs=input_dim, outputs=decoded13)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
# Train the autoencoder to reconstruct its own input: the same slice of
# `trans` serves as features and as target. Rows are not shuffled and 20 % of
# them are held out for validation.
reconstruction_data = trans.iloc[:, 4:-1]
autoencoder.fit(
    reconstruction_data,
    reconstruction_data,
    epochs=1000,
    shuffle=False,
    validation_split=0.2,
)


In [None]:
# Encoder-only model: maps the autoencoder's input tensor (`input_dim`) to the
# output of the bottleneck layer (`encoded13`).
encoder = Model(inputs = input_dim, outputs = encoded13)

In [None]:
# Run the encoder over the same slice of `trans` used for training and name
# the resulting columns feature_0, feature_1, ...
bottleneck_output = encoder.predict(trans.iloc[:, 4:-1])
encoded_train = pd.DataFrame(bottleneck_output).add_prefix('feature_')


In [None]:
# Sanity check: print the (rows, columns) shape of the encoded frame, then
# display its first rows as the cell output.
print(encoded_train.shape)
encoded_train.head()

In [None]:
import plotly.express as px

# Colour every encoded row by the value in the last column of `trans`, then
# plot the three encoded features against each other.
encoded_train["label"] = trans.iloc[:, -1]
latent_axes = {"x": "feature_0", "y": "feature_1", "z": "feature_2"}
fig = px.scatter_3d(encoded_train, color="label", **latent_axes)
fig.show()