--- 
# Part 2: Cluster Analysis

--- 

## Importing Libraries

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans, OPTICS
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics
#from sklearn.metrics import silhouette_score

#Package used to analyze users
from lifetimes.utils import summary_data_from_transaction_data

# Set a random seed.
np.random.seed(42)

pd.set_option('display.max_rows', None)
sns.set_palette('Greens_r')

## Reimport cleaned & merged data

In [38]:
offers = pd.read_csv('./data/final_offers_clean.csv')
transactions = pd.read_csv('./data/final_transactions_clean.csv')

In [39]:
offers.head()

Unnamed: 0,customer_id,time_hours,year,membership_days,age,income,reward,difficulty,duration,email,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,78afa995795e4d85b5d9ceeca43f5fef,0,2017,1589,75,100000,5,5,7,1,...,0,0,1,0,0,0,0,0,0,0
1,78afa995795e4d85b5d9ceeca43f5fef,6,2017,1589,75,100000,5,5,7,1,...,0,0,1,0,0,0,0,0,0,0
2,e2127556f4f64592b11af22de27a7932,408,2018,1237,68,70000,5,5,7,1,...,0,1,0,0,0,0,0,0,0,0
3,e2127556f4f64592b11af22de27a7932,420,2018,1237,68,70000,5,5,7,1,...,0,1,0,0,0,0,0,0,0,0
4,68617ca6246f4fbc85e91a2a49552598,504,2017,1443,96,37000,5,5,7,1,...,0,0,0,0,0,0,0,1,0,0


In [40]:
transactions.head()

Unnamed: 0,customer_id,amount,time_hours,year,membership_days,age,income,day_Friday,day_Monday,day_Saturday,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,02c083884c7d45b39cc68e1314fec56c,0.83,0,2016,1891,20,30000,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,02c083884c7d45b39cc68e1314fec56c,1.44,6,2016,1891,20,30000,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,02c083884c7d45b39cc68e1314fec56c,4.56,12,2016,1891,20,30000,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,02c083884c7d45b39cc68e1314fec56c,1.53,84,2016,1891,20,30000,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,02c083884c7d45b39cc68e1314fec56c,0.5,90,2016,1891,20,30000,0,1,0,...,0,0,0,0,1,0,0,0,0,0


## RMF Metrics to use for Clustering

In [41]:
# # add receipt, view and completion time columns
# offers['time'] = offers['time'] + 0.5
# offers['received_time'] = offers['offer_received'] * offers['time']
# offers['viewed_time'] = offers['offer_viewed'] * offers['time']
# offers['completed_time'] = offers['offer_completed'] * offers['time']
# offers.head()

In [42]:
# time to Datetime
transactions['datetime'] = transactions['time_hours'].apply(lambda x: pd.Timestamp('2000-01-01T12') + pd.Timedelta(hours=x))
transactions.head()

rf = summary_data_from_transaction_data(transactions, 'customer_id', 'datetime', monetary_value_col='amount')
rf.drop('T', axis=1, inplace=True)

offers.set_index('customer_id', inplace=True)

customers = offers.join(rf)
customers.reset_index(inplace=True)
customers.head()


Unnamed: 0,customer_id,time_hours,year,membership_days,age,income,reward,difficulty,duration,email,...,month_6,month_7,month_8,month_9,month_10,month_11,month_12,frequency,recency,monetary_value
0,0009655768c64bdeb2e877511632db8f,168,2017,1607,33,72000,0,0,3,1,...,0,0,0,0,0,0,0,6.0,19.0,17.573333
1,0009655768c64bdeb2e877511632db8f,192,2017,1607,33,72000,0,0,3,1,...,0,0,0,0,0,0,0,6.0,19.0,17.573333
2,0009655768c64bdeb2e877511632db8f,408,2017,1607,33,72000,5,5,5,1,...,0,0,0,0,0,0,0,6.0,19.0,17.573333
3,0009655768c64bdeb2e877511632db8f,456,2017,1607,33,72000,5,5,5,1,...,0,0,0,0,0,0,0,6.0,19.0,17.573333
4,0009655768c64bdeb2e877511632db8f,336,2017,1607,33,72000,0,0,4,1,...,0,0,0,0,0,0,0,6.0,19.0,17.573333


## Conversion Rates from Offer to Transaction

## Feature Scaling

In [53]:
customers = customers.dropna()

In [54]:
X = customers.drop(columns =['customer_id'])
y = customers['customer_id']

In [55]:
# Train/test split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.33,
                                                    random_state = 42)

In [56]:
# Instantiate our StandardScaler.
ss = StandardScaler()

# Standardize X_train.
X_train = ss.fit_transform(X_train)

# Standardize X_test.
X_test = ss.transform(X_test)

## PCA - Dimensionality Reduction

In [57]:
# Instantiate PCA
pca =PCA(random_state = 42)

In [58]:
# Fit PCA on the training data
pca.fit(X_train)

PCA(random_state=42)

In [70]:
# Transform PCA on the training data
Z_train = pca.transform(X_train)

In [71]:
# Transform the test data
Z_test = pca.transform(X_test)

In [72]:
# Pull the explained variance attribute.
var_exp = pca.explained_variance_ratio_
print(f'Explained variance (first 20 components): {np.round(var_exp[:20],3)}')

print('')

# Generate the cumulative explained variance.
cum_var_exp = np.cumsum(var_exp)
print(f'Cumulative explained variance (first 20 components): {np.round(cum_var_exp[:20],3)}')

Explained variance (first 20 components): [0.09  0.072 0.05  0.046 0.041 0.039 0.035 0.028 0.028 0.027 0.025 0.023
 0.023 0.023 0.023 0.023 0.022 0.022 0.022 0.022]

Cumulative explained variance (first 20 components): [0.09  0.162 0.212 0.258 0.299 0.339 0.374 0.402 0.43  0.457 0.481 0.505
 0.528 0.55  0.573 0.596 0.618 0.641 0.663 0.684]


## DBSCAN

In [84]:
model = DBSCAN(eps = 1.5, min_samples = 100)
dbscan_clusters = model.fit(Z_train)

print("Number of Clusters:", len(np.unique(dbscan_clusters)))

Number of Clusters: 1


In [85]:
silhouette_score(Z_train, dbscan.labels_)

ValueError: Found input variables with inconsistent numbers of samples: [87521, 43108]

In [83]:
df = Z_train.copy()
df['cluster'] = dbscan_clusters
sns.pairplot(df, hue='cluster');

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## KMeans ?

## Model - supervised

use drop first on dummified columns

## Transfer Learning Model

## Building Personas of Each Customer Segment