# Imports and helper functions

In [42]:
import sqlalchemy
import sqlite3
from scipy.cluster import hierarchy
import umap
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from datetime import datetime as dt
import snakecase
import re
import numpy as np
import dtype_diet
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import joblib
import s3fs
from dotenv import load_dotenv
import os

# Loading data

In [None]:
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
s3 = s3fs.S3FileSystem(
    anon=False, key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)

In [None]:
load_dotenv()
 
host = os.getenv('HOST')
user = os.getenv('USER')
password = os.getenv('PASSWORD')

endpoint = 'postgresql://{}:{}@{}'.format(user, password, host)

conn = sqlalchemy.create_engine(endpoint)

In [191]:
conn = sqlalchemy.create_engine(endpoint)

In [193]:
pd.read_sql_table('customers', con=conn.connect())

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.00,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.00,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.00,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.00,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.00,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,7-Dec-17,0.85,12680.00,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,7-Dec-17,2.10,12680.00,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,7-Dec-17,4.15,12680.00,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,7-Dec-17,4.15,12680.00,France


In [194]:
with s3.open('s3://insiders-customers-dataset/data.csv', 'rb') as file:
    df = pd.read_csv(file)

# Data Description

# Data Wrangling

In [177]:
for i in df.columns:
    df = df.rename(columns={i: snakecase.convert(i)}
    )

## Categorical attributes analysis

### invoice_no

In [8]:
df['invoice_no'].unique()
df = df[~df['invoice_no']
    .astype(str)
    .apply(lambda x: bool(re.search('[^0-9]+', x)))
]

### stock_code

In [9]:
np.array(df[df['stock_code']
                .astype(str)
                .apply(lambda x: bool(re.search('[A-Z]{3,}', x)))]['stock_code']
                .unique()
)
df = df[~df['stock_code'].isin(
    ['DOT', 'BANK CHARGES', 'AMAZONFEE', 'PADS', 'POST', 'M', 'D', 'm']
    )
]

### invoice_date

In [10]:
df['invoice_date'] = df['invoice_date'].apply(
    lambda x: dt.strptime(x, '%d-%b-%y')
)

## Quantitative analysis

In [11]:
df = df[df['quantity'] > 0]
df = df[df['unit_price'] != 0]

## Checking/Replace NAs

In [13]:
df_aux = pd.DataFrame(
    df[df['customer_id'].isna()]['invoice_no']
    .drop_duplicates()
    )
df_aux = df_aux.assign(customer_id=np.arange(20000, 20000 + len(df_aux), 1))   


df = pd.merge(df,df_aux, how='left', on='invoice_no')
df['customer_id'] = (df['customer_id_x']
                            .combine_first(df['customer_id_y'])
                            .astype(int)
)
df = df.drop(columns=['customer_id_x', 'customer_id_y'], axis=1)

# Feature Engineering

In [14]:
df1 = df.drop(columns=['description', 'country'])
del(df)

### Gross Revenue

In [15]:
df_purchases = df1[['customer_id','invoice_no']].drop_duplicates()
df1_aux = (df1.loc[:,['invoice_no','quantity','unit_price']]
            .assign(gross_revenue=df1['quantity']*df1['unit_price'])
)                   
df1_aux = (df1_aux[['invoice_no','gross_revenue']]
            .groupby('invoice_no')
            .sum()
            .reset_index()
)
df_purchases = pd.merge(df_purchases,df1_aux, how='left', on='invoice_no')
df1_1 = (df_purchases[['customer_id','gross_revenue']]
        .groupby('customer_id')
        .sum()
        .reset_index()
)

###  Recency

In [16]:
df1_aux = (df1[['customer_id', 'invoice_date']]
           .groupby('customer_id')
           .max()
           .reset_index()
)
df1_aux['recency_days'] = (df1['invoice_date'].max()
                                                - df1_aux['invoice_date']).dt.days 

df1_1 = pd.merge(
    df1_1, df1_aux[['customer_id', 'recency_days']],
    on='customer_id', how='left')

### Quantity of purchases

In [17]:
df1_aux = (df1[['customer_id','invoice_no']]
            .groupby('customer_id')
            .nunique()
            .reset_index()
            .rename(columns={'invoice_no': 'qtd_purchases'})
)
df1_1 = pd.merge(df1_1, df1_aux, on='customer_id', how='left')

### Quantity of products

In [18]:
df1_aux = (df1[['customer_id', 'stock_code']]
            .groupby('customer_id')
            .nunique()
            .reset_index()
            .rename(columns={'stock_code':'qtd_products'})
)
df1_1 = pd.merge(df1_1, df1_aux, on='customer_id', how='left')

# Data Preparation

In [19]:
df2 = df1_1.copy()
df2 = df2.drop(columns='customer_id')

In [104]:
rs = RobustScaler()
df2['gross_revenue'] = rs.fit_transform(
    df2[['gross_revenue']].values)
pickle.dump(rs,s3.open(
    's3://insiders-customers-dataset/gross_revenue_scaler.pkl', 'wb')
)

df2['recency_days'] = rs.fit_transform(
    df2[['recency_days']].values)
pickle.dump(rs,s3.open(
    's3://insiders-customers-dataset/recency_days_scaler.pkl', 'wb')
)


df2['qtd_purchases'] = rs.fit_transform(
    df2[['qtd_purchases']].values)
pickle.dump(rs,s3.open(
    's3://insiders-customers-dataset/recency_days_scaler.pkl', 'wb')
)


df2['qtd_products'] = rs.fit_transform(
    df2[['qtd_products']].values)
pickle.dump(rs,open('qtd_products.pkl', 'wb'))




# df2['gross_revenue'] = pickle.load(
#     open('gross_revenue_scaler.pkl','rb').transform(
#     df2[['qtd_products']].values)

# df2['recency_days'] = pickle.load(
#     open('recency_days_scaler.pkl','rb').transform(
#     df2[['qtd_products']].values)

# df2['qtd_purchases'] = pickle.load(
#     open('qtd_purchases.pkl','rb').transform(
#     df2[['qtd_products']].values)

# df2['qtd_products'] = pickle.load(
#     open('qtd_products.pkl','rb').transform(
#     df2[['qtd_products']].values)

In [92]:
pipeline = Pipeline(
    steps = [
        ('preprocessor', RobustScaler()),
        ('umap_reducer', umap.UMAP(random_state=42))
    ]
)
embedding_umap = pipeline.fit(df2)
joblib.dump(embedding_umap,'embedding_umap.pkl')

df_umap = pd.DataFrame()
df_umap['embedding_x'] = embedding_umap[:, 0]
df_umap['embedding_y'] = embedding_umap[:, 1]
df_umap

TypeError: unhashable type: 'slice'

# Embedding space analysis

In [67]:
df3 = df2.copy()

## UMAP

In [68]:
reducer = umap.UMAP(random_state=42)
embedding_umap = reducer.fit_transform(df3)
pickle.dump(reducer, open('umap_embedding_space.pkl','wb'))
df_umap = pd.DataFrame()
df_umap['embedding_x'] = embedding_umap[:, 0]
df_umap['embedding_y'] = embedding_umap[:, 1]
df_umap.to_csv('umap_embedding_space.csv', index=False)
df3 = df_umap

# Model Training

## Final model

In [94]:
df4 = df_umap

In [112]:
hc_modelteste = hierarchy.linkage(dfteste, 'ward')
hierarchy.fcluster(hc_modelteste, 6, criterion='maxclust')

array([1, 1, 2, 2], dtype=int32)

In [152]:
hc_model = hierarchy.linkage(df4, 'ward')
# Model predict
labels = hierarchy.fcluster(hc_model, 6, criterion='maxclust')

# Cluster analysis

## Cluster profile

In [96]:
#removing fake customers
df5 = df1_1.copy()
df5['cluster'] = labels
df5 = df5[df5['customer_id'] < 20000]

In [99]:
df_cluster = (df5[['customer_id', 'cluster']]
                .groupby('cluster')
                .count()
                .reset_index()
)
df_cluster = df_cluster.assign(
    perc_customer=100*(df_cluster['customer_id'] / 
                                            df_cluster['customer_id'].sum())
)
df_avg_gross_revenue = (df5[['cluster', 'gross_revenue']]
                                    .groupby('cluster')
                                    .median()
                                    .reset_index()
)
df_cluster = pd.merge(
    df_cluster, df_avg_gross_revenue, how='inner', on='cluster')

# Avg recency
df_recency = df5[['cluster', 'recency_days']].groupby(
    'cluster').median().reset_index()
df_cluster = pd.merge(
    df_cluster, df_recency, how='inner', on='cluster')

# Avg quantity of purchases
df_recency = (df5[['cluster', 'qtd_purchases']]
                .groupby('cluster')
                .median()
                .reset_index()
)
df_cluster = pd.merge(
    df_cluster, df_recency, how='inner', on='cluster')

# Avg quantity of products
df_qtd_products = (df5[['cluster', 'qtd_products']]
                            .groupby('cluster')
                            .median()
                            .reset_index()
)
df_cluster = pd.merge(
    df_cluster, df_qtd_products, how='inner', on='cluster')

df_cluster_result = df_cluster.sort_values(
    by='gross_revenue', ascending=False)
display(df_cluster_result)

Unnamed: 0,cluster,customer_id,perc_customer,gross_revenue,recency_days,qtd_purchases,qtd_products
3,4,1007,23.23,3122.04,15.0,8.0,117.0
5,6,1036,23.9,1001.66,36.0,4.0,50.0
4,5,826,19.06,538.01,63.0,2.0,29.0
0,1,657,15.16,290.66,46.0,1.0,19.0
2,3,700,16.15,229.09,239.0,1.0,14.0
1,2,108,2.49,212.87,366.0,1.0,12.5


Cluster 4 (Candidate of insiders)

- Number of customers: 1007 (23.23%)
- median of gross_revenue: £2671.46
- median of Recency: 24 days
- median of quantity of purchases in one year: 6 purchases
- median of quantity of distinct products bought: 150 products

Cluster 6 (Cluster more products)

- Number of customers: 1036 (23.90%)
- median of gross_revenue: £547.06
- median of Recency: 40 days
- median of quantity of purchases in one year: 3 purchases
- median of quantity of distinct products bought: 53 products


Cluster 5 (Cluster even more products)

- Number of customers: 826 (19.06%)
- median of gross_revenue: £1059.97
- median of Recency: 64 days
- median of quantity of purchases in one year: 2 purchases
- median of quantity of distinct products bought: 29 products


Cluster 1 (Cluster more purchases )

- Number of customers: 657 (15.16%)
- Average gross_revenue: £232.14
- Average Recency: 45 days
- Average of quantity of purchases in one year: 1 purchase
- Average of quantity of distinct products bought: 15 products


Cluster 3 (Cluster decrease recency days)
- Number of customers: 700 (2.49%)
- median of gross_revenue: £196.43
- median of Recency: 232 days
- median of quantity of purchases in one year: 1 purchase
- median of quantity of distinct products bought: 14 products


Cluster 2 (Cluster decrease even more recency days)

- Number of customers: 108 (22.%)
- median gross_revenue: £105.72
- median Recency: 364 days
- median of quantity of purchases in one year: 1 purchase
- median of quantity of distinct products bought: 9 products

# Deploy to production

## Insert into SQLITE

In [153]:
load_dotenv()
 
host = os.getenv('HOST')
user = os.getenv('USER')
password = os.getenv('PASSWORD')

endpoint = 'postgresql://{}:{}@{}'.format(user, password, host)

conn = sqlalchemy.create_engine(endpoint)
# conn.connect()
# conn.execute(sqlalchemy.text(query_create_table))
# conn.commit()
# conn.close()


df5.to_sql(
    'insiders', con=conn, if_exists='append', index=False)

334