In [56]:
import importlib
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# clustering algorithms
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

import module

# Re-execute `module` so edits made to its source since the kernel first imported
# it are picked up without restarting the kernel.
try:
    importlib.reload(module)
except NameError as exc:
    # Stay best-effort (the notebook keeps running), but report the problem:
    # a NameError here may come from the reloaded module's own code, and
    # discarding it silently would leave a stale `module` in use unnoticed.
    warnings.warn(f"Skipped reloading `module`: {exc}", RuntimeWarning)

In [62]:
def catchstate(df, var_name: str) -> 'pd.DataFrame':
    """
    Snapshot a DataFrame from the middle of a method chain.

    Meant to be used as ``.pipe(catchstate, var_name="some_name")``: the frame
    passing through is bound to ``some_name`` in this module's global namespace
    (overwriting any existing binding) and then handed back untouched, so the
    chain continues as if the call were not there.

    Parameters
    ----------
    df : object flowing through the chain (a DataFrame in this notebook).
    var_name : name of the global variable that will reference ``df``.

    Returns
    -------
    The very same ``df`` object that was passed in (no copy is made).
    """
    snapshot = {var_name: df}
    globals().update(snapshot)
    return df

# Load Dataset

In [29]:
# Relative path to the parquet input file (resolved against the current working directory).
path = Path("../input/dataset-clean.parquet")
# Read the parquet file into `df`, the transaction-level frame the following cells work from.
df = pd.read_parquet(path)

# Feature Engineering

In [30]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,invoice_no,invoice_date,description,stock_code,unit_price,quantity,total_price,customer_id,country
0,536365,2010-12-01 08:26:00,White Hanging Heart T-Light Holder,85123A,2.55,6,15.3,17850.0,United Kingdom
1,536365,2010-12-01 08:26:00,White Metal Lantern,71053,3.39,6,20.34,17850.0,United Kingdom
2,536365,2010-12-01 08:26:00,Cream Cupid Hearts Coat Hanger,84406B,2.75,8,22.0,17850.0,United Kingdom
3,536365,2010-12-01 08:26:00,Knitted Union Flag Hot Water Bottle,84029G,3.39,6,20.34,17850.0,United Kingdom
4,536365,2010-12-01 08:26:00,Red Woolly Hottie White Heart.,84029E,3.39,6,20.34,17850.0,United Kingdom


In [63]:
def prepare_for_segmentation(df):
    """Aggregate transaction rows into one feature row per customer.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction-level data. The columns read are ``customer_id``,
        ``invoice_no``, ``quantity``, ``stock_code``, ``total_price``,
        ``unit_price``, ``country`` and ``invoice_date`` (the latter must be a
        datetime column, since day differences are taken from it).

    Returns
    -------
    pd.DataFrame
        One row per ``customer_id`` with, in order:

        - ``customer_id``
        - ``n_purchase``: number of distinct ``invoice_no`` values
        - ``n_item_purchased``: sum of ``quantity``
        - ``n_unique_item_purchased``: number of distinct ``stock_code`` values
        - ``total_revenue``: sum of ``total_price``
        - ``avg_revenue_per_item``: mean of ``unit_price`` over the customer's rows
        - ``recency``: whole days between the latest ``invoice_date`` in ``df``
          and the customer's own latest ``invoice_date``
        - ``country_mapped``: integer code for the customer's country, obtained
          as the dense rank of the lower-cased country name in descending order
        - ``avg_revenue_per_purchase``: ``total_revenue / n_purchase``

    Notes
    -----
    Side effect: just before the textual ``country`` column is dropped, the
    frame (still containing ``country`` next to ``country_mapped``) is stored
    in the global variable ``df_intermediate`` through ``catchstate``.
    """
    # Reference date for recency. It is identical for every customer, so it is
    # computed once here rather than once per group inside the aggregation.
    snapshot_date = df["invoice_date"].max()

    return (df
        .groupby("customer_id")
        .agg(
            n_purchase = ("invoice_no", "nunique"),
            n_item_purchased = ("quantity", "sum"),
            n_unique_item_purchased = ("stock_code", "nunique"),
            total_revenue = ("total_price", "sum"),
            avg_revenue_per_item = ("unit_price", "mean"),
            # First country value seen for the customer, normalised to lower case.
            country = ("country", lambda x: x.unique()[0].lower()),
            # Temporary column: only needed to derive `recency` below.
            last_purchase = ("invoice_date", "max"),
        )
        .assign(
            recency = lambda df_: (snapshot_date - df_["last_purchase"]).dt.days,
            country_mapped = lambda df_: df_["country"].rank(method='dense', ascending=False).astype(int),
            avg_revenue_per_purchase = lambda df_: df_["total_revenue"].div(df_["n_purchase"]),
        )
        # Dropped before the snapshot so `df_intermediate` keeps its original columns.
        .drop(columns="last_purchase")
        .reset_index()
        .pipe(catchstate, var_name="df_intermediate")
        .drop(columns="country")
    )

# Build the per-customer feature table. Note: this call also (re)binds the global
# `df_intermediate`, because prepare_for_segmentation pipes through catchstate.
df_clean = prepare_for_segmentation(df)

In [70]:
# Lookup table pairing each country name with its integer code, one row per
# distinct pair, ordered by the code. Reads `df_intermediate`, the snapshot
# stored as a side effect of running prepare_for_segmentation.
country_map = (
    df_intermediate[["country", "country_mapped"]]
    .drop_duplicates()
    .sort_values(by="country_mapped", ignore_index=True)
)

# Test for Clustering Tendency

In [None]:
# Engineered per-customer features (as produced by prepare_for_segmentation)
# that are screened for outliers.
# NOTE(review): the previous list ('age', 'income', 'total_spent',
# 'total_purchase') named columns that exist neither in the loaded data nor in
# the engineered table; confirm this selection matches the intended analysis.
columns = [
    "n_purchase",
    "n_item_purchased",
    "n_unique_item_purchased",
    "total_revenue",
    "avg_revenue_per_item",
    "avg_revenue_per_purchase",
    "recency",
]
scaler = StandardScaler()

# Start from the per-customer table `df_clean` rather than the raw
# transaction-level `df`, so each row of X describes one customer.
X = pd.DataFrame(
    df_clean
    .drop(columns="customer_id")  # identifier, not a feature to scale or cluster on
    # presumably removes rows that are z-score outliers in `columns` (verify in `module`)
    .pipe(module.drop_outliers, columns, 'z_score')
    # kept from the original pipeline; only affects non-numeric columns, if any remain
    .pipe(pd.get_dummies)
    # standardise every feature; returns a NumPy array, hence the DataFrame wrapper
    .pipe(scaler.fit_transform)
)
# presumably computes the Hopkins statistic for X (verify in `module`)
module.hopkins_test(X)