In [16]:
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import pandas.io.sql as sqlio
import os
from dotenv import load_dotenv
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, TensorDataset
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cluster import KMeans, Birch
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from hdbscan import HDBSCAN
import statsmodels.api as sm 
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle as pkl
import json
import random

In [17]:
import sys
sys.path.insert(0, '../src/torch_helpers')

from autoencoder import AutoEncoder, AETrainer
from encoding_classifier import EncodingClassifier, ClfTrainer
from lrp import LinearLRP

In [18]:
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x72c93ec8ef70>

# EDA

In [19]:
# Curry function to initialize postgres engine and return read_query function

def make_read_query(protocol=None, user=None, password=None, host=None, port=None, db=None):
    load_dotenv()
    
    protocol = protocol if protocol else 'postgresql+psycopg2'
    user = user if user else os.environ.get('POSTGRES_USER')
    password = password if password else os.environ.get('POSTGRES_PASSWORD')
    host = host if host else 'localhost'
    port = port if port else 5432
    db = db if db else os.environ.get('POSTGRES_DB')
    
    db_url = f'{protocol}://{user}:{password}@{host}:{port}/{db}'
    engine = create_engine(db_url)

    def read_query(query, verbose=True):
        if verbose:
            print(query, '\n')
            
        with engine.connect() as conn:
            df = sqlio.read_sql_query(query, conn)

        return df

    return read_query

In [20]:
read_query = make_read_query()

In [21]:
users_enriched = read_query("SELECT * FROM users_enriched")
users_enriched.head()

SELECT * FROM users_enriched 



Unnamed: 0,id,age,gender,country,city,traffic_source,created_at,first_order_timestamp,last_order_timestamp,days_to_activation,...,fav_category,fav_cat_purchases,fav_cat_avg_item_value,fav_cat_freq_strength,fav_cat_spending_strength,order_items,loyalty_segment,order_value_segment,category_choice_segment,lifetime_status
0,56219,60,F,United States,Philadelphia,Facebook,2019-01-02 00:06:00,2021-05-27 23:07:12,2021-05-27 23:07:12,876.0,...,Jeans,1.0,74.99,0.0,0.0,"{""(70373,\""WranglerÂ® Booty Up Stretch Jean fo...",,,,
1,9418,69,F,United Kingdom,Cardiff,Search,2019-01-02 00:36:00,2021-07-30 12:35:12,2023-04-21 14:50:59,940.0,...,Outerwear & Coats,1.0,399.0,0.5,0.900677,"{""(90353,\""Colleen Skirt Slip\"",Women,Intimate...",,,,
2,21731,65,F,China,Harbin,Email,2019-01-02 01:47:00,2023-09-17 22:23:37,2023-09-17 22:23:37,1719.0,...,Blazers & Jackets,1.0,169.98,0.25,0.696811,"{""(27173,\""Modadorn New Arrivals Spring & fall...",,,,
3,39594,17,M,China,Shenzhen,Search,2019-01-02 03:00:00,2019-02-11 03:06:43,2019-10-25 02:29:03,40.0,...,Fashion Hoodies & Sweatshirts,1.0,51.96,0.5,0.722168,"{""(49470,\""Billabong Men's Semi-Button Up Stri...",,,,
4,2000,30,F,China,Hefei,Search,2019-01-02 03:19:00,NaT,NaT,,...,,,,,,,,,,


In [22]:
order_items_enriched = read_query("SELECT * FROM order_items_enriched")
order_items_enriched.head()

SELECT * FROM order_items_enriched 



Unnamed: 0,id,order_id,user_id,product_id,inventory_item_id,status,created_at,shipped_at,delivered_at,returned_at,...,product_category,product_department,product_brand,product_name,distribution_center,user_age,user_gender,user_county,user_city,user_traffic_source
0,1,1,1,2953,3,Shipped,2022-07-20 10:32:05,2022-07-20 16:42:00,NaT,NaT,...,Active,Women,Tommy Hilfiger,Tommy Hilfiger Women's 3 Pack Sports Crew,Memphis TN,62,F,South Korea,Bucheon City,Search
1,2,1,1,4731,7,Shipped,2022-07-20 11:05:38,2022-07-20 16:42:00,NaT,NaT,...,Jeans,Women,Joe's Jeans,Joe's Jeans Women's Skinny Jeans,Port Authority of New York/New Jersey NY/NJ,62,F,South Korea,Bucheon City,Search
2,3,1,1,7656,9,Shipped,2022-07-19 11:29:28,2022-07-20 16:42:00,NaT,NaT,...,Blazers & Jackets,Women,eVogues Apparel,Plus size Button Closure Cropped Bolero Jacket...,New Orleans LA,62,F,South Korea,Bucheon City,Search
3,4,2,2,25774,11,Complete,2022-02-20 10:28:57,2022-02-22 12:23:00,2022-02-23 18:29:00,NaT,...,Underwear,Men,Tommy Bahama,Tommy Bahama Tropical Print Boxer Short,Los Angeles CA,65,M,Brasil,Franca,Organic
4,5,3,3,22308,13,Processing,2023-04-23 09:04:57,NaT,NaT,NaT,...,Pants,Men,Dockers,Dockers Men's Limited Offer D2 Stretch Khaki Pant,Philadelphia PA,16,M,United States,Hallandale Beach,Organic


In [24]:
order_items_enriched.product_category.unique()

array(['Active', 'Jeans', 'Blazers & Jackets', 'Underwear', 'Pants',
       'Sleep & Lounge', 'Swim', 'Sweaters', 'Skirts', 'Shorts',
       'Accessories', 'Fashion Hoodies & Sweatshirts',
       'Suits & Sport Coats', 'Tops & Tees', 'Socks', 'Outerwear & Coats',
       'Dresses', 'Jumpsuits & Rompers', 'Maternity', 'Intimates',
       'Pants & Capris', 'Leggings', 'Plus', 'Socks & Hosiery', 'Suits',
       'Clothing Sets'], dtype=object)

In [25]:
order_items_enriched.status.value_counts()

status
Shipped       54440
Complete      45609
Processing    36388
Cancelled     27090
Returned      18232
Name: count, dtype: int64

In [48]:
repeat_purchasers = users_enriched.loc[users_enriched.n_orders>1]
repeat_purchasers_idx = list(set(order_items_enriched.user_id).intersection(repeat_purchasers.id))

In [118]:
order_items_filtered = order_items_enriched.loc[order_items_enriched.user_id.isin(repeat_purchasers_idx)]\
                                          .loc[order_items_enriched.user_gender=='F']\
                                          .loc[~order_items_enriched.status.isin(['Cancelled', 'Returned'])]

user_sequences = order_items_filtered.sort_values('created_at')\
                                    .groupby('user_id')['product_category'].apply(list)\
                                    .tolist()

In [119]:
user_sequences

[['Sleep & Lounge', 'Jeans', 'Jumpsuits & Rompers', 'Swim', 'Jeans'],
 ['Intimates', 'Maternity'],
 ['Tops & Tees', 'Leggings'],
 ['Socks & Hosiery'],
 ['Intimates',
  'Fashion Hoodies & Sweatshirts',
  'Fashion Hoodies & Sweatshirts',
  'Dresses',
  'Swim',
  'Sweaters',
  'Shorts',
  'Intimates',
  'Intimates'],
 ['Tops & Tees', 'Accessories'],
 ['Plus',
  'Sleep & Lounge',
  'Intimates',
  'Fashion Hoodies & Sweatshirts',
  'Jeans',
  'Socks & Hosiery',
  'Socks & Hosiery'],
 ['Dresses', 'Suits', 'Accessories', 'Leggings', 'Shorts'],
 ['Shorts', 'Suits', 'Pants & Capris'],
 ['Pants & Capris', 'Intimates'],
 ['Sleep & Lounge', 'Outerwear & Coats', 'Active'],
 ['Plus',
  'Socks & Hosiery',
  'Sweaters',
  'Intimates',
  'Leggings',
  'Intimates',
  'Plus'],
 ['Shorts', 'Intimates', 'Pants & Capris'],
 ['Jeans', 'Tops & Tees'],
 ['Shorts', 'Leggings', 'Intimates'],
 ['Intimates', 'Plus', 'Intimates'],
 ['Accessories', 'Blazers & Jackets'],
 ['Socks & Hosiery', 'Sleep & Lounge'],
 ['Plu

In [120]:
len(user_sequences)

10435

In [121]:
from prefixspan import PrefixSpan

ps = PrefixSpan(user_sequences)

In [122]:
ps.topk(20, closed=True, filter=lambda patt, matches: len(patt)>2)

[(121, ['Intimates', 'Intimates', 'Intimates']),
 (58, ['Intimates', 'Intimates', 'Jeans']),
 (58, ['Intimates', 'Maternity', 'Intimates']),
 (54, ['Intimates', 'Intimates', 'Shorts']),
 (54, ['Intimates', 'Intimates', 'Swim']),
 (53, ['Intimates', 'Sleep & Lounge', 'Intimates']),
 (52, ['Intimates', 'Dresses', 'Intimates']),
 (51, ['Intimates', 'Fashion Hoodies & Sweatshirts', 'Intimates']),
 (51, ['Intimates', 'Tops & Tees', 'Intimates']),
 (50, ['Intimates', 'Intimates', 'Sweaters']),
 (48, ['Intimates', 'Intimates', 'Active']),
 (47, ['Intimates', 'Intimates', 'Dresses']),
 (46, ['Accessories', 'Intimates', 'Intimates']),
 (46, ['Tops & Tees', 'Intimates', 'Intimates']),
 (45, ['Dresses', 'Intimates', 'Intimates']),
 (45, ['Intimates', 'Intimates', 'Fashion Hoodies & Sweatshirts']),
 (45, ['Intimates', 'Intimates', 'Tops & Tees']),
 (45, ['Jeans', 'Intimates', 'Intimates']),
 (44, ['Sleep & Lounge', 'Intimates', 'Intimates']),
 (43, ['Intimates', 'Accessories', 'Intimates'])]

In [84]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [107]:
order_items_filtered = order_items_enriched.loc[order_items_enriched.user_id.isin(repeat_purchasers_idx)]\
                                          .loc[~order_items_enriched.status.isin(['Cancelled', 'Returned'])]

user_sets = order_items_filtered.groupby('user_id')['product_category'].apply(lambda x: list(set(x))).tolist()

In [111]:
te = TransactionEncoder()
te_arr = te.fit(user_sets).transform(user_sets)
apriori_df = pd.DataFrame(te_arr, columns=te.columns_)
apriori_results = apriori(apriori_df, min_support=0.025, use_colnames=True)

In [112]:
apriori_results

Unnamed: 0,support,itemsets
0,0.151711,(Accessories)
1,0.141075,(Active)
2,0.053085,(Blazers & Jackets)
3,0.087072,(Dresses)
4,0.184104,(Fashion Hoodies & Sweatshirts)
5,0.1886,(Intimates)
6,0.193725,(Jeans)
7,0.050957,(Leggings)
8,0.084558,(Maternity)
9,0.138368,(Outerwear & Coats)
