In [11]:
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import pandas.io.sql as sqlio
import os
from dotenv import load_dotenv
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle as pkl
import json
import ast

In [2]:
RANDOM_STATE = 42

In [3]:
# Curry function to initialize postgres engine and return read_query function

def make_read_query(protocol=None, user=None, password=None, host=None, port=None, db=None):
    load_dotenv()
    
    protocol = protocol if protocol else 'postgresql+psycopg2'
    user = user if user else os.environ.get('POSTGRES_USER')
    password = password if password else os.environ.get('POSTGRES_PASSWORD')
    host = host if host else 'localhost'
    port = port if port else 5432
    db = db if db else os.environ.get('POSTGRES_DB')
    
    db_url = f'{protocol}://{user}:{password}@{host}:{port}/{db}'
    engine = create_engine(db_url)

    def read_query(query, verbose=True):
        if verbose:
            print(query, '\n')
            
        with engine.connect() as conn:
            df = sqlio.read_sql_query(query, conn)

        return df

    return read_query

In [4]:
read_query = make_read_query()

In [5]:
users_enriched = read_query("SELECT * FROM users_enriched")
users_enriched.head()

SELECT * FROM users_enriched 



Unnamed: 0,id,age,gender,country,city,traffic_source,created_at,first_order_timestamp,last_order_timestamp,days_to_activation,...,avg_days_to_order,std_days_to_order,n_orders,avg_order_items,avg_item_value,avg_order_value,order_items,segment,predicted_segment,lifetime_status
0,63563,39,F,South Korea,Busan,Search,2019-01-20 10:29:00,2023-05-25 09:30:03,2023-05-25 09:30:03,1585.0,...,,0.0,1,2.0,34.25,68.5,"{""(79668,\""Roxy Juniors Ruins Short\"",Women,Sh...",One-Off Purchasers,,Active
1,37779,22,F,Germany,Kaiserslautern,Search,2019-01-20 11:02:00,2019-11-09 10:26:11,2023-11-02 07:51:08,292.0,...,1453.0,0.0,2,2.0,30.786667,46.18,"{""(47225,\""Echo Design Women's Touch Ruched Gl...",Customers with Long Time-To-Order,,Active
2,50505,17,F,Brasil,Sertânia,Search,2019-01-20 11:07:00,2022-05-15 07:30:24,2022-05-15 07:30:24,1210.0,...,,0.0,1,1.0,12.99,12.99,"{""(63169,\""White Cotton Gown/House Dress Sizes...",One-Off Purchasers,,Active
3,72695,25,M,Brasil,Barreirinhas,Search,2019-01-20 12:23:00,2021-06-11 10:00:32,2021-06-11 10:00:32,872.0,...,,0.0,1,1.0,25.0,25.0,"{""(91149,\""Big Star Men's Division Slim Fit Tw...",One-Off Purchasers,,Churned
4,22493,24,F,China,City of Yantai,Search,2019-01-20 12:30:00,NaT,NaT,,...,,,0,,,,,Never Ordered,,Inactive


In [6]:
users_enriched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84011 entries, 0 to 84010
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     84011 non-null  int64         
 1   age                    84011 non-null  int64         
 2   gender                 84011 non-null  object        
 3   country                84011 non-null  object        
 4   city                   84011 non-null  object        
 5   traffic_source         84011 non-null  object        
 6   created_at             84011 non-null  datetime64[ns]
 7   first_order_timestamp  69073 non-null  datetime64[ns]
 8   last_order_timestamp   69073 non-null  datetime64[ns]
 9   days_to_activation     69073 non-null  float64       
 10  active_days            69073 non-null  float64       
 11  inactive_days          69073 non-null  float64       
 12  avg_days_to_order      30731 non-null  float64       
 13  s

In [None]:
def parse_order_items(order_items_str):
    

In [10]:
users_enriched.order_items.loc[0]

'{"(79668,\\"Roxy Juniors Ruins Short\\",Women,Shorts,Roxy,44.5,\\"2023-05-26 08:02:42\\")","(79668,\\"Lace Sexy Babydoll Set-Black\\",Women,Intimates,\\"SEX NIGHT\\",24,\\"2023-05-25 09:30:03\\")"}'

In [36]:
order_items = list(ast.literal_eval(users_enriched.order_items.loc[0]))
order_items_list = []
for item in order_items:
    item = item.replace('(', '').replace(')', '')
    item_list = []
    for field in item.split(','):
        field = field.strip('"')

        cast_to_types = [int, float, dt.datetime]
        for type_ in cast_to_types:
            try:
                if type_ == dt.datetime:
                    field = dt.datetime.strptime(field, '%Y-%m-%d %H:%M:%S')
                else:
                    field = type_(field)
                break
            except Exception as e:
                print(e)
        
        item_list.append(field)
    order_items_list.append(item_list)

order_items_list

invalid literal for int() with base 10: 'Roxy Juniors Ruins Short'
could not convert string to float: 'Roxy Juniors Ruins Short'
time data 'Roxy Juniors Ruins Short' does not match format '%Y-%m-%d %H:%M:%S'
invalid literal for int() with base 10: 'Women'
could not convert string to float: 'Women'
time data 'Women' does not match format '%Y-%m-%d %H:%M:%S'
invalid literal for int() with base 10: 'Shorts'
could not convert string to float: 'Shorts'
time data 'Shorts' does not match format '%Y-%m-%d %H:%M:%S'
invalid literal for int() with base 10: 'Roxy'
could not convert string to float: 'Roxy'
time data 'Roxy' does not match format '%Y-%m-%d %H:%M:%S'
invalid literal for int() with base 10: '44.5'
invalid literal for int() with base 10: '2023-05-26 08:02:42'
could not convert string to float: '2023-05-26 08:02:42'
invalid literal for int() with base 10: 'Lace Sexy Babydoll Set-Black'
could not convert string to float: 'Lace Sexy Babydoll Set-Black'
time data 'Lace Sexy Babydoll Set-Bl

[[79668,
  'Roxy Juniors Ruins Short',
  'Women',
  'Shorts',
  'Roxy',
  44.5,
  datetime.datetime(2023, 5, 26, 8, 2, 42)],
 [79668,
  'Lace Sexy Babydoll Set-Black',
  'Women',
  'Intimates',
  'SEX NIGHT',
  24,
  datetime.datetime(2023, 5, 25, 9, 30, 3)]]