In [1]:
import os
from urllib.parse import quote

from dotenv import load_dotenv
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

# Read the database connection settings from the environment
# (load_dotenv() first loads them from a local .env file, if present).
load_dotenv()

# Fail fast with a clear message: an unset variable would otherwise be
# interpolated into the URL as the literal text "None".
_missing_db_vars = [
    var_name
    for var_name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
    if os.getenv(var_name) is None
]
if _missing_db_vars:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(_missing_db_vars)
    )

user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
dbname = os.getenv('DB_NAME')

# Percent-encode the credentials so characters such as "@", ":", "/" or "#"
# in the user name or password cannot be mistaken for URL delimiters.
DATABASE_URL = (
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{dbname}"
)

engine = create_engine(DATABASE_URL)

In [2]:
# Load each source table in full into its own DataFrame.
df_customers = pd.read_sql(sql="SELECT * FROM customer", con=engine)
df_geolocation = pd.read_sql(sql="SELECT * FROM geolocation", con=engine)
df_order_items = pd.read_sql(sql="SELECT * FROM transformed_order_items", con=engine)
df_order_payments = pd.read_sql(sql="SELECT * FROM transformed_order_payments", con=engine)
df_order_reviews = pd.read_sql(sql="SELECT * FROM order_reviews", con=engine)
df_orders = pd.read_sql(sql="SELECT * FROM transformed_orders", con=engine)
df_products = pd.read_sql(sql="SELECT * FROM products", con=engine)
df_sellers = pd.read_sql(sql="SELECT * FROM sellers", con=engine)
df_products_translated = pd.read_sql(sql="SELECT * FROM products_translated", con=engine)

In [3]:
# Build the base table for the ML clustering features.
# Only delivered orders are kept. df_orders is rebound to the filtered frame,
# which the payment-type cell further down also reads.
df_orders = df_orders[df_orders["order_status"] == "delivered"]

# No `on=` is passed, so each merge joins on every column name the two frames
# share. The customer table is joined exactly once: a second identical join
# adds no columns and would multiply rows if that table had duplicate rows.
df_with_merges = (
    df_orders
    .merge(df_order_items, how="left")
    .merge(df_order_payments, how="left")
    .merge(df_customers, how="left")
)

In [4]:
# Per-customer aggregates that become model features.
# A single groupby object is shared by all aggregations on the merged table.
by_customer = df_with_merges.groupby("customer_unique_id")

payments_by_customer = by_customer["payment_value"]
total_spent = payments_by_customer.sum()
avg_ticket = payments_by_customer.mean()
max_ticket = payments_by_customer.max()

total_orders = by_customer["order_id"].nunique()
# count() is the number of non-null products_quantity rows per customer.
# NOTE(review): this counts merged rows rather than summing quantities;
# confirm that is the intended meaning of "total_items".
total_items = by_customer["products_quantity"].count()

# Keep one row per (customer, order) so the per-order average is not weighted
# by repeated rows of the same order.
df_sin_duplicados = df_with_merges.drop_duplicates(subset=["customer_unique_id", "order_id"])
avg_items_per_order = df_sin_duplicados.groupby("customer_unique_id")["products_quantity"].mean()

# Earliest and latest purchase timestamp of each customer.
purchase_times_by_customer = by_customer["order_purchase_timestamp"]
first_purchase_date = purchase_times_by_customer.min()
last_purchase_date = purchase_times_by_customer.max()

In [5]:
# Separate view for payment-type features: payment_value is dropped from the
# payments table before joining because that column caused a conflict in the
# merge (per the original author's note).
df_order_payments_copy = df_order_payments.drop(columns="payment_value")
payment_types = df_orders.merge(df_order_payments_copy, how="left")
payment_types = payment_types.merge(df_customers, how="left")


def _most_frequent(values):
    """Return the most frequent non-null value of `values`, or None if there is none.

    Series.mode() ignores nulls and returns an empty Series when every value
    is null; indexing that with .iloc[0] would raise IndexError. Ties resolve
    to the first entry of the mode() result.
    """
    modes = values.mode()
    if modes.empty:
        return None
    return modes.iloc[0]


# Number of distinct payment methods and the most used one, per customer.
payment_type_by_customer = payment_types.groupby("customer_unique_id")["payment_type"]
unique_payment_types = payment_type_by_customer.nunique()
most_used_payment_type = payment_type_by_customer.apply(_most_frequent)


In [6]:
# Turn each per-customer result into a DataFrame, moving the
# customer_unique_id index into a regular column.
total_spent = total_spent.reset_index()
avg_ticket = avg_ticket.reset_index()
max_ticket = max_ticket.reset_index()
total_orders = total_orders.reset_index()
total_items = total_items.reset_index()
avg_items_per_order = avg_items_per_order.reset_index()
unique_payment_types = unique_payment_types.reset_index()
most_used_payment_type = most_used_payment_type.reset_index()
first_purchase_date = first_purchase_date.reset_index()
last_purchase_date = last_purchase_date.reset_index()

In [7]:
# Give every feature column a descriptive name (the source column name is
# shared by several features, e.g. payment_value).
def _rename_feature(frame, source_col, feature_name):
    """Return `frame` with column `source_col` renamed to `feature_name`."""
    return frame.rename(columns={source_col: feature_name})


total_spent = _rename_feature(total_spent, "payment_value", "total_spent")
avg_ticket = _rename_feature(avg_ticket, "payment_value", "avg_ticket")
max_ticket = _rename_feature(max_ticket, "payment_value", "max_ticket")
total_orders = _rename_feature(total_orders, "order_id", "total_orders")
total_items = _rename_feature(total_items, "products_quantity", "total_items")
avg_items_per_order = _rename_feature(avg_items_per_order, "products_quantity", "avg_items_per_order")
unique_payment_types = _rename_feature(unique_payment_types, "payment_type", "unique_payment_types")
most_used_payment_type = _rename_feature(most_used_payment_type, "payment_type", "most_used_payment_type")
first_purchase_date = _rename_feature(first_purchase_date, "order_purchase_timestamp", "first_purchase_date")
last_purchase_date = _rename_feature(last_purchase_date, "order_purchase_timestamp", "last_purchase_date")

In [8]:
# Combine all feature frames into the single model table, one left join per
# feature on customer_unique_id, starting from total_spent.
df_ml = total_spent
for feature_frame in (
    avg_ticket,
    max_ticket,
    total_orders,
    total_items,
    avg_items_per_order,
    unique_payment_types,
    most_used_payment_type,
    first_purchase_date,
    last_purchase_date,
):
    df_ml = df_ml.merge(feature_frame, on="customer_unique_id", how="left")


In [9]:
# Convert both purchase-date columns to datetimes for the arithmetic below.
# Only the first 19 characters of each value's text form are parsed.
for date_col in ("first_purchase_date", "last_purchase_date"):
    date_text = df_ml[date_col].astype(str).str.slice(stop=19)
    df_ml[date_col] = pd.to_datetime(date_text)

In [None]:
# Whole days between each customer's first and last purchase.
purchase_span = df_ml["last_purchase_date"] - df_ml["first_purchase_date"]
df_ml["first_last_difference_days"] = purchase_span.dt.days

In [None]:
# Whole days from each customer's last purchase to the latest last-purchase
# date found in the dataset.
latest_purchase = df_ml["last_purchase_date"].max()
df_ml["recency_days"] = (latest_purchase - df_ml["last_purchase_date"]).dt.days

In [None]:
# Add location columns: one (state, city) row per customer_unique_id.
# drop_duplicates keeps the first row it encounters for each customer.
location_cols = ["customer_unique_id", "customer_state", "customer_city"]
df_clientes_unicos = df_customers[location_cols].drop_duplicates(subset="customer_unique_id")
# No `on=` is passed, so the join uses the column names both frames share.
df_ml = df_ml.merge(df_clientes_unicos, how="left")

In [None]:
# Export to Supabase: write the feature table to "data_ML" through `engine`,
# replacing the table if it already exists and omitting the DataFrame index.
df_ml.to_sql(name="data_ML", con=engine, if_exists="replace", index=False)

357