In [0]:
# Databricks notebook source
# 1. Load the cleaned dataset
# Location of the cleaned Olist orders table stored in Delta format.
delta_path = "/Volumes/workspace/default/olist_delta_dataset"

# `spark` is the session object supplied by the notebook runtime.
olist_orders = (
    spark.read
    .format("delta")
    .load(delta_path)
)

# Quick sanity check: preview five rows, then list column names and types.
olist_orders.show(5)
olist_orders.printSchema()



+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------+------------+----------------+-------------------+--------------------+------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|purchase_year|purchase_month|purchase_day|purchase_weekday|purchase_weekofyear|order_status_indexed|delivery_time_days|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------+------------+----------------+-------------------+--------------------+------------------+
|ccbabeb0b02433bd0...|c77ee2d8ba1614a4d...|   delivered|     2018-02-1

In [0]:
# 2. Initial grouping of the variables by type
# Numeric variables: calendar parts of the purchase date plus delivery duration.
numeric_cols = [
    "purchase_year",
    "purchase_month",
    "purchase_day",
    "purchase_weekday",
    "purchase_weekofyear",
    "delivery_time_days",
]

# Categorical variables (string-valued columns).
categorical_cols = [
    "order_status",
    "customer_id",
]

# Timestamp variables describing the order life cycle.
time_cols = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# Echo each group so the grouping is visible in the cell output.
for _label, _columns in (
    ("Numeric:", numeric_cols),
    ("Categorical:", categorical_cols),
    ("Time:", time_cols),
):
    print(_label, _columns)



Numeric: ['purchase_year', 'purchase_month', 'purchase_day', 'purchase_weekday', 'purchase_weekofyear', 'delivery_time_days']
Categorical: ['order_status', 'customer_id']
Time: ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']


In [0]:
# Scale the numeric features
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Stage 1: pack every column listed in numeric_cols into one vector column.
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="numeric_features",
)

# Stage 2: standardise the vector (subtract the mean, divide by the std).
scaler = StandardScaler(
    inputCol="numeric_features",
    outputCol="numeric_scaled",
    withMean=True,
    withStd=True,
)

# Fit the two-stage pipeline on the full table, then apply it to the same table.
# NOTE(review): the scaled vector is built from all of numeric_cols, i.e. it is
# computed before any feature selection done further down the notebook.
pipeline_numeric = Pipeline(stages=[assembler, scaler])
numeric_model = pipeline_numeric.fit(olist_orders)
olist_orders_scaled = numeric_model.transform(olist_orders)

# Show raw vs. scaled vectors side by side without truncating the values.
olist_orders_scaled.select("numeric_features", "numeric_scaled").show(5, truncate=False)


+--------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|numeric_features                |numeric_scaled                                                                                                            |
+--------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|[2018.0,2.0,19.0,2.0,8.0,18.0]  |[0.9212620248057964,-1.2435964538639488,0.4027424113392373,-1.011294959616418,-1.1594387074188466,0.6083045965080545]     |
|[2018.0,8.0,8.0,4.0,32.0,7.0]   |[0.9212620248057964,0.6042294965367236,-0.8689660624435991,0.04437592842202428,0.5394262246592859,-0.6544254259646151]    |
|[2018.0,6.0,4.0,2.0,23.0,7.0]   |[0.9212620248057964,-0.011712486930167146,-1.3314055074555395,-1.011294959616418,-0.09764812487001377,-0.6544254259646151]|
|[2017.0,11.0,30.0,5.0,48.0,28.0]|[-1.05830625593148

In [0]:
# Encode the categorical features
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Derived column names: <col>_index for the label index, <col>_ohe for the one-hot vector.
index_cols = [name + "_index" for name in categorical_cols]
ohe_cols = [name + "_ohe" for name in categorical_cols]

# Indexing: one StringIndexer per categorical column.
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in zip(categorical_cols, index_cols)
]

# One-hot encoding: one OneHotEncoder per indexed column.
encoders = [
    OneHotEncoder(inputCol=source, outputCol=target)
    for source, target in zip(index_cols, ohe_cols)
]

# All indexers run first, then all encoders; fit and apply on the scaled table.
pipeline_cat = Pipeline(stages=indexers + encoders)
cat_model = pipeline_cat.fit(olist_orders_scaled)
olist_orders_cat = cat_model.transform(olist_orders_scaled)

# NOTE(review): in the captured output customer_id_ohe is a 97761-wide sparse
# vector, i.e. close to one category per row; confirm that is intended.
olist_orders_cat.select(categorical_cols + ohe_cols).show(5, truncate=False)

+------------+--------------------------------+----------------+---------------------+
|order_status|customer_id                     |order_status_ohe|customer_id_ohe      |
+------------+--------------------------------+----------------+---------------------+
|delivered   |c77ee2d8ba1614a4d489a44166894938|(7,[0],[1.0])   |(97761,[76451],[1.0])|
|delivered   |3d3c463710ea6e8dd9a63c1110eeb06b|(7,[0],[1.0])   |(97761,[23416],[1.0])|
|delivered   |538a4d02876412846b966a3c057395e5|(7,[0],[1.0])   |(97761,[31836],[1.0])|
|delivered   |0a978c825ff7d013133ddc7f77566172|(7,[0],[1.0])   |(97761,[4026],[1.0]) |
|delivered   |21a99191298d34fb6dd0b088e821591c|(7,[0],[1.0])   |(97761,[12892],[1.0])|
+------------+--------------------------------+----------------+---------------------+
only showing top 5 rows


In [0]:
# Build the time-based features
from pyspark.sql.functions import year, month, dayofmonth, dayofweek, weekofyear, hour, datediff

# The loaded table already carries columns with these purchase_* names, so the
# withColumn calls below replace them with values recomputed from the timestamp.
# dayofweek follows Spark's numbering (1 = Sunday ... 7 = Saturday per Spark docs).
olist_orders_time = (
    olist_orders_cat
    .withColumn("purchase_year", year("order_purchase_timestamp"))
    .withColumn("purchase_month", month("order_purchase_timestamp"))
    .withColumn("purchase_day", dayofmonth("order_purchase_timestamp"))
    .withColumn("purchase_weekday", dayofweek("order_purchase_timestamp"))
    .withColumn("purchase_weekofyear", weekofyear("order_purchase_timestamp"))
    # Days elapsed from purchase to delivery at the customer.
    .withColumn(
        "delivery_time_days_calc",
        datediff("order_delivered_customer_date", "order_purchase_timestamp"),
    )
)

# Spot-check the computed delivery duration against the purchase timestamp.
olist_orders_time.select(
    "order_id",
    "order_purchase_timestamp",
    "delivery_time_days_calc",
).show(5)


+--------------------+------------------------+-----------------------+
|            order_id|order_purchase_timestamp|delivery_time_days_calc|
+--------------------+------------------------+-----------------------+
|ccbabeb0b02433bd0...|     2018-02-19 20:31:09|                     18|
|c6bf92017bd40729c...|     2018-08-08 01:15:06|                      7|
|ab87dc5a5f1856a10...|     2018-06-04 12:38:45|                      7|
|06ff862a85c2402aa...|     2017-11-30 13:31:08|                     28|
|f23155f5fa9b82663...|     2017-09-20 12:19:12|                     12|
+--------------------+------------------------+-----------------------+
only showing top 5 rows


In [0]:
# Correlation analysis between the numeric features
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# Correlation.corr works on a single vector column, so assemble numeric_cols first.
assembler_corr = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="corr_features",
)
olist_orders_corr = assembler_corr.transform(olist_orders_time)

# The result is a one-row DataFrame whose only cell holds the matrix;
# its rows/columns follow the order of numeric_cols.
corr_result = Correlation.corr(olist_orders_corr, "corr_features", method="pearson")
corr_matrix = corr_result.head()[0]

print("Кореляційна матриця (Pearson):")
print(corr_matrix)


Кореляційна матриця (Pearson):
DenseMatrix([[ 1.        , -0.55302372, -0.04325656, -0.01486727, -0.55139664,
              -0.06763211],
             [-0.55302372,  1.        ,  0.00559962,  0.01666232,  0.99589294,
              -0.05776641],
             [-0.04325656,  0.00559962,  1.        , -0.00819956,  0.09356577,
              -0.00558463],
             [-0.01486727,  0.01666232, -0.00819956,  1.        ,  0.00990218,
               0.06236576],
             [-0.55139664,  0.99589294,  0.09356577,  0.00990218,  1.        ,
              -0.05833627],
             [-0.06763211, -0.05776641, -0.00558463,  0.06236576, -0.05833627,
               1.        ]])


In [0]:
# Select the most informative numeric features
# The selection is derived from the Pearson correlation matrix computed above
# instead of being hard-coded, so it stays correct when the data or
# numeric_cols changes. A feature is dropped when it is almost perfectly
# correlated with a feature that has already been kept (redundant information).
# On the matrix captured in this notebook only purchase_weekofyear is dropped
# (r ~ 0.996 with purchase_month), which matches the previous manual list.
CORR_DROP_THRESHOLD = 0.95  # |r| above this marks a feature as redundant

# Rows/columns of corr_matrix follow the order of numeric_cols, because the
# correlation vector was assembled from numeric_cols.
corr_values = corr_matrix.toArray()

kept_positions = []
for position in range(len(numeric_cols)):
    # A NaN correlation (e.g. from a constant column) compares False here,
    # so such a feature is kept rather than silently discarded.
    is_redundant = any(
        abs(corr_values[position][kept]) > CORR_DROP_THRESHOLD
        for kept in kept_positions
    )
    if not is_redundant:
        kept_positions.append(position)

# NOTE(review): no later cell visible in this notebook reads
# numeric_cols_filtered; confirm whether the scaling step should use it.
numeric_cols_filtered = [numeric_cols[position] for position in kept_positions]

print("Фінальний список числових ознак після відбору:", numeric_cols_filtered)

Фінальний список числових ознак після відбору: ['purchase_year', 'purchase_month', 'purchase_day', 'purchase_weekday', 'delivery_time_days']


In [0]:
# Aggregate the data per customer
from pyspark.sql.functions import avg, count

# Aggregate expressions: number of orders and mean delivery duration.
orders_per_customer_expr = count("order_id").alias("orders_per_customer")
avg_delivery_days_expr = avg("delivery_time_days_calc").alias("avg_delivery_days")

# One output row per customer_id.
orders_by_customer = olist_orders_time.groupBy("customer_id")
olist_orders_agg = orders_by_customer.agg(
    orders_per_customer_expr,
    avg_delivery_days_expr,
)

olist_orders_agg.show(5)

+--------------------+-------------------+-----------------+
|         customer_id|orders_per_customer|avg_delivery_days|
+--------------------+-------------------+-----------------+
|c2928a50aecf1bc47...|                  1|              9.0|
|f7398fc942c8fa80e...|                  1|              5.0|
|d0b0b2dd8bdaf36eb...|                  1|             24.0|
|0bd683b7ceca26b5b...|                  1|             17.0|
|f3457b8fdac18622d...|                  1|             10.0|
+--------------------+-------------------+-----------------+
only showing top 5 rows


In [0]:
# Assemble the final dataset
# Only the main features are kept, grouped here by origin.
key_and_scaled_cols = ["order_id", "customer_id", "numeric_scaled"]
one_hot_cols = [name + "_ohe" for name in categorical_cols]
calendar_and_agg_cols = [
    "purchase_year",
    "purchase_month",
    "purchase_day",
    "purchase_weekday",
    "delivery_time_days_calc",
    "orders_per_customer",
    "avg_delivery_days",
]
final_cols = key_and_scaled_cols + one_hot_cols + calendar_and_agg_cols

# Attach the per-customer aggregates to every order row, then keep final_cols only.
orders_with_aggregates = olist_orders_agg.join(
    olist_orders_time,
    on="customer_id",
    how="inner",
)
olist_orders_final = orders_with_aggregates.select(final_cols)
olist_orders_final.show(5)


+--------------------+--------------------+--------------------+----------------+--------------------+-------------+--------------+------------+----------------+-----------------------+-------------------+-----------------+
|            order_id|         customer_id|      numeric_scaled|order_status_ohe|     customer_id_ohe|purchase_year|purchase_month|purchase_day|purchase_weekday|delivery_time_days_calc|orders_per_customer|avg_delivery_days|
+--------------------+--------------------+--------------------+----------------+--------------------+-------------+--------------+------------+----------------+-----------------------+-------------------+-----------------+
|ccbabeb0b02433bd0...|c77ee2d8ba1614a4d...|[0.92126202480579...|   (7,[0],[1.0])|(97761,[76451],[1...|         2018|             2|          19|               2|                     18|                  1|             18.0|
|c6bf92017bd40729c...|3d3c463710ea6e8dd...|[0.92126202480579...|   (7,[0],[1.0])|(97761,[23416],[1...|  

In [0]:
# Destination of the engineered feature table (Delta format).
delta_output_path = "/Volumes/workspace/default/olist_delta_2"

# "overwrite" mode replaces whatever data is already stored at the destination.
(
    olist_orders_final.write
    .format("delta")
    .mode("overwrite")
    .save(delta_output_path)
)
