## Создаем Spark-сессию

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [2]:
jar_files = [
    "E:\\EDesktop\\Webinar\\postgresql-42.7.5.jar",
    "E:\\EDesktop\\Webinar\\clickhouse-jdbc-0.4.6.jar"
]

In [4]:
spark = (
    SparkSession
    .builder
    .appName("SparkWebinar")
    .config("spark.jars", ",".join(jar_files))
    .getOrCreate()
)

In [3]:
",".join(jar_files)

'E:\\EDesktop\\Webinar\\postgresql-42.7.5.jar,E:\\EDesktop\\Webinar\\clickhouse-jdbc-0.4.6.jar'

In [5]:
spark

## Подключаемся к источникам

### csv

In [6]:
path = 'E:\\EDesktop\\Webinar\\data'

In [7]:
campaigns_dict = (
    spark.read
    .option('header', True)
    .csv(f'{path}\\campaigns_dict.csv')
)

In [10]:
# ленивые вычисления (transformations, actions)
campaigns_dict.show(5, truncate=False)

+-----------+------------------------------------------+
|campaign_id|campaign_name                             |
+-----------+------------------------------------------+
|1          |year_modern_kitchen_launch_20250115       |
|2          |quarter_custom_kitchens_showcase_20240210 |
|3          |month_smart_kitchen_promotion_20240305    |
|4          |year_luxury_kitchens_exhibit_20240420     |
|5          |quarter_ecofriendly_kitchen_offer_20240512|
+-----------+------------------------------------------+
only showing top 5 rows



In [13]:
campaigns_dict.printSchema()

root
 |-- campaign_id: string (nullable = true)
 |-- campaign_name: string (nullable = true)



### parquet

In [14]:
submits = spark.read.parquet(f'{path}\\submits.parquet')

In [15]:
submits.show(5, truncate=False)

+---------+--------+-----------+
|submit_id|name    |phone      |
+---------+--------+-----------+
|2282     |Jennifer|79511904041|
|9898     |Jeffrey |79824419733|
|9005     |Linda   |79074725672|
|1507     |Teresa  |79864203598|
|3803     |Tanya   |79779567654|
+---------+--------+-----------+
only showing top 5 rows



In [16]:
submits.printSchema()

root
 |-- submit_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: long (nullable = true)



In [17]:
deals = spark.read.parquet(f'{path}\\deals.parquet')

In [18]:
deals.show(5, truncate=False)

+-------+----------+---------------------+-----------+------------------------+-----------------------------------------------------+
|deal_id|deal_date |fio                  |phone      |email                   |address                                              |
+-------+----------+---------------------+-----------+------------------------+-----------------------------------------------------+
|1      |2024-03-04|Gregory Wu           |79746561889|paul80@example.net      |098 Yates Cliff Apt. 241, East Monica, DE 88076      |
|2      |2024-08-20|William Ross Jr.     |79074725672|xyoung@example.org      |197 Willie Groves Apt. 655, Port Angelaberg, LA 39384|
|3      |2024-10-15|Sonya Kerr           |79201244835|elewis@example.com      |144 Andrew Cape, Lake Nicholas, SC 58918             |
|4      |2024-12-31|Mrs. Angela Tucker MD|79771829751|robertparker@example.net|6056 Collins View, South Harold, OR 15650            |
|5      |2024-03-23|Eric Flores          |79729054809|barbara7

In [19]:
deals.show(2, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------
 deal_id   | 1                                                     
 deal_date | 2024-03-04                                            
 fio       | Gregory Wu                                            
 phone     | 79746561889                                           
 email     | paul80@example.net                                    
 address   | 098 Yates Cliff Apt. 241, East Monica, DE 88076       
-RECORD 1----------------------------------------------------------
 deal_id   | 2                                                     
 deal_date | 2024-08-20                                            
 fio       | William Ross Jr.                                      
 phone     | 79074725672                                           
 email     | xyoung@example.org                                    
 address   | 197 Willie Groves Apt. 655, Port Angelaberg, LA 39384 
only showing top 2 rows



In [20]:
deals.printSchema()

root
 |-- deal_id: long (nullable = true)
 |-- deal_date: string (nullable = true)
 |-- fio: string (nullable = true)
 |-- phone: long (nullable = true)
 |-- email: string (nullable = true)
 |-- address: string (nullable = true)



In [21]:
import pyarrow as pa
import pyarrow.parquet as pq

In [22]:
pq.read_metadata(f'{path}\\submits.parquet')  # Parquet recommends row groups sized between 512MB and 1GB

<pyarrow._parquet.FileMetaData object at 0x000002B7D8374540>
  created_by: parquet-cpp-arrow version 19.0.0
  num_columns: 3
  num_rows: 4000
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 2371

In [23]:
submits.count()

4000

### postgres

In [24]:
pg_host = 'localhost'
pg_port = '5434'
pg_db = 'webinar'
pg_table = 'costs'
pg_user = 'postgres'
pg_password = 'postgres'

In [25]:
costs = (
    spark.read
    .format('jdbc')
    .option('url', f'jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}')
    .option('dbtable', pg_table)
    .option('user', pg_user)
    .option('password', pg_password)
    .option('driver', 'org.postgresql.Driver')
    .load()
)

In [26]:
costs.show(5, truncate=False)  # заглянуть в pgAdmin

+----------+-----------+------+------+-----+
|date      |campaign_id|costs |clicks|views|
+----------+-----------+------+------+-----+
|2024-01-01|1          |670.52|40    |110  |
|2024-01-01|2          |602.5 |11    |849  |
|2024-01-01|3          |654.74|51    |566  |
|2024-01-01|4          |897.24|86    |679  |
|2024-01-01|5          |758.19|30    |585  |
+----------+-----------+------+------+-----+
only showing top 5 rows



In [27]:
costs.printSchema()

root
 |-- date: date (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- costs: double (nullable = true)
 |-- clicks: integer (nullable = true)
 |-- views: integer (nullable = true)



### clickhouse

In [28]:
ch_host = 'localhost'
ch_port = '8123'
ch_db = 'default'
ch_table = 'visits'

In [31]:
visits = (
    spark.read
    .format('jdbc')
    .option('url', f'jdbc:clickhouse://{ch_host}:{ch_port}/{ch_db}')
    .option('dbtable', ch_table)
    .option('driver', 'com.clickhouse.jdbc.ClickHouseDriver')
    .load()
)

In [32]:
visits.show(5)

+-------+-------------------+--------------------+--------+--------+--------+--------------------+----------------+
|visitid|      visitDateTime|                 URL|duration|clientID|  source|         UTMCampaign|          params|
+-------+-------------------+--------------------+--------+--------+--------+--------------------+----------------+
| 189665|2024-01-01 00:19:38|https://our-cool-...|      78|     848|  direct|quarter_ecofriend...|['submit', 2136]|
| 504698|2024-01-01 00:59:20|https://our-cool-...|       8|     527|  direct|year_modern_kitch...|['submit', 4630]|
| 632370|2024-01-01 01:49:09|https://our-cool-...|      72|     520| organic|quarter_ecofriend...|['submit', 2734]|
| 943112|2024-01-01 02:20:28|https://our-cool-...|      14|     117| organic|month_openconcept...|['submit', 5299]|
| 139778|2024-01-01 07:31:30|https://our-cool-...|      73|     655|internal|month_contemporar...| ['submit', 419]|
+-------+-------------------+--------------------+--------+--------+----

In [33]:
visits.show(1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------
 visitid       | 189665                                       
 visitDateTime | 2024-01-01 00:19:38                          
 URL           | https://our-cool-website.com/checkout        
 duration      | 78                                           
 clientID      | 848                                          
 source        | direct                                       
 UTMCampaign   | quarter_ecofriendly_kitchens_launch_20240205 
 params        | ['submit', 2136]                             
only showing top 1 row



In [34]:
visits.printSchema()

root
 |-- visitid: integer (nullable = true)
 |-- visitDateTime: timestamp (nullable = true)
 |-- URL: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- clientID: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- UTMCampaign: string (nullable = true)
 |-- params: string (nullable = true)



In [None]:
# методология и моника

## Готовим источники

### Визиты (clickhouse)

In [None]:
visits.show(1)

In [36]:
# кх
filtered_step1 = (
    visits
    .withColumn('dt', F.date_format(F.col('visitDateTime'), 'yyyy-MM-dd'))
    .where(F.col('dt').between('2024-01-01', '2025-01-27'))
    .where(F.col('source').isin('ad', 'direct'))
    .where(F.col('URL').rlike('.*checkout.*|.*add.*|.*home.*|.*contact.*|.*top50.*|.*customer-service.*|.*wishlist.*|.*sale.*|.*best-sellers.*|.*view.*|.*discount.*|.*featured.*|.*new-arrivals.*|.*settings.*|.*return-policy.*|.*edit.*|.*delete.*|.*reviews.*|.*products.*|.*about.*'))
    .select(
        'dt',
        'visitid',
        'clientID',
        'URL',
        'duration',
        'source',
        'UTMCampaign',
        'params',
        F.regexp_replace(F.col('params'), r'\[|\]', '').alias('params_regex')
    )
    .withColumn('params_split', F.split('params_regex', ', '))
)

In [37]:
filtered_step1.show(1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------
 dt           | 2024-01-01                                   
 visitid      | 189665                                       
 clientID     | 848                                          
 URL          | https://our-cool-website.com/checkout        
 duration     | 78                                           
 source       | direct                                       
 UTMCampaign  | quarter_ecofriendly_kitchens_launch_20240205 
 params       | ['submit', 2136]                             
 params_regex | 'submit', 2136                               
 params_split | ['submit', 2136]                             
only showing top 1 row



In [38]:
filtered_step1.printSchema()  # string vs array

root
 |-- dt: string (nullable = true)
 |-- visitid: integer (nullable = true)
 |-- clientID: integer (nullable = true)
 |-- URL: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- UTMCampaign: string (nullable = true)
 |-- params: string (nullable = true)
 |-- params_regex: string (nullable = true)
 |-- params_split: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [40]:
filtered_step2 = (
    filtered_step1
    .withColumn('event_type', F.regexp_replace(F.col('params_split')[0], "'", ''))
    .withColumn('event_id', F.col('params_split')[1].cast('int'))
)

In [125]:
filtered_step2.show(1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------
 dt           | 2024-01-01                                   
 visitid      | 189665                                       
 clientID     | 848                                          
 URL          | https://our-cool-website.com/checkout        
 duration     | 78                                           
 source       | direct                                       
 UTMCampaign  | quarter_ecofriendly_kitchens_launch_20240205 
 params       | ['submit', 2136]                             
 params_regex | 'submit', 2136                               
 params_split | ['submit', 2136]                             
 event_type   | submit                                       
 event_id     | 2136                                         
only showing top 1 row



In [126]:
filtered_step2.printSchema()

root
 |-- dt: string (nullable = true)
 |-- visitid: integer (nullable = true)
 |-- clientID: integer (nullable = true)
 |-- URL: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- UTMCampaign: string (nullable = true)
 |-- params: string (nullable = true)
 |-- params_regex: string (nullable = true)
 |-- params_split: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- event_type: string (nullable = true)
 |-- event_id: integer (nullable = true)



In [41]:
visits_df = (
    filtered_step2
    .where(F.col('event_type') == 'submit')
    .select(
        'dt',
        F.col('visitid').cast('string').alias('visitid'),
        F.col('clientID').cast('string').alias('clientid'),
        'URL',
        'duration',
        'source',
        'UTMCampaign',
        'event_type',
        'event_id'
    )
    .distinct()
)

In [129]:
visits_df.show(1, truncate=False, vertical=True)

-RECORD 0------------------------------------------------
 dt          | 2024-02-07                                
 visitid     | 925403                                    
 clientid    | 418                                       
 URL         | https://our-cool-website.com/best-sellers 
 duration    | 9                                         
 source      | direct                                    
 UTMCampaign | year_luxury_kitchens_event_20250120       
 event_type  | submit                                    
 event_id    | 7485                                      
only showing top 1 row



In [130]:
visits_df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- visitid: string (nullable = true)
 |-- clientid: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- UTMCampaign: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_id: integer (nullable = true)



In [131]:
visits_df.count()

2433

In [132]:
visits.count()

10000

### Расходы (postgres)

In [51]:
costs_df = (
    costs
    .groupBy(
        F.col('date').cast('string').alias('date'),
        'campaign_id'
    )
    .agg(
        F.sum(F.col('costs')).cast('decimal(19,2)').alias('costs'),
        F.sum(F.col('clicks')).alias('clicks'),
        F.sum(F.col('views')).alias('views')
    )
)

In [136]:
costs_df.show(5)

+----------+-----------+------+------+-----+
|      date|campaign_id| costs|clicks|views|
+----------+-----------+------+------+-----+
|2024-01-05|         75|107.03|    26|  958|
|2024-01-05|         97|408.51|    59|  292|
|2024-01-06|         44|748.68|    33|  380|
|2024-01-06|         58|422.91|    30|  575|
|2024-01-11|         91|496.45|     5|  435|
+----------+-----------+------+------+-----+
only showing top 5 rows



In [138]:
costs_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- costs: decimal(19,2) (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = true)



### Кампании (csv)

In [139]:
campaigns_dict.show(1, truncate=False)

+-----------+-----------------------------------+
|campaign_id|campaign_name                      |
+-----------+-----------------------------------+
|1          |year_modern_kitchen_launch_20250115|
+-----------+-----------------------------------+
only showing top 1 row



In [None]:
campaigns_dict.printSchema()

In [42]:
campaigns_df = (
    campaigns_dict
    .withColumn('campaign_id', F.col('campaign_id').cast('integer'))
    .withColumn(
        'campaign_duration',
        F.when(F.col('campaign_name').like('year%'), 'Год')
        .when(F.col('campaign_name').like('quarter%'), 'Квартал')
        .when(F.col('campaign_name').like('month%'), 'Месяц')
        .otherwise(None)
    )
)

In [43]:
campaigns_df.show(5, truncate=False)

+-----------+------------------------------------------+-----------------+
|campaign_id|campaign_name                             |campaign_duration|
+-----------+------------------------------------------+-----------------+
|1          |year_modern_kitchen_launch_20250115       |Год              |
|2          |quarter_custom_kitchens_showcase_20240210 |Квартал          |
|3          |month_smart_kitchen_promotion_20240305    |Месяц            |
|4          |year_luxury_kitchens_exhibit_20240420     |Год              |
|5          |quarter_ecofriendly_kitchen_offer_20240512|Квартал          |
+-----------+------------------------------------------+-----------------+
only showing top 5 rows



In [44]:
campaigns_df.printSchema()

root
 |-- campaign_id: integer (nullable = true)
 |-- campaign_name: string (nullable = true)
 |-- campaign_duration: string (nullable = true)



### Заявки (parquet)

In [None]:
submits.printSchema()

In [None]:
submits.show(1)

In [45]:
submits_df = (
    submits
    .withColumn('phone', F.col('phone').cast('string'))
    .withColumn('phone_plus', F.concat(F.lit('+'), F.col('phone')))
    .withColumn('phone_md5', F.md5('phone'))
    .withColumn('phone_plus_md5', F.md5('phone_plus'))
)

In [46]:
submits_df.show(2, truncate=False)

+---------+--------+-----------+------------+--------------------------------+--------------------------------+
|submit_id|name    |phone      |phone_plus  |phone_md5                       |phone_plus_md5                  |
+---------+--------+-----------+------------+--------------------------------+--------------------------------+
|2282     |Jennifer|79511904041|+79511904041|4c7720fdf6f9eec623dc0f961f31f488|6ce81d5347bcd3eadb2921b7c4828e3b|
|9898     |Jeffrey |79824419733|+79824419733|9d106e45036bd4176774eb94adc9aacc|43630233dcc965f9827e394038b0321a|
+---------+--------+-----------+------------+--------------------------------+--------------------------------+
only showing top 2 rows



### Сделки (parquet)

In [None]:
deals.printSchema()

In [47]:
deals_df = (
    deals
    .withColumn('username', F.split(F.col('email'), '@').getItem(0))
    .withColumn('domain', F.split(F.col('email'), '@').getItem(1))
    .where(F.col('domain').isin('example.com', 'example.org', 'example.net'))
    .withColumn('phone', F.col('phone').cast('string'))
)

In [48]:
deals_df.show(1)

+-------+----------+----------+-----------+------------------+--------------------+--------+-----------+
|deal_id| deal_date|       fio|      phone|             email|             address|username|     domain|
+-------+----------+----------+-----------+------------------+--------------------+--------+-----------+
|      1|2024-03-04|Gregory Wu|79746561889|paul80@example.net|098 Yates Cliff A...|  paul80|example.net|
+-------+----------+----------+-----------+------------------+--------------------+--------+-----------+
only showing top 1 row



In [146]:
deals_df.printSchema()

root
 |-- deal_id: long (nullable = true)
 |-- deal_date: string (nullable = true)
 |-- fio: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- email: string (nullable = true)
 |-- address: string (nullable = true)
 |-- username: string (nullable = true)
 |-- domain: string (nullable = true)



## Собираем витрину

In [54]:
# концепция "One Big Table"
customer_detailed = (
    visits_df.alias('v')
    .join(
        submits_df.alias('s'),
        F.col('v.event_id') == F.col('s.submit_id'),
        'left'
    )
    .join(
        deals_df.alias('d'),
        (F.col('s.phone') == F.col('d.phone')) &
        (F.col('v.dt') <= F.col('d.deal_date')),
        'left'
    )
    .join(
        campaigns_df.alias('camp'),
        F.col('v.utmcampaign') == F.col('camp.campaign_name'),
        'left'
    )
    .join(
        costs_df.alias('c'),
        (F.col('camp.campaign_id') == F.col('c.campaign_id')) &
        (F.col('v.dt') == F.col('c.date')),
        'left'
    )
    .select(
        'v.dt',
        F.col('v.visitid').alias('visit_id'),
        F.col('v.clientid').alias('client_id'),
        'v.url',
        'v.duration',
        'v.source',
        'v.utmcampaign',
        'v.event_type',
        'v.event_id',
        's.submit_id',
        's.name',
        's.phone',
        's.phone_plus',
        's.phone_md5',
        's.phone_plus_md5',
        'd.deal_id',
        'd.deal_date',
        'd.fio',
        F.col('d.phone').alias('phone_deal'),
        'd.email',
        'd.address',
        'd.username',
        'd.domain',
        'camp.campaign_name',
        'camp.campaign_duration',
        'c.costs',
        'c.clicks',
        'c.views'
    )
)

In [56]:
customer_detailed.printSchema()

root
 |-- dt: string (nullable = true)
 |-- visit_id: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- utmcampaign: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_id: integer (nullable = true)
 |-- submit_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- phone_plus: string (nullable = true)
 |-- phone_md5: string (nullable = true)
 |-- phone_plus_md5: string (nullable = true)
 |-- deal_id: long (nullable = true)
 |-- deal_date: string (nullable = true)
 |-- fio: string (nullable = true)
 |-- phone_deal: string (nullable = true)
 |-- email: string (nullable = true)
 |-- address: string (nullable = true)
 |-- username: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- campaign_name: string (nullable = true)
 |-- campaign_duration: string (nullable = tr

In [55]:
len(customer_detailed.columns)

28

In [153]:
customer_detailed.cache()

DataFrame[dt: string, visit_id: string, client_id: string, url: string, duration: int, source: string, utmcampaign: string, event_type: string, event_id: int, submit_id: bigint, name: string, phone: string, phone_plus: string, phone_md5: string, phone_plus_md5: string, deal_id: bigint, deal_date: string, fio: string, phone_deal: string, email: string, address: string, username: string, domain: string, campaign_name: string, costs: decimal(19,2), clicks: bigint, views: bigint]

In [154]:
customer_detailed.count()  # Spark UI

2639

In [22]:
campaigns_agg = (
    customer_detailed
    .groupBy('campaign_name')
    .agg(
        F.countDistinct('visit_id').alias('unique_visits'),
        F.countDistinct('client_id').alias('unique_clients'),
        F.countDistinct('submit_id').alias('unique_submits'),
        F.countDistinct('deal_id').alias('unique_deals'),
        F.sum('costs').alias('total_costs'),
        F.sum('clicks').alias('total_clicks'),
        F.sum('views').alias('total_views'),
        F.sum('duration').alias('total_duration')
    )
    .withColumn('avg_deal_cost', (F.col('total_costs') / F.col('unique_deals')).cast('decimal(19,2)'))
)

In [158]:
campaigns_agg.cache().count()

99

In [159]:
campaigns_agg.show(1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------
 campaign_name  | year_traditional_kitchens_demo_20240725 
 unique_visits  | 22                                      
 unique_clients | 21                                      
 unique_submits | 5                                       
 unique_deals   | 1                                       
 total_costs    | 12243.64                                
 total_clicks   | 1429                                    
 total_views    | 14294                                   
 total_duration | 1100                                    
 avg_deal_cost  | 12243.64                                
only showing top 1 row



In [23]:
dates_agg = (
    customer_detailed
    .groupBy(F.substring('dt', 1, 7).alias('month'))  # 2025-01-01
    .agg(
        F.countDistinct('visit_id').alias('unique_visits'),
        F.countDistinct('client_id').alias('unique_clients'),
        F.countDistinct('submit_id').alias('unique_submits'),
        F.countDistinct('deal_id').alias('unique_deals'),
        F.sum('costs').alias('total_costs'),
        F.sum('clicks').alias('total_clicks'),
        F.sum('views').alias('total_views'),
        F.sum('duration').alias('total_duration')
    )
    .withColumn('avg_deal_cost', (F.col('total_costs') / F.col('unique_deals')).cast('decimal(19,2)'))
)

In [163]:
dates_agg.cache().count()

13

In [164]:
dates_agg.show(1, truncate=False, vertical=True)

-RECORD 0-------------------
 month          | 2024-09   
 unique_visits  | 192       
 unique_clients | 169       
 unique_submits | 68        
 unique_deals   | 16        
 total_costs    | 100674.35 
 total_clicks   | 10320     
 total_views    | 101022    
 total_duration | 10285     
 avg_deal_cost  | 6292.15   
only showing top 1 row



In [52]:
def save_to_postgres(df, table_name):
    (
        df.write
        .format('jdbc')
        .option('url', f'jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}')
        .option('dbtable', table_name)
        .option('user', pg_user)
        .option('password', pg_password)
        .option('driver', 'org.postgresql.Driver')
        .mode("overwrite")
        .save()
    )

In [57]:
save_to_postgres(customer_detailed, 'customer_detailed')

In [31]:
save_to_postgres(campaigns_agg, 'campaigns_agg')

In [30]:
save_to_postgres(dates_agg, 'dates_agg')

## Считаем метрики и анализируем результаты

In [165]:
# 1. Кампании без выручки
(
    campaigns_agg
    .where('unique_deals = 0')
    .select('campaign_name', 'total_costs', 'unique_deals')
    .sort('total_costs')
    .show(truncate=False)
)

+-----------------------------------------------+-----------+------------+
|campaign_name                                  |total_costs|unique_deals|
+-----------------------------------------------+-----------+------------+
|month_contemporary_kitchens_event_20241208     |8277.73    |0           |
|quarter_custom_kitchens_showcase_20240210      |8397.63    |0           |
|month_smart_kitchens_launch_20240330           |8703.22    |0           |
|month_smart_kitchen_promotion_20240305         |10276.42   |0           |
|month_ecofriendly_kitchens_experience_20241219 |10559.40   |0           |
|month_contemporary_kitchens_event_20240920     |10640.55   |0           |
|quarter_ecofriendly_kitchen_experience_20241119|10804.92   |0           |
|year_traditional_kitchens_showcase_20241018    |11655.58   |0           |
|year_modern_kitchens_demo_20240419             |12028.69   |0           |
|quarter_luxury_kitchens_innovation_20241116    |12894.26   |0           |
|month_openconcept_kitche

In [168]:
(
    campaigns_agg
    .where('unique_deals = 0')
    .select(F.count('campaign_name'), F.sum('total_costs'))
    .show()
)

+--------------------+----------------+
|count(campaign_name)|sum(total_costs)|
+--------------------+----------------+
|                  13|       147059.03|
+--------------------+----------------+



In [169]:
# 2. Средняя цена сделки
(
    campaigns_agg
    .where('unique_deals > 0')
    .select('campaign_name', 'total_costs', 'unique_deals', 'avg_deal_cost')
    .sort('avg_deal_cost')
    .show(truncate=False)
)

+------------------------------------------------+-----------+------------+-------------+
|campaign_name                                   |total_costs|unique_deals|avg_deal_cost|
+------------------------------------------------+-----------+------------+-------------+
|year_modern_kitchens_showcase_20241014          |10936.74   |8           |1367.09      |
|quarter_custom_kitchens_innovation_20240817     |15422.19   |10          |1542.22      |
|month_contemporary_kitchens_promotion_20241215  |12423.61   |8           |1552.95      |
|quarter_custom_kitchens_experience_20240222     |15455.11   |9           |1717.23      |
|year_modern_kitchen_experience_20240702         |9068.35    |5           |1813.67      |
|year_smart_kitchens_initiative_20241004         |12958.53   |6           |2159.76      |
|quarter_custom_kitchen_initiative_20241102      |18846.27   |8           |2355.78      |
|month_openconcept_kitchens_promotion_20240628   |15318.45   |6           |2553.08      |
|year_trad

In [170]:
(
    campaigns_agg
    .where('unique_deals > 0')
    .select('campaign_name', 'total_costs', 'unique_deals', 'avg_deal_cost')
    .sort('avg_deal_cost', ascending=False)
    .show(truncate=False)
)

+-----------------------------------------------+-----------+------------+-------------+
|campaign_name                                  |total_costs|unique_deals|avg_deal_cost|
+-----------------------------------------------+-----------+------------+-------------+
|quarter_custom_kitchen_show_20240213           |16643.02   |1           |16643.02     |
|month_contemporary_kitchen_showcase_20240630   |15908.47   |1           |15908.47     |
|quarter_spacesaving_kitchen_innovation_20240825|15314.02   |1           |15314.02     |
|year_luxury_kitchens_show_20241001             |15132.48   |1           |15132.48     |
|year_traditional_kitchens_launch_20240707      |13893.29   |1           |13893.29     |
|quarter_spacesaving_kitchen_showcase_20240228  |13126.07   |1           |13126.07     |
|quarter_spacesaving_kitchen_innovation_20241103|13063.42   |1           |13063.42     |
|month_custom_kitchens_show_20241205            |13045.27   |1           |13045.27     |
|year_traditional_kit

In [174]:
# 3. Убыточные кампании. Пусть каждая сделка стоит 5к
(
    campaigns_agg
    .select('campaign_name', 'total_costs', 'unique_deals', 'avg_deal_cost')
    .withColumn('revenue', F.col('unique_deals') * F.lit(5000))
    .withColumn('profit', F.col('revenue') - F.col('total_costs'))
    .sort('profit')
    .show(truncate=False)
)

+-----------------------------------------------+-----------+------------+-------------+-------+---------+
|campaign_name                                  |total_costs|unique_deals|avg_deal_cost|revenue|profit   |
+-----------------------------------------------+-----------+------------+-------------+-------+---------+
|quarter_custom_kitchens_experience_20240527    |15456.61   |0           |null         |0      |-15456.61|
|month_openconcept_kitchens_experience_20240910 |14042.09   |0           |null         |0      |-14042.09|
|month_openconcept_kitchens_initiative_20240328 |13321.93   |0           |null         |0      |-13321.93|
|quarter_luxury_kitchens_innovation_20241116    |12894.26   |0           |null         |0      |-12894.26|
|year_modern_kitchens_demo_20240419             |12028.69   |0           |null         |0      |-12028.69|
|year_traditional_kitchens_showcase_20241018    |11655.58   |0           |null         |0      |-11655.58|
|quarter_custom_kitchen_show_20240213

In [175]:
# 4. Самые прибыльные кампании
(
    campaigns_agg
    .select('campaign_name', 'total_costs', 'unique_deals', 'avg_deal_cost')
    .withColumn('revenue', F.col('unique_deals') * F.lit(5000))
    .withColumn('profit', F.col('revenue') - F.col('total_costs'))
    .sort(F.desc('profit'))
    .show(truncate=False)
)

+------------------------------------------------+-----------+------------+-------------+-------+--------+
|campaign_name                                   |total_costs|unique_deals|avg_deal_cost|revenue|profit  |
+------------------------------------------------+-----------+------------+-------------+-------+--------+
|quarter_custom_kitchens_innovation_20240817     |15422.19   |10          |1542.22      |50000  |34577.81|
|quarter_custom_kitchens_experience_20240222     |15455.11   |9           |1717.23      |45000  |29544.89|
|year_modern_kitchens_showcase_20241014          |10936.74   |8           |1367.09      |40000  |29063.26|
|month_contemporary_kitchens_promotion_20241215  |12423.61   |8           |1552.95      |40000  |27576.39|
|quarter_custom_kitchen_initiative_20241102      |18846.27   |8           |2355.78      |40000  |21153.73|
|year_traditional_kitchens_fair_20240415         |20811.03   |8           |2601.38      |40000  |19188.97|
|year_smart_kitchens_initiative_20241

In [178]:
# 5. Метрики в разбивке по месяцам
(
    dates_agg
    .withColumn('revenue', F.col('unique_deals') * F.lit(5000))
    .withColumn('profit', F.col('revenue') - F.col('total_costs'))
    .drop('total_duration', 'total_views')
    .sort('month')
    .show()
)

+-------+-------------+--------------+--------------+------------+-----------+------------+-------------+-------+---------+
|  month|unique_visits|unique_clients|unique_submits|unique_deals|total_costs|total_clicks|avg_deal_cost|revenue|   profit|
+-------+-------------+--------------+--------------+------------+-----------+------------+-------------+-------+---------+
|2024-01|          202|           183|            66|          36|  108348.27|       10843|      3009.67| 180000| 71651.73|
|2024-02|          185|           165|            56|          27|  101600.49|        8733|      3762.98| 135000| 33399.51|
|2024-03|          183|           164|            58|          35|  100875.50|        9967|      2882.16| 175000| 74124.50|
|2024-04|          184|           163|            53|          21|   97707.65|        9748|      4652.75| 105000|  7292.35|
|2024-05|          191|           177|            68|          30|  101051.11|       10386|      3368.37| 150000| 48948.89|
|2024-06

In [179]:
# 6. Сколько всего потратили денег на рекламу за год
dates_agg.select(F.sum('total_costs')).show()

+----------------+
|sum(total_costs)|
+----------------+
|      1313753.39|
+----------------+



## Освобождаем ресурсы, останавливаем Spark-сессию

In [180]:
# Spark UI
customer_detailed.unpersist()
campaigns_agg.unpersist()
dates_agg.unpersist()

DataFrame[month: string, unique_visits: bigint, unique_clients: bigint, unique_submits: bigint, unique_deals: bigint, total_costs: decimal(29,2), total_clicks: bigint, total_views: bigint, total_duration: bigint, avg_deal_cost: decimal(19,2)]

In [181]:
spark.stop()