## **Создаем Spark-сессию**

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [2]:
# JDBC driver jars (PostgreSQL, ClickHouse) that are passed to Spark via
# the "spark.jars" setting when the session is built.
jar_files = [
    "/usr/local/spark/jars/" + jar_name
    for jar_name in (
        "postgresql-42.6.0.jar",
        "clickhouse-jdbc-0.4.6-all.jar",
    )
]

In [3]:
# Build (or reuse) the notebook's Spark session. The driver jars are handed
# over as one comma-separated string in the "spark.jars" setting.
spark = SparkSession.builder.appName("SparkDataMart").config(
    "spark.jars", ",".join(jar_files)
).getOrCreate()

25/10/25 18:09:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Inspect the exact comma-separated string that was passed as "spark.jars".
",".join(jar_files)

'/usr/local/spark/jars/postgresql-42.6.0.jar,/usr/local/spark/jars/clickhouse-jdbc-0.4.6-all.jar'

In [5]:
# Display the session object to confirm it was created.
spark

## **Подключаемся к источникам**

**csv**

In [6]:
# Directory with the source data files; the CSV/Parquet reads below build
# their file paths from it.
path = "/home/jovyan/work/data"

In [7]:
# Campaign dictionary from CSV; the first line of the file is the header row.
campaigns_dict = spark.read.csv(f'{path}/campaigns_dict.csv', header=True)

In [8]:
# Lazy evaluation (transformations vs. actions): show() is an action, so it actually runs the job.
campaigns_dict.show(5, truncate=False) # truncate=False: print full cell values instead of cutting long text off at a fixed number of characters

+-----------+------------------------------------------+
|campaign_id|campaign_name                             |
+-----------+------------------------------------------+
|1          |year_modern_kitchen_launch_20250115       |
|2          |quarter_custom_kitchens_showcase_20240210 |
|3          |month_smart_kitchen_promotion_20240305    |
|4          |year_luxury_kitchens_exhibit_20240420     |
|5          |quarter_ecofriendly_kitchen_offer_20240512|
+-----------+------------------------------------------+
only showing top 5 rows


In [9]:
# Check column types; both CSV columns come back as strings here (see output).
campaigns_dict.printSchema()

root
 |-- campaign_id: string (nullable = true)
 |-- campaign_name: string (nullable = true)



**parquet**

In [10]:
# Form submissions, read from Parquet.
submits = spark.read.parquet(f'{path}/submits.parquet')

In [11]:
# Preview the first 5 rows without truncating cell values.
submits.show(5, truncate=False)

+---------+--------+-----------+
|submit_id|name    |phone      |
+---------+--------+-----------+
|2282     |Jennifer|79511904041|
|9898     |Jeffrey |79824419733|
|9005     |Linda   |79074725672|
|1507     |Teresa  |79864203598|
|3803     |Tanya   |79779567654|
+---------+--------+-----------+
only showing top 5 rows


In [12]:
# Check column types; note that phone is stored as long, not string (see output).
submits.printSchema()

root
 |-- submit_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: long (nullable = true)



In [13]:
# Deals, read from Parquet.
deals = spark.read.parquet(f'{path}/deals.parquet')

In [14]:
# Preview the first 5 rows without truncating cell values.
deals.show(5, truncate=False)

+-------+----------+---------------------+-----------+------------------------+-----------------------------------------------------+
|deal_id|deal_date |fio                  |phone      |email                   |address                                              |
+-------+----------+---------------------+-----------+------------------------+-----------------------------------------------------+
|1      |2024-03-04|Gregory Wu           |79746561889|paul80@example.net      |098 Yates Cliff Apt. 241, East Monica, DE 88076      |
|2      |2024-08-20|William Ross Jr.     |79074725672|xyoung@example.org      |197 Willie Groves Apt. 655, Port Angelaberg, LA 39384|
|3      |2024-10-15|Sonya Kerr           |79201244835|elewis@example.com      |144 Andrew Cape, Lake Nicholas, SC 58918             |
|4      |2024-12-31|Mrs. Angela Tucker MD|79771829751|robertparker@example.net|6056 Collins View, South Harold, OR 15650            |
|5      |2024-03-23|Eric Flores          |79729054809|barbara7

In [15]:
# vertical=True prints each row as its own record with one "field | value"
# line per column, which is easier to read for wide rows.
deals.show(2, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------
 deal_id   | 1                                                     
 deal_date | 2024-03-04                                            
 fio       | Gregory Wu                                            
 phone     | 79746561889                                           
 email     | paul80@example.net                                    
 address   | 098 Yates Cliff Apt. 241, East Monica, DE 88076       
-RECORD 1----------------------------------------------------------
 deal_id   | 2                                                     
 deal_date | 2024-08-20                                            
 fio       | William Ross Jr.                                      
 phone     | 79074725672                                           
 email     | xyoung@example.org                                    
 address   | 197 Willie Groves Apt. 655, Port Angelaberg, LA 39384 
only showing top 2 rows


In [23]:
# Check column types; note that deal_date is a string, not a date (see output).
deals.printSchema()

root
 |-- deal_id: long (nullable = true)
 |-- deal_date: string (nullable = true)
 |-- fio: string (nullable = true)
 |-- phone: long (nullable = true)
 |-- email: string (nullable = true)
 |-- address: string (nullable = true)



**Установка pyarrow**

In [19]:
# Remove all previously downloaded files from pip's cache.
!pip cache purge

Files removed: 6


In [20]:
# Upgrade pip to the latest version.
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m138.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3
    Uninstalling pip-23.3:
      Successfully uninstalled pip-23.3
Successfully installed pip-25.3


In [21]:
# Установим pyarrow без кэша и без проверки хэшей
!pip install --no-cache-dir --no-deps pyarrow

Collecting pyarrow
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/47.7 MB[0m [31m275.9 kB/s[0m  [36m0:01:41[0m[0m
[0mResuming download pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (20.0 MB/47.7 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m21.8/47.7 MB[0m [31m240.0 kB/s[0m  [36m0:01:48[0m[0m
[0mResuming download pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (21.8 MB/47.7 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/47.7 MB[0m [31m73.8 kB/s[0m  [36m0:05:25[0m[0m
[0mResuming download pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (23.8 MB/47.7 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m26.7/47.7 MB[0m [31m73.0 kB/s[0m  [36m0:04:49

In [22]:
# Sanity check that pyarrow is importable and report the installed version.
import pyarrow as pa
import pyarrow.parquet as pq
print(pa.__version__)

22.0.0


In [24]:
# Read only the Parquet file metadata (row count, number of row groups,
# format version) without loading the data itself.
pq.read_metadata(f'{path}/submits.parquet')  # Parquet recommends row groups sized between 512MB and 1GB

<pyarrow._parquet.FileMetaData object at 0x7fcd31d3f010>
  created_by: parquet-cpp-arrow version 19.0.0
  num_columns: 3
  num_rows: 4000
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 2371

In [25]:
# Cross-check against num_rows reported in the Parquet metadata.
submits.count() # number of rows

4000