In [0]:
# 1. LOAD THE CLEANED DATA

from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running session when one exists;
# otherwise a new one is started under this application name.
spark = (
    SparkSession.builder
    .appName("SantanderFeatureEngineering")
    .getOrCreate()
)

# Path the input Delta table is read from.
delta_path = "/Volumes/workspace/default/santader_delta_dataset"
df = spark.read.load(delta_path, format="delta")

# Sanity check: preview the first rows and print the column types.
df.show(n=5)
df.printSchema()

+----------+------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+-------------------+------------------+-------------------+-------------------+--------------------+

In [0]:
# 2. VARIABLE ANALYSIS
# Keep every column whose Spark type is exactly 'double' or 'int', except 'target'.
# NOTE: other numeric types (float, bigint, decimal, ...) are not matched by this filter.
numeric_cols = [
    name
    for name, spark_type in df.dtypes
    if name != 'target' and spark_type in ('double', 'int')
]

# Summary statistics for the selected columns.
numeric_summary = df.select(numeric_cols).describe()
numeric_summary.show()


+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [0]:
# 3. NUMERIC FEATURE PROCESSING

from pyspark.ml.feature import StandardScaler, VectorAssembler

# StandardScaler works on a single vector column, so first pack all numeric
# columns into "features_vector".
assembler = VectorAssembler(
    outputCol="features_vector",
    inputCols=numeric_cols,
)
df_vector = assembler.transform(df)

# Center (withMean) and scale to unit standard deviation (withStd).
scaler = StandardScaler(
    withStd=True,
    withMean=True,
    inputCol="features_vector",
    outputCol="scaled_features",
)
# Fit and apply on the same assembled frame.
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

# Preview the standardized vectors.
df_scaled.select("scaled_features").show(n=5)



+--------------------+
|     scaled_features|
+--------------------+
|[-0.3974546935055...|
|[-0.7490215000893...|
|[1.80294803892072...|
|[-0.1893845426701...|
|[0.98511583738521...|
+--------------------+
only showing top 5 rows


In [0]:

from pyspark.sql.functions import col, mean, stddev

# Mean and standard deviation of every numeric column, computed in one aggregation.
# Result columns are named "<column>_mean" and "<column>_std".
mean_exprs = [mean(name).alias(f"{name}_mean") for name in numeric_cols]
std_exprs = [stddev(name).alias(f"{name}_std") for name in numeric_cols]
agg_exprs = mean_exprs + std_exprs

df_agg = df.agg(*agg_exprs)
df_agg.show()



+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------

In [0]:

# Drop near-constant columns: those whose standard deviation is below the threshold.
LOW_STD_THRESHOLD = 0.01

# Only inspect columns that are still present in df, so this cell can be re-run
# after a previous run has already dropped some of them.
present_cols = set(df.columns)
candidate_cols = [c for c in numeric_cols if c in present_cols]

low_variance_cols = []
if candidate_cols:  # df.agg() needs at least one expression
    # A single aggregation for all columns instead of one Spark job (and one
    # collect) per column.
    std_row = df.agg(*[stddev(c).alias(c) for c in candidate_cols]).collect()[0]
    for c, std in zip(candidate_cols, std_row):
        # stddev is null when it cannot be computed (e.g. fewer than two non-null
        # values); such columns are kept rather than crashing on `None < float`.
        if std is not None and std < LOW_STD_THRESHOLD:
            low_variance_cols.append(c)

print("Низьковаріативні колонки:", low_variance_cols)
df = df.drop(*low_variance_cols)



Низьковаріативні колонки: []


In [0]:

# Save the final dataset as a Delta table, replacing whatever is already at output_path.
# NOTE(review): this writes `df`; the standardized vectors in `df_scaled` are not
# persisted - confirm that is intended.
output_path = "/Volumes/workspace/default/santader_delta_2"
df.write.save(output_path, format="delta", mode="overwrite")
