In [None]:
#Install the required libraries
!pip install pyspark
!pip install spark
from pyspark.sql import SparkSession
# Initialize a Spark session
spark = SparkSession.builder.appName("02_feature_engineering_pyspark").getOrCreate()
spark.sparkContext.setLogLevel("WARN")



In [6]:
import os
import argparse
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType

# Input/output locations are declared as argparse options. An explicit empty
# argument list is parsed instead of sys.argv, so the defaults below are
# always the values that end up in `args`.
parser = argparse.ArgumentParser()
for flag, default_path in (
    ("--input_parquet", "/content/data/cleaned.paraquet"),
    ("--out_parquet", "/content/data/featured.paraquet"),
):
    parser.add_argument(flag, default=default_path)
args = parser.parse_args(args=[])

In [7]:
# Read the input parquet dataset and report its row and column counts.
# Note: df.count() triggers a Spark job over the data.
df = spark.read.parquet(args.input_parquet)
n_rows, n_cols = df.count(), len(df.columns)
print("Loaded df shape (approx):", n_rows, n_cols)

Loaded df shape (approx): 2499784 79


In [8]:
# Add derived ratio features. Each feature is only created when all of its
# source columns were present in the loaded frame; availability is checked
# against the column list captured before any feature is added.
# Every denominator has 1.0 added to it, so it is never exactly zero for
# non-negative inputs.
cols = df.columns
available = set(cols)
total_packets = F.col("tot_fwd_packets") + F.col("tot_bwd_packets")

if {'flow_duration', 'tot_bytes'} <= available:
    bytes_per_sec = F.col("tot_bytes") / (F.col("flow_duration") + F.lit(1.0))
    df = df.withColumn("bytes_per_sec", bytes_per_sec.cast(DoubleType()))

if {'tot_bytes', 'tot_fwd_packets', 'tot_bwd_packets'} <= available:
    avg_packet_size = F.col("tot_bytes") / (total_packets + F.lit(1.0))
    df = df.withColumn("avg_packet_size", avg_packet_size.cast(DoubleType()))

if {'tot_fwd_packets', 'tot_bwd_packets'} <= available:
    fwd_pkt_ratio = F.col("tot_fwd_packets") / (total_packets + F.lit(1.0))
    df = df.withColumn("fwd_pkt_ratio", fwd_pkt_ratio.cast(DoubleType()))


In [9]:
# Names of all columns whose Spark dtype is reported as 'string'.
string_cols = [name for name, dtype in df.dtypes if dtype == 'string']

In [10]:
# Keep only the string columns with at most 50 distinct values; these are the
# ones that will be one-hot encoded. One distinct-count job runs per column.
candidates = [
    name
    for name in string_cols
    if df.select(name).distinct().count() <= 50
]
print("Categorical columns to OHE (<=50 cardinality):", candidates)

Categorical columns to OHE (<=50 cardinality): []


In [11]:
# For every categorical candidate build a StringIndexer (<col> -> <col>_idx)
# and a OneHotEncoder (<col>_idx -> <col>_ohe). The two lists stay parallel
# to `candidates`.
indexers, encoders = [], []
for name in candidates:
    idx_name = name + "_idx"
    indexers.append(
        StringIndexer(inputCol=name, outputCol=idx_name, handleInvalid="keep")
    )
    encoders.append(
        OneHotEncoder(inputCol=idx_name, outputCol=name + "_ohe")
    )

In [12]:
# Collect the numeric columns that will be used as model features.
#
# df.dtypes reports decimal columns with their precision and scale, e.g.
# 'decimal(10,2)', so they are matched by prefix rather than by exact name.
# An exact comparison against 'decimal' would silently drop them.
_NUMERIC_DTYPES = ('double', 'int', 'long', 'float', 'bigint', 'tinyint', 'smallint')
numeric_cols = [
    c for c, t in df.dtypes
    if t in _NUMERIC_DTYPES or t.startswith('decimal')
]
numeric_cols = [c for c in numeric_cols if c not in ['y_binary']]  # exclude label

In [13]:
# The final feature list is the numeric columns followed by the one-hot
# encoded outputs of the categorical candidates.
ohe_cols = [f"{name}_ohe" for name in candidates]
feature_cols = [*numeric_cols, *ohe_cols]
print("Feature column count:", len(feature_cols))

Feature column count: 78


In [14]:
# Stage 1: pack all feature columns into a single vector column.
# handleInvalid="keep" keeps rows with invalid entries instead of dropping
# them or raising.
# NOTE(review): kept invalid entries presumably surface as NaN in the vector
# and would then affect the scaler statistics - confirm the input has no nulls.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_assembled",
    handleInvalid="keep",
)

# Stage 2: scale the assembled vector by standard deviation only
# (withStd=True) without centering on the mean (withMean=False).
scaler = StandardScaler(
    inputCol="features_assembled",
    outputCol="features",
    withStd=True,
    withMean=False,
)


In [15]:
# Stage order: all string indexers, then all one-hot encoders, then the
# vector assembler, and finally the scaler.
pipeline_stages = [*indexers, *encoders, assembler, scaler]
pipeline = Pipeline(stages=pipeline_stages)

In [16]:
# Fit every pipeline stage on the full frame, then apply the fitted pipeline
# to the same frame to obtain the transformed output.
print("Fitting pipeline...")
model = pipeline.fit(dataset=df)
df_trans = model.transform(dataset=df)

Fitting pipeline...


In [17]:
# Persist the fitted pipeline, replacing any previously saved copy at the
# same path. The parent directory is created first if it does not exist.
os.makedirs("models", exist_ok=True)
pipeline_writer = model.write().overwrite()
pipeline_writer.save("models/feature_pipeline")
print("Saved feature pipeline to models/feature_pipeline")

Saved feature pipeline to models/feature_pipeline


In [21]:
from pyspark.sql.functions import col

# Write the output dataset: every column of the pre-transform frame, the
# scaled "features" vector, and a copy of the label under the name
# "y_binary_new". Any existing output at the target path is overwritten.
output_columns = [
    *df.columns,
    "features",
    col("y_binary").alias("y_binary_new"),
]
(
    df_trans
    .select(*output_columns)
    .write
    .mode("overwrite")
    .parquet(args.out_parquet)
)



In [None]:
# Shut down the Spark session now that all outputs have been written.
spark.stop()