In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import * 
from pyspark.ml.feature import StringIndexer, Word2Vec, VectorAssembler
from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.sql.types import ArrayType, FloatType
import pyspark.sql.functions as F

In [0]:
# Fully qualified name of the table that holds the Telco churn data
telco_path = "sdd_dev.sohag_test.telco_customer_churn"

# Read the table into a DataFrame and preview it
telco_df = spark.table(telco_path)
display(telco_df)

# Processing the dataset

## Handling missing values
* NULL values might be stored as "null"

In [0]:
# String literals that stand in for a missing value once whitespace is trimmed
bad_vals = ["null", ""]          # add more literals if you find them

# Replace the placeholders with real NULLs in every column using ONE projection.
# Calling withColumn() once per column adds a projection per call and inflates
# the query plan; a single select() yields the same columns in the same order.
telco_df = telco_df.select(
    [
        when(trim(col(c)).isin(bad_vals), None)      # "null", "", "   "
        .otherwise(col(c))
        .alias(c)
        for c in telco_df.columns
    ]
)

In [0]:
# Step 2: cast the columns that should be numeric to double
# (extend the list with any other numeric columns present in the table)
numeric_cols = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
for numeric_name in numeric_cols:
    as_double = F.col(numeric_name).cast(DoubleType())
    telco_df = telco_df.withColumn(numeric_name, as_double)

In [0]:
# # 3️⃣  Optional: drop any rows that still contain nulls -----------------------
# telco_df = telco_df.dropna() 

## Converting datatype
* SeniorCitizen should be Boolean.
* TotalCharges should be double

In [0]:
# Represent SeniorCitizen as a boolean flag. The comparison evaluates to NULL
# when the value is missing, so unknown rows stay NULL instead of being
# silently recorded as False (which when(...).otherwise(False) would do).
telco_df = telco_df.withColumn("SeniorCitizen", col("SeniorCitizen") == 1)

display(telco_df)

In [0]:
# List the column names of the cleaned DataFrame
telco_df.columns

# Splitting the dataset into training and testing sets

In [0]:
# 80/20 train/test split with a fixed seed of 42
split_weights = [.8, .2]
train_df, test_df = telco_df.randomSplit(split_weights, seed=42)

# Transforming the dataset
* Converting Integer and Boolean columns to double

In [0]:
# Collect the names of every integer- or boolean-typed column
integer_cols = [
    field.name
    for field in train_df.schema.fields
    if isinstance(field.dataType, (IntegerType, BooleanType))
]
print(integer_cols)

# Cast those columns to double in both the training and the test split
for name in integer_cols:
    as_double = col(name).cast("double")
    train_df = train_df.withColumn(name, as_double)
    test_df = test_df.withColumn(name, as_double)

In [0]:
# Get a list of numeric columns
num_cols = [c.name for c in train_df.schema.fields if c.dataType == DoubleType()]
print(num_cols)

# Count the NULLs of every numeric column in ONE aggregation instead of
# launching a separate filter().count() job per column.
if num_cols:
    num_null_counts = train_df.select(
        [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in num_cols]
    ).first().asDict()
else:
    # Nothing to aggregate when there are no double columns
    num_null_counts = {}

# Names of the numeric columns that contain at least one missing value
num_missing_cols = [c for c in num_cols if num_null_counts[c] > 0]
print(num_missing_cols)

In [0]:
# 1. Names of all string-typed columns
string_cols = [
    field.name
    for field in train_df.schema.fields
    if isinstance(field.dataType, StringType)
]

# 2. One aggregate expression per string column that counts its NULL entries
string_missing_values_logic = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in string_cols
]

# Evaluate every count in a single pass and keep the columns with a non-zero count
row_dict_string = train_df.select(string_missing_values_logic).first().asDict()
string_missing_cols = [
    name for name, n_missing in row_dict_string.items() if n_missing > 0
]

print(f"String columns with missing values: {string_missing_cols}")

# Creating a Feature Engineering Pipeline (Spark ML pipeline)
* Generating embeddings for categorical features
* Handling missing values
* Standardizing numerical features
* Combining features into a final vector
* Encapsulating steps into a pipeline

In [0]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split, col

def generate_categorical_embeddings(df, categorical_cols, vector_size = 5, seed = 42, model = None):
    """
    Generate a row-level embedding for categorical columns using Word2Vec

    Each row becomes a "sentence" holding one token per categorical column, and
    Word2Vec writes a single vector for that row into `categorical_struct`.

    Parameters:
    df (pyspark.sql.DataFrame): Input DataFrame
    categorical_cols (list): List of categorical columns to generate embeddings for
    vector_size (int): Size of the embedding vector (ignored when `model` is given)
    seed (int): Seed passed to Word2Vec so repeated fits use the same seed
        (ignored when `model` is given)
    model (pyspark.ml.feature.Word2VecModel): Optional already-fitted model to reuse,
        e.g. one fitted on the training split. It must read `categorical_tokens`
        and write `categorical_struct`. When None, a new model is fitted on `df`.

    Returns:
    pyspark.sql.DataFrame: `df` with NULLs in `categorical_cols` replaced by "unknown"
        and the added columns `categorical_sequence`, `categorical_tokens` and
        `categorical_struct`

    Raises:
    ValueError: If `categorical_cols` is empty
    """
    categorical_cols = list(categorical_cols)
    if not categorical_cols:
        raise ValueError("categorical_cols must contain at least one column name")

    # Replace NULL categorical values with "unknown"
    for col_name in categorical_cols:
        df = df.withColumn(
            col_name,
            F.when(F.col(col_name).isNull(), "unknown").otherwise(F.col(col_name)),
        )

    # Keep the space-joined text column so existing readers of it still work
    df = df.withColumn("categorical_sequence", F.concat_ws(" ", *categorical_cols))

    # Build the token list with exactly one token per column. Splitting the joined
    # text on " " would break any category value that itself contains a space
    # into several unrelated tokens.
    df = df.withColumn(
        "categorical_tokens",
        F.array(*[F.col(name).cast("string") for name in categorical_cols]),
    )

    # Fit a Word2Vec model unless the caller supplied a fitted one to reuse
    if model is None:
        word2vec = Word2Vec(
            vectorSize=vector_size,
            seed=seed,
            inputCol="categorical_tokens",
            outputCol="categorical_struct",
        )
        model = word2vec.fit(df)

    # Generate embeddings for categorical columns
    return model.transform(df)

## Applying Embedding to Categorical Features

In [0]:
# NOTE(review): "Churn" looks like the prediction target of this dataset — confirm.
# It is held out of the feature columns because embedding the label into the
# feature vector would leak it into the model inputs.
label_col = "Churn"
categorical_columns = ["gender", "Partner", "InternetService", "Contract", "PaperlessBilling", "PaymentMethod"]

# Generate embeddings for categorical columns
# NOTE(review): each call below fits its own Word2Vec model, so the train and
# test embeddings are not guaranteed to share one vector space; consider
# fitting on the training split only and reusing that model for the test split.
train_df = generate_categorical_embeddings(train_df, categorical_columns)
test_df = generate_categorical_embeddings(test_df, categorical_columns)

display(train_df)

In [0]:
# The Word2Vec stage already writes an ML vector into "categorical_struct", so the
# column is referenced directly. This avoids a Python UDF round-trip for every row
# and the UDF's hard-coded fallback length of 5, which would not follow a
# different embedding size.
embedding_col = F.col("categorical_struct")

# Expose the embedding under each "<column>_embeddings" name used by later cells.
# NOTE(review): there is one Word2Vec vector per row, so all of these columns hold
# the same vector and the assembled feature vector repeats it once per
# categorical column — confirm this duplication is intended.
for col_name in categorical_columns:
  train_df = train_df.withColumn(col_name + "_embeddings", embedding_col)
  test_df = test_df.withColumn(col_name + "_embeddings", embedding_col)

# Drop unnecessary columns after embeddings are extracted
train_df = train_df.drop("categorical_sequence", "categorical_tokens", "categorical_struct")
test_df = test_df.drop("categorical_sequence", "categorical_tokens", "categorical_struct")

display(train_df)

In [0]:
# Preview one categorical column next to its embedding, without truncating values
payment_preview = train_df.select("PaymentMethod", "PaymentMethod_embeddings")
payment_preview.show(truncate=False)

## Feature Engineering and Pipeline Initialization
* Handle missing values in numerical columns
* Standardize numerical features
* Assemble the Final Feature Vector
* Initializing the Spark ML Pipeline

In [0]:
from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Numerical inputs that go through imputation and scaling
numerical_cols = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
imputed_cols = [f"{name}_imputed" for name in numerical_cols]
embedding_cols = [f"{name}_embeddings" for name in categorical_columns]

# Stage 1: fill missing numerical values into "<column>_imputed" columns
imputer = Imputer(inputCols=numerical_cols, outputCols=imputed_cols)

# Stage 2: pack the imputed numerical columns into one vector
numerical_assembler = VectorAssembler(
    inputCols=imputed_cols,
    outputCol="numerical_assembled",
)

# Stage 3: scale the numerical vector
numerical_Scaler = StandardScaler(
    inputCol="numerical_assembled",
    outputCol="numerical_scaled",
)

# Stage 4: join the scaled numerical vector with the categorical embeddings
feature_cols = ["numerical_scaled", *embedding_cols]
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="all_features")

# Order in which the transformations run
stages_list = [imputer, numerical_assembler, numerical_Scaler, vector_assembler]

# Build the pipeline and fit it on the training split
pipeline = Pipeline(stages=stages_list)
pipeline_model = pipeline.fit(train_df)

In [0]:
# Apply the fitted pipeline to the training and the test split
train_transformed_df, test_transformed_df = (
    pipeline_model.transform(split_df) for split_df in (train_df, test_df)
)

In [0]:
# Preview the assembled feature vector of the transformed training split
all_features_preview = train_transformed_df.select("all_features")
display(all_features_preview)

# Save the pipeline

In [0]:
# Persist the fitted pipeline, overwriting anything already stored at the path
model_path = "/Workspace/Shared/artifact_wheels/spark_pipelines"
pipeline_writer = pipeline_model.write().overwrite()
pipeline_writer.save(model_path)

# Load and use saved model

In [0]:
from pyspark.ml import PipelineModel

# Restore the fitted pipeline from the path it was saved to
loaded_pipeline = PipelineModel.read().load(model_path)

# Inspect the stages of the restored pipeline
loaded_pipeline.stages

In [0]:
# Run the test split through the reloaded pipeline and display the result
test_transformed_df = loaded_pipeline.transform(dataset=test_df)
display(test_transformed_df)