In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import * 
from pyspark.ml.feature import StringIndexer, Word2Vec, VectorAssembler
from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.sql.types import ArrayType, FloatType
import pyspark.sql.functions as F

In [0]:
# Fully qualified name of the source table holding the Telco churn data.
telco_path = "sdd_dev.sohag_test.telco_customer_churn"

# Read the table into a Spark DataFrame and render it in the notebook.
telco_df = spark.table(telco_path)
display(telco_df)

# Processing the dataset

## Handling missing values
* NULL values might be stored as the literal string "null", or as empty or whitespace-only strings.

In [0]:
# Literal strings that stand in for a missing value. Values are trimmed before
# the comparison, so whitespace-only strings match "" as well.
bad_vals = ["null", ""]          # add more literals if you find them

# Replace every placeholder with a real NULL, leaving all other values as-is.
# All columns are rewritten in ONE select() instead of one withColumn() per
# column: each withColumn() call adds another projection to the query plan,
# and the PySpark documentation warns that doing so in a loop can generate
# very large plans. Column names and column order are unchanged.
telco_df = telco_df.select([
    when(trim(col(c)).isin(bad_vals), None)      # "null", "", "   "
    .otherwise(col(c))
    .alias(c)
    for c in telco_df.columns
])

In [0]:
# Step 2: cast the columns that should be numeric to double.
# Extend this list with any other numeric columns in the table.
numeric_cols = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]

for numeric_name in numeric_cols:
    # Replace the column in place with its double-typed version.
    as_double = F.col(numeric_name).cast(DoubleType())
    telco_df = telco_df.withColumn(numeric_name, as_double)

In [0]:
# # 3️⃣  Optional: drop any rows that still contain nulls -----------------------
# telco_df = telco_df.dropna() 

## Converting data types
* SeniorCitizen should be Boolean.
* TotalCharges should be double.

In [0]:
# Turn the numeric SeniorCitizen flag into a boolean: 1 -> True, any other
# non-null value -> False.
# The comparison is used directly instead of when(...).otherwise(False) so that
# a missing SeniorCitizen stays NULL; with otherwise(False) a NULL would be
# silently reported as "not a senior citizen" and the missing value would no
# longer be visible to later missing-value checks.
telco_df = telco_df.withColumn("SeniorCitizen", col("SeniorCitizen") == 1)

display(telco_df)

# Splitting the dataset into training and testing sets

In [0]:
# Randomly split the rows with weights 0.8 (train) and 0.2 (test), seed 42.
train_fraction, test_fraction = .8, .2
train_df, test_df = telco_df.randomSplit([train_fraction, test_fraction], seed=42)

# Transforming the dataset
* Converting integer and Boolean columns to double

In [0]:
# Spark types whose columns are converted to double: every integral width
# (byte/short/int/long) plus boolean. Checking only IntegerType would skip
# e.g. bigint (LongType) columns, which would then also be missed by any later
# step that looks for DoubleType columns.
castable_types = (ByteType, ShortType, IntegerType, LongType, BooleanType)

# Get a list of integer and boolean columns
integer_cols = [
    field.name
    for field in train_df.schema.fields
    if isinstance(field.dataType, castable_types)
]
print(integer_cols)

# Apply the same cast to both splits so their schemas stay identical.
for column in integer_cols:
    train_df = train_df.withColumn(column, col(column).cast("double"))
    test_df = test_df.withColumn(column, col(column).cast("double"))

In [0]:
# Get a list of numeric columns
num_cols = [c.name for c in train_df.schema.fields if c.dataType == DoubleType()]
print(num_cols)

# List of numeric columns that contain at least one missing value.
# The NULL counts of all numeric columns are computed in a single aggregation
# (one Spark job) instead of running a separate filter().count() job per
# column. count() ignores NULLs, and when() without otherwise() yields NULL for
# non-matching rows, so each count is the number of rows where the column is NULL.
num_missing_cols = []
if num_cols:  # nothing to aggregate when there are no numeric columns
    num_null_counts = train_df.select([
        count(when(col(c).isNull(), c)).alias(c) for c in num_cols
    ]).first().asDict()
    num_missing_cols = [c for c in num_cols if num_null_counts[c] > 0]
print(num_missing_cols)

In [0]:
# 1. Collect the names of all string-typed columns in the training split
string_cols = [
    field.name
    for field in train_df.schema.fields
    if isinstance(field.dataType, StringType)
]

# 2. Build one NULL-counting expression per string column and evaluate them
#    all in a single aggregation; the resulting row maps column -> NULL count
string_missing_values_logic = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in string_cols
]
null_counts_row = train_df.select(string_missing_values_logic).first()
row_dict_string = null_counts_row.asDict()

# Keep only the columns that have at least one NULL
string_missing_cols = [
    name for name, n_null in row_dict_string.items() if n_null > 0
]

print(f"String columns with missing values: {string_missing_cols}")

# Creating a Feature Engineering Pipeline (Spark ML pipeline)
* Generating embeddings for categorical features
* Handling missing values
* Standardizing numerical features
* Combining features into a Final vector
* Encapsulating steps into a pipeline