# In [None]:
# Standard library
import random
import time

# Import PySpark essentials
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, ChiSqSelector
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf, col
from pyspark.sql.types import BooleanType, IntegerType

# 1. Start Spark Session
# Build (or reuse) a session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("UCI Student Performance Feature Generation")
    .getOrCreate()
)

# 2. Load UCI Student Performance data (Math students dataset)
# spark.read.csv() resolves its path through Hadoop filesystems, which do not
# handle http(s) URLs. The remote file is therefore fetched first with
# SparkContext.addFile() (which accepts HTTP/HTTPS URIs) and then read from
# the local copy. Reading via a file:// path is fine here because the session
# runs with a local master.
# NOTE(review): confirm this URL still serves the raw CSV; the UCI archive may
# only publish it inside student.zip.
data_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student-mat.csv"
spark.sparkContext.addFile(data_path)

# addFile() stores the download under its base file name.
local_data_path = "file://" + SparkFiles.get(data_path.rsplit("/", 1)[-1])
df = spark.read.csv(local_data_path, header=True, sep=";")

df.show(5)

# 3. Define simple feature extraction functions

# Example: Convert G3 (final grade) to integer and binarize as pass/fail label.
# Values that cannot be parsed as integers become null after the cast.
df = df.withColumn("G3_int", col("G3").cast(IntegerType()))

# Binarize G3: pass if grade >= 10, else fail.
# A native Column comparison is used instead of a Python UDF: it runs inside
# the JVM (no per-row Python round trip) and yields null for a null grade,
# whereas `None >= 10` inside a UDF raises TypeError and fails the job.
df = df.withColumn("passed", col("G3_int") >= 10)

df.select("G3_int", "passed").show(5)

# 4. Extract features - for demonstration, tokenize 'reason' (reason to choose school)
# The tokenizer reads the "reason" column and writes its tokens to "words".
tokenizer = Tokenizer().setInputCol("reason").setOutputCol("words")
tokenized_df = tokenizer.transform(df)

# Preview the raw text next to its tokens without truncating cell contents.
tokenized_df.select(col("reason"), col("words")).show(n=5, truncate=False)

# 5. Hashing Term Frequency for tokenized words
# Hash each token of "words" into a 20-dimensional term-frequency vector
# stored in "features".
hashingTF = (
    HashingTF()
    .setNumFeatures(20)
    .setInputCol("words")
    .setOutputCol("features")
)
featurized_df = hashingTF.transform(tokenized_df)

# Preview tokens next to their hashed vectors without truncating cell contents.
featurized_df.select(col("words"), col("features")).show(n=5, truncate=False)

# 6. Prepare labeled dataset: features and label (passed)
# ChiSqSelector validates that its label column is numeric, and "passed" is a
# boolean column, so fitting on it directly fails schema validation. A
# double-typed "label" copy (true -> 1.0, false -> 0.0) is added for the
# selector, while "passed" is kept unchanged for display.
final_df = featurized_df.select(
    "features",
    "passed",
    col("passed").cast("double").alias("label"),
)

# 7. Use Chi-Square Selector to select top K features correlated with label
selector = ChiSqSelector(
    numTopFeatures=5,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="label",
)
model = selector.fit(final_df)
result_df = model.transform(final_df)

result_df.select("selectedFeatures", "passed").show(5, truncate=False)

# 8. Feature binarization example: create a binary feature from number of absences.
# Values that cannot be parsed as integers become null after the cast.
df = df.withColumn("absences_int", col("absences").cast(IntegerType()))

# Flag students with more than 5 absences.
# A native Column comparison is used instead of a Python UDF: it runs inside
# the JVM (no per-row Python round trip) and yields null for a null count,
# whereas `None > 5` inside a UDF raises TypeError and fails the job.
df = df.withColumn("high_absences", col("absences_int") > 5)

df.select("absences_int", "high_absences").show(5)

# 9. Stop the Spark session created in step 1 now that every step above has
# produced its output.
spark.stop()
