# In [None]:
# evaluation_student_performance.ipynb

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("StudentPerformanceEvaluation")
    .getOrCreate()
)

# UCI Student Performance data: semicolon-separated CSV with a header row.
data_path = "student-mat.csv"  # Put the dataset file here or update path
df = spark.read.csv(data_path, sep=";", header=True)

# Binary target: a student "passed" when the final grade G3 is at least 10.
# No schema inference is requested on read, so G3 is cast to int explicitly.
# NOTE(review): a G3 value that fails the cast presumably falls through to
# otherwise(0) and is labelled as "not passed" -- confirm the data has none.
df = (
    df.withColumn("G3_int", col("G3").cast("int"))
      .withColumn("label", when(col("G3_int") >= 10, 1).otherwise(0))
)

# Keep the model simple: only a handful of numeric columns are used as
# features, each converted to double in place before assembling.
feature_cols = ["absences", "age", "failures", "G1", "G2"]
for c in feature_cols:
    df = df.withColumn(c, col(c).cast("double"))

from pyspark.ml.feature import VectorAssembler

# Pack the feature columns into the single vector column the classifier reads.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
assembled = assembler.transform(df)
data = assembled.select("features", "label")

# 80/20 train/test split with a fixed seed.
splits = data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits

# Fit a logistic regression classifier on the training split.
lr = LogisticRegression(labelCol="label", featuresCol="features")
model = lr.fit(train_data)

print(f"Model coefficients: {model.coefficients}")
print(f"Model intercept: {model.intercept}")

# The summary attached to the fitted model describes the TRAINING data only.
training_summary = model.summary

# Show the training ROC curve, then report the area under it.
roc = training_summary.roc
roc.show()

print(f"Area under ROC curve: {training_summary.areaUnderROC}")

# Choose the decision threshold with the highest training F-measure:
# first find the maximum F-measure, then read the threshold of the first
# row that reaches it.
f_measure = training_summary.fMeasureByThreshold
max_row = f_measure.agg(spark_max("F-Measure").alias("max_f_measure")).collect()[0]
max_f_measure = max_row["max_f_measure"]
rows_at_max = (
    f_measure
    .where(f_measure["F-Measure"] == max_f_measure)
    .select("threshold")
    .collect()
)
best_threshold = rows_at_max[0]["threshold"]
print(f"Best threshold by F-measure: {best_threshold}")

# Use the F-measure-optimal threshold for the model's hard predictions.
model.setThreshold(best_threshold)

# Score the held-out split and preview a few rows untruncated.
predictions = model.transform(test_data)
preview_cols = ["label", "rawPrediction", "probability", "prediction"]
predictions.select(*preview_cols).show(n=5, truncate=False)

# Area under the precision-recall curve on the test predictions.
# NOTE(review): the evaluator reads the "rawPrediction" column, so the
# threshold set above presumably does not influence this metric -- confirm.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderPR",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
area_under_pr = evaluator.evaluate(predictions)
print(f"Area under Precision-Recall curve: {area_under_pr}")

def better_than_random(metric, baseline=0.5):
    """Return whether *metric* strictly exceeds a chance-level *baseline*.

    Args:
        metric: Score achieved by the evaluated model (for example an
            area-under-curve value).
        baseline: Score a random classifier would be expected to achieve.
            Defaults to 0.5, the chance level for area under the ROC curve.
            For area under the precision-recall curve the chance level is
            the positive-class rate, which should be passed explicitly.

    Returns:
        True if ``metric`` is strictly greater than ``baseline``; a metric
        equal to the baseline is not considered better than random.
    """
    return metric > baseline

# Both verdicts are made on the held-out test predictions: a training-set
# score is optimistic and says little about whether the model generalises.
# The existing evaluator is reused with its metric overridden for this call.
test_area_under_roc = evaluator.evaluate(
    predictions, {evaluator.metricName: "areaUnderROC"}
)
print("Better than random (ROC):", better_than_random(test_area_under_roc))

# A single-argument better_than_random() call compares against 0.5, which is
# the chance level for ROC only. For the precision-recall curve a random
# classifier scores about the positive-class rate, so compare against the
# share of label == 1 rows in the test predictions instead.
positive_rate = predictions.selectExpr("avg(label)").first()[0]
pr_baseline = positive_rate or 0.0  # avg() yields None on an empty frame
print("Better than random (PR):", area_under_pr > pr_baseline)

spark.stop()
