In [None]:
# =============================================================================
# IMPORT LIBRARIES AND INITIALIZE SPARK SESSION
# =============================================================================
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create (or reuse) the Spark session that backs every DataFrame in this notebook.
# The driver is given 4 GB of memory.
spark = (
    SparkSession.builder
    .appName("EmployeeAttritionPrediction")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)
# =============================================================================
# DATA LOADING
# =============================================================================
# Read the CSV: the first row supplies the column names and the column types
# are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv("HR_Employee_Cleaned.csv")

# Show the inferred schema, then the number of rows per Attrition value.
print("Schema:")
df.printSchema()
print("\nClass distribution:")
attrition_counts = df.groupBy("Attrition").count()
attrition_counts.show()
# =============================================================================
# PIPELINE PREPARATION
# =============================================================================
from pyspark.sql import functions as F

# Every column except the label feeds the feature vector. "classWeight" is
# excluded as well so the helper column added below can never leak into the
# features, even if this code is re-run on an already-weighted frame.
feature_columns = [
    name for name in df.columns if name not in ("Attrition", "classWeight")
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# =============================================================================
# CLASS WEIGHTS
# =============================================================================
# Row count per label value, collected to the driver as {label: count}.
class_weights = df.groupBy("Attrition").count().rdd.collectAsMap()

# The weighting below is written for a binary 0/1 label; fail with a clear
# message instead of a bare KeyError when that assumption does not hold.
missing_labels = {0, 1} - set(class_weights)
if missing_labels:
    raise ValueError(
        "Attrition must contain both classes 0 and 1 to compute class weights; "
        f"missing: {sorted(missing_labels)}, found: {list(class_weights)}"
    )

# Summing the per-class counts gives the total row count without running a
# second Spark job.
total = sum(class_weights.values())

# Balanced weights: each row gets total / (2 * rows_in_its_class), so both
# classes carry the same summed weight (total / 2) during training.
# NOTE: despite its name this is a plain Column expression, not a Python UDF.
weight_udf = F.when(F.col("Attrition") == 0, total / (2 * class_weights[0])) \
              .otherwise(total / (2 * class_weights[1]))
df = df.withColumn("classWeight", weight_udf)

# =============================================================================
# SPLITTING THE DATA
# =============================================================================
# Split ONCE (70/30), after the weight column exists. Taking train and test
# from the same randomSplit call guarantees they are disjoint; the extra
# "classWeight" column on the test rows is simply ignored at prediction time.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

# =============================================================================
# MODEL TRAINING: RANDOM FOREST CLASSIFIER
# =============================================================================
# Random Forest that reads per-row weights from "classWeight" to counter the
# class imbalance.
rf = RandomForestClassifier(
    labelCol="Attrition",
    featuresCol="features",
    weightCol="classWeight",
    seed=42
)

# Chain feature assembly and the classifier so they are fitted as one estimator
pipeline = Pipeline(stages=[assembler, rf])

# Hyperparameter search space; the cross-validator tries every combination
# (4 depths x 4 forest sizes x 3 bin counts x 3 gain thresholds x 2 impurities).
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 10, 15, 20])
    .addGrid(rf.numTrees, [50, 100, 150, 200])
    .addGrid(rf.maxBins, [32, 64, 128])
    .addGrid(rf.minInfoGain, [0.0, 0.1, 0.2])
    .addGrid(rf.impurity, ["gini", "entropy"])
    .build()
)

# Evaluators: the multiclass one reports accuracy/precision/recall/F1 later on,
# the binary one (area under ROC) drives model selection in cross-validation.
multi_evaluator = MulticlassClassificationEvaluator(labelCol="Attrition", metricName="f1")
bin_evaluator = BinaryClassificationEvaluator(labelCol="Attrition", metricName="areaUnderROC")

# 5-fold cross-validation over the grid, scored by area under ROC
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=bin_evaluator,
    numFolds=5,
    seed=42,
)

# Fit every grid candidate on the training split
cv_model = cv.fit(train_data)

# Score the held-out test split with the best pipeline found by cross-validation
best_model = cv_model.bestModel
test_predictions = best_model.transform(test_data)

# Test-set metrics. Precision and recall are the class-weighted variants
# ("weightedPrecision" / "weightedRecall"); each metric overrides the
# evaluator's default metricName for that single call.
accuracy, precision, recall, f1 = (
    multi_evaluator.evaluate(test_predictions, {multi_evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
)
auc = bin_evaluator.evaluate(test_predictions)

# Feature importances come from the last pipeline stage (the fitted forest).
# Pairing them with feature_columns by position assumes one vector slot per
# input column.
forest_model = best_model.stages[-1]
feature_importances = forest_model.featureImportances.toArray()
importance_rows = [
    (name, float(score))
    for name, score in zip(feature_columns, feature_importances)
]
features_importance_df = spark.createDataFrame(
    importance_rows,
    ["Feature", "Importance"],
).orderBy("Importance", ascending=False)
# =============================================================================
# FINAL EVALUATION
# =============================================================================
# Display results. "Precision" and "Recall" are the class-weighted metrics
# (weightedPrecision / weightedRecall) computed on the test split.
print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

print("\nTop 10 Features Influencing Attrition:")
features_importance_df.show(10, truncate=False)

# NOTE: the Spark session is intentionally NOT stopped here. The plotting cell
# that follows still calls toPandas()/collect() on features_importance_df, df
# and test_predictions, and those calls fail once the session has been
# stopped. Call spark.stop() only after the last cell that touches Spark.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Imported under an alias so it does not overwrite the `auc` score computed by
# the modelling cell.
from sklearn.metrics import roc_curve, auc as sk_auc

# After running the Random Forest model code in HR_Project_BigData.ipynb

# 1. Feature Importance Plot
plt.figure(figsize=(12, 8))
features_importance_pd = features_importance_df.toPandas()
top_features = features_importance_pd.head(10)  # frame is already sorted descending
plt.barh(top_features['Feature'], top_features['Importance'])
plt.xlabel('Importance Score')
plt.title('Top 10 Features Influencing Attrition (Random Forest)')
plt.gca().invert_yaxis()  # Most important at top
plt.show()

# 2. Class Distribution Pie Chart
# groupBy().count() returns rows in no guaranteed order, so labels and colors
# are looked up from each row's Attrition value rather than assigned by
# position. Sorting keeps the wedge order stable between runs.
attrition_names = {0: 'No Attrition', 1: 'Attrition'}
attrition_colors = {0: 'lightgreen', 1: 'lightcoral'}
class_dist = df.groupBy("Attrition").count().orderBy("Attrition").toPandas()
pie_labels = [attrition_names.get(value, str(value)) for value in class_dist['Attrition']]
pie_colors = [attrition_colors.get(value, 'lightgrey') for value in class_dist['Attrition']]
plt.figure(figsize=(6, 6))
plt.pie(class_dist['count'], labels=pie_labels,
        autopct='%1.1f%%', colors=pie_colors)
plt.title('Class Distribution in Dataset')
plt.show()

# 3. ROC Curve (if you want to plot it manually)
# Collect label and probability together in a single Spark job so each label
# is paired with the probability from the same row.
scored_rows = test_predictions.select("Attrition", "probability").collect()
y_true = [row["Attrition"] for row in scored_rows]
# Index 1 of the probability vector is the predicted probability of class 1.
y_prob = [float(row["probability"][1]) for row in scored_rows]

fpr, tpr, _thresholds = roc_curve(y_true, y_prob)
roc_auc = sk_auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve')
plt.legend(loc="lower right")
plt.show()