<a href="https://colab.research.google.com/github/elmanbkng123/BigData/blob/main/UAS_Big_Data%26Pal_22_11_5336.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# --- Session setup and data loading ---
# Obtain the Spark session used by every step of this script.
spark = (
    SparkSession.builder
    .appName("Credit Risk Analysis")
    .getOrCreate()
)

# Read the raw CSV (first row is the header, column types are inferred)
# and preview the first five rows.
data = spark.read.csv("credit_risk_data.csv", header=True, inferSchema=True)
data.show(5)

# --- Preprocessing ---
# Drop rows that contain missing values.
data = data.dropna()

# Encode each categorical string column as a numeric index stored in a
# companion "<column>Index" column.
categorical_cols = ["EmploymentStatus", "LoanPurpose"]
indexed_cols = [f"{col_name}Index" for col_name in categorical_cols]
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)
indexer_model = indexer.fit(data)
data = indexer_model.transform(data)

# --- Feature assembly ---
# Pack the numeric columns followed by the indexed categorical columns into
# the single "features" vector column consumed by the classifiers.
numeric_cols = ["Age", "AnnualIncome", "CreditScore", "LoanAmount", "LoanTerm"]
assembler = VectorAssembler(
    inputCols=numeric_cols + indexed_cols,
    outputCol="features",
)
data = assembler.transform(data)

# --- Train/test split ---
# 80/20 split; the fixed seed keeps the split the same between runs.
splits = data.randomSplit([0.8, 0.2], seed=42)
train, test = splits

# Model Training and Evaluation
# Candidate classifiers keyed by display name. Each one reads the assembled
# "features" vector column and predicts the "Default" label column.
models = {
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="Default"),
    "Gradient Boost Tree": GBTClassifier(featuresCol="features", labelCol="Default"),
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="Default"),
    "SVM": LinearSVC(featuresCol="features", labelCol="Default")
}

# Initialize evaluators once, outside the loop: they do not depend on the
# model being scored, so rebuilding them per iteration is wasted work.
evaluator = BinaryClassificationEvaluator(labelCol="Default")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="Default", metricName="accuracy")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="Default", metricName="f1")

# Train and evaluate models.
# results maps model name -> {"AUC", "Accuracy", "F1 Score"} on the test set.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    fitted_model = model.fit(train)
    # Three metrics are computed from the same predictions; caching lets the
    # test set be scored once instead of once per metric.
    predictions = fitted_model.transform(test).cache()
    try:
        auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)
    finally:
        # Release the cached predictions before moving on to the next model.
        predictions.unpersist()
    results[name] = {"AUC": auc, "Accuracy": accuracy, "F1 Score": f1_score}
    print(f"{name} - AUC: {auc:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}")

# Hyperparameter Tuning for the Best Models
# Example for Random Forest
# The grid must be built from the params of the *same estimator instance*
# that is handed to CrossValidator. Params taken from the class
# (RandomForestClassifier.numTrees) are not owned by any instance, so a grid
# built from them is not applied to the estimator being tuned.
rf = RandomForestClassifier(featuresCol="features", labelCol="Default")
grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100, 150]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()

# 5-fold cross-validation over the 3 x 3 grid, scored with the binary
# classification evaluator defined earlier in the script.
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=grid,
                          evaluator=evaluator,
                          numFolds=5)
best_model = crossval.fit(train).bestModel

# Evaluate the best model on the held-out test set.
best_predictions = best_model.transform(test)
final_auc = evaluator.evaluate(best_predictions, {evaluator.metricName: "areaUnderROC"})
print(f"Best Model AUC after Hyperparameter Tuning: {final_auc:.4f}")

# Save model and predictions
# Overwrite any existing output so the cell can be re-run without failing on
# an already-existing path.
best_model.write().overwrite().save("best_rf_model")

# Persist the tuned model's predictions (not the raw test split). The CSV
# writer cannot serialize ML vector columns, so the assembled features and
# the model's vector-valued outputs are dropped before writing; the original
# input columns, the label and the "prediction" column remain.
best_predictions \
    .drop("features", "rawPrediction", "probability") \
    .write \
    .mode("overwrite") \
    .csv("test_predictions.csv", header=True)


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/credit_risk_data.csv.