# ML Model Training

In [0]:
# Import Delta Lake dependencies
from delta.tables import *

# Load the Delta-format table at the warehouse path into a Spark DataFrame
df = spark.read.load("dbfs:/user/hive/warehouse/delta-table", format="delta")


## Model development

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Single seed shared by every stochastic step so a re-run reproduces the same
# split and the same forest.
SEED = 42

# Initialize Spark session (getOrCreate returns the active session if one exists)
spark = SparkSession.builder.appName("PropertyTypePrediction").getOrCreate()

# 1. Input data: `df` is the DataFrame read from the Delta table in the cell above

# 2. Encode Target Variable (Property Type)
# NOTE(review): StringIndexer defaults to handleInvalid="error"; a Property_Type
# value that occurs only in the test split would make model.transform() raise.
# Confirm every class is present in the training split, or set handleInvalid.
indexer = StringIndexer(inputCol="Property_Type", outputCol="PropertyTypeIndex")

# 3. Feature Selection and Vectorization
feature_cols = ['List_Year', 'Assessed_Value', 'Sale_Amount', 'Sales_Ratio']  # Add more numeric features
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# 4. Split Data into Train and Test Sets (80% / 20%)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=SEED)

# 5. Configure Random Forest Classifier
# The seed is set explicitly: without it the estimator falls back to a default
# seed, so bootstrap/feature sampling is not pinned by this notebook.
rf = RandomForestClassifier(
    labelCol="PropertyTypeIndex",
    featuresCol="features",
    numTrees=50,
    seed=SEED,
)

# 6. Create a Pipeline: label indexing -> feature assembly -> classifier
pipeline = Pipeline(stages=[indexer, assembler, rf])

# 7. Train the Model (fits all pipeline stages on the training split only)
model = pipeline.fit(train_data)



## Model testing

In [0]:

# 8. Score the held-out split with the fitted pipeline
predictions = model.transform(test_data)

# 9. Compare the predicted class index with the indexed label and report accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="PropertyTypeIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy}")

# Optional: inspect a sample of the scored rows
# predictions.select("Property_Type", "prediction", "features").show()

Test Accuracy: 0.6882716049382716


## Save the model

In [0]:
import os

# Resolve the model directory name against the current working directory
save_path = os.path.abspath("property_type_model")

# Persist the fitted pipeline, replacing any copy already stored at that path
model_writer = model.write().overwrite()
model_writer.save(save_path)
print(f"Model saved successfully at: {save_path}")

Model saved successfully at: /Workspace/Users/azuser2373_mml.local@techademy.com/property_type_model


## Prediction Output

In [0]:
from pyspark.ml import PipelineModel

# Resolve the model directory name against the current working directory
load_path = os.path.abspath("property_type_model")

# Restore the persisted pipeline model from that path
loaded_model = PipelineModel.load(load_path)
print("Model loaded successfully!")

# Score the test split with the restored model and display the key columns
new_predictions = loaded_model.transform(test_data)
preview_cols = ["Property_Type", "prediction", "features"]
new_predictions.select(*preview_cols).show()


Model loaded successfully!
+-------------+----------+--------------------+
|Property_Type|prediction|            features|
+-------------+----------+--------------------+
|Single Family|       0.0|[2013.0,170890.0,...|
|Single Family|       0.0|[2017.0,35840.0,6...|
|Single Family|       1.0|[2017.0,46725.0,9...|
|Single Family|       1.0|[2017.0,81970.0,9...|
|        Condo|       0.0|[2017.0,101170.0,...|
|Single Family|       0.0|[2017.0,117150.0,...|
|  Four Family|       0.0|[2017.0,161490.0,...|
|Single Family|       0.0|[2017.0,116935.0,...|
|Single Family|       0.0|[2017.0,52360.0,4...|
|        Condo|       0.0|[2017.0,17220.0,2...|
|Single Family|       0.0|[2017.0,23870.0,3...|
|   Two Family|       1.0|[2017.0,64610.0,1...|
|        Condo|       0.0|[2017.0,75990.0,7...|
| Three Family|       1.0|[2017.0,45570.0,1...|
|Single Family|       0.0|[2018.0,161840.0,...|
|Single Family|       0.0|[2018.0,14000.0,3...|
|Single Family|       0.0|[2019.0,96630.0,2...|
|Single Famil