In [9]:
import os
os.environ['SPARK_HOME'] = '/mnt/c/spark'
print(os.environ.get('SPARK_HOME'))

/mnt/c/spark


In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.appName("prediction-models").getOrCreate()
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
df = spark.read.csv("preprocessed_datasets/factorized_data.csv", sep = ',', header=True)

In [11]:
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import col

df = df.withColumn("model_year", col("model_year").cast(IntegerType())) \
               .withColumn("milage", col("milage").cast(DoubleType())) \
               .withColumn("engine_capacity", col("engine_capacity").cast(DoubleType())) \
               .withColumn("engine_horsepower", col("engine_horsepower").cast(DoubleType())) \
               .withColumn("brand_numeric", col("brand_numeric").cast(IntegerType())) \
               .withColumn("transmission_numeric", col("transmission_numeric").cast(IntegerType())) \
               .withColumn("fuel_type_numeric", col("fuel_type_numeric").cast(IntegerType())) \
               .withColumn("ext_col_numeric", col("ext_col_numeric").cast(IntegerType())) \
               .withColumn("int_col_numeric", col("int_col_numeric").cast(IntegerType())) \
               .withColumn("accident_numeric", col("accident_numeric").cast(IntegerType())) \
                .withColumn("price", col("price").cast(IntegerType()))

In [12]:
feature_columns = ["model_year", "milage", "engine_capacity", "engine_horsepower", 
                   "brand_numeric", "transmission_numeric", 
                   "fuel_type_numeric", "ext_col_numeric", 
                   "int_col_numeric", "accident_numeric"]

In [13]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [14]:
df_vectorized = assembler.transform(df)

In [15]:
final_data = df_vectorized.select("features", "price")

In [16]:
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

### Linear Regression

In [11]:
# Initialize the model
lr = LinearRegression(featuresCol="features", labelCol="price")

# Define the parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [50, 100, 200]) \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.tol, [1e-6, 1e-8]) \
    .build()

# Define the evaluator to minimize RMSE
evaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse"
)
    
# Set up CrossValidator for hyperparameter tuning
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5
)

# Perform the hyperparameter tuning
cv_model = crossval.fit(train_data)

best_model = cv_model.bestModel
print("Best Parameters:")
print(f"  MaxIter: {best_model._java_obj.getMaxIter()}")
print(f"  RegParam: {best_model._java_obj.getRegParam()}")
print(f"  ElasticNetParam: {best_model._java_obj.getElasticNetParam()}")
print(f"  Tol: {best_model._java_obj.getTol()}")

# Make predictions
predictions = best_model.transform(test_data)

# Show predictions
predictions.select("features", "price", "prediction").show()

evaluator_rmse = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
evaluator_mae = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
evaluator_r2 = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
rmse = evaluator_rmse.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared or Coefficient of Determination (R2): {r2}")

best_model.write().overwrite().save(r"prediction_models\LR")
print("Best model Linear Regression saved")

                                                                                

Best Parameters:
  MaxIter: 50
  RegParam: 0.01
  ElasticNetParam: 1.0
  Tol: 1e-06
+--------------------+------+-------------------+
|            features| price|         prediction|
+--------------------+------+-------------------+
|(10,[0,1,2,3],[20...|  9995| 14609.589893190656|
|(10,[0,1,2,3],[20...|  6999| 25318.206373632886|
|(10,[0,1,2,3],[20...| 18500| 14028.679266107501|
|(10,[0,1,2,3],[20...|  8500|  36602.15228249831|
|(10,[0,1,2,3],[20...| 12500| -7526.949929713272|
|(10,[0,1,2,3],[20...| 20997|   17717.4860908119|
|(10,[0,1,2,3],[20...| 11499|  5413.003342607059|
|(10,[0,1,2,3],[20...| 28250|  22322.06967954291|
|(10,[0,1,2,3],[20...| 12000| -5433.922244911082|
|(10,[0,1,2,3],[20...|  9000| -7861.703079900937|
|(10,[0,1,2,3],[20...|  3950|-12514.243963225745|
|(10,[0,1,2,3],[20...| 10300| 28266.093517392408|
|(10,[0,1,2,3],[20...|  6500| 11326.824738914613|
|(10,[0,1,2,3],[20...|  8500|  5014.594567940803|
|(10,[0,1,2,3],[20...|399950|  4216.178444917081|
|(10,[0,1,2,3],[

### Random Forest

In [17]:
rf = RandomForestRegressor(featuresCol="features", labelCol="price", seed=42)

# Define the parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50,100,150,200]) \
    .addGrid(rf.maxDepth, [5,8,10]) \
    .addGrid(rf.minInstancesPerNode, [1, 2, 4]) \
    .build()

# Define the evaluator to minimize RMSE
evaluator_rmse = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse"
)

# Set up CrossValidator for hyperparameter tuning
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_rmse,
    numFolds=5  # 5-fold cross-validation
)

# Perform the hyperparameter tuning
cv_model = crossval.fit(train_data)

# Best model and parameters
best_model = cv_model.bestModel
print("Best Parameters:")
print(f"  NumTrees: {best_model.getNumTrees}")
print(f"  MaxDepth: {best_model.getOrDefault('maxDepth')}")
print(f"  MinInstancesPerNode: {best_model.getOrDefault('minInstancesPerNode')}")

# Evaluate the best model on the test data
predictions = best_model.transform(test_data)
evaluator_mae = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")
evaluator_r2 = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
rmse = evaluator_rmse.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared or Coefficient of Determination (R2): {r2}")

# Show some predictions
predictions.select("features", "price", "prediction").show()

# Save the best model
best_model.write().overwrite().save("prediction_models/RF_best")
print("Best Radnom Forrrest model saved.")

24/12/16 23:46:03 WARN DAGScheduler: Broadcasting large task binary with size 1068.2 KiB
24/12/16 23:46:04 WARN DAGScheduler: Broadcasting large task binary with size 2039.6 KiB
24/12/16 23:46:10 WARN DAGScheduler: Broadcasting large task binary with size 1068.2 KiB
24/12/16 23:46:11 WARN DAGScheduler: Broadcasting large task binary with size 2039.6 KiB
24/12/16 23:46:17 WARN DAGScheduler: Broadcasting large task binary with size 1068.5 KiB
24/12/16 23:46:18 WARN DAGScheduler: Broadcasting large task binary with size 2039.0 KiB
24/12/16 23:46:23 WARN DAGScheduler: Broadcasting large task binary with size 1068.2 KiB
24/12/16 23:46:25 WARN DAGScheduler: Broadcasting large task binary with size 2039.6 KiB
24/12/16 23:46:26 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
24/12/16 23:46:27 WARN DAGScheduler: Broadcasting large task binary with size 1152.4 KiB
24/12/16 23:46:29 WARN DAGScheduler: Broadcasting large task binary with size 7.2 MiB
24/12/16 23:46:30 WARN DAGS

Best Parameters:
  NumTrees: 200
  MaxDepth: 10
  MinInstancesPerNode: 2


                                                                                

Root Mean Squared Error (RMSE): 11024.982599599829
Mean Absolute Error (MAE): 7975.814619110444
R-squared or Coefficient of Determination (R2): 0.6616955483175495


                                                                                

+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(10,[0,1,2,3],[20...| 9995|13877.316236860075|
|(10,[0,1,2,3],[20...| 9250|  9480.81192185179|
|(10,[0,1,2,3],[20...|12990|12496.506386567939|
|(10,[0,1,2,3],[20...|32850|18591.472975695844|
|(10,[0,1,2,3],[20...| 6348| 21810.97555532204|
|(10,[0,1,2,3],[20...| 7900|12824.360807051136|
|(10,[0,1,2,3],[20...| 4900| 10465.65074036291|
|(10,[0,1,2,3],[20...| 8100| 13085.36797301295|
|(10,[0,1,2,3],[20...| 9000| 9595.286127243287|
|(10,[0,1,2,3],[20...|13000|24348.084984839705|
|(10,[0,1,2,3],[20...|10300|18939.361110116704|
|(10,[0,1,2,3],[20...| 6500|11139.137301882116|
|(10,[0,1,2,3],[20...| 5000| 15214.66383705592|
|(10,[0,1,2,3],[20...| 8500| 10413.30285100032|
|(10,[0,1,2,3],[20...|15500|27397.545242173368|
|(10,[0,1,2,3],[20...|12500|12714.137643249296|
|(10,[0,1,2,3],[20...| 9999| 11285.93554035694|
|(10,[0,1,2,3],[20...|16900|  9722.75268

24/12/17 00:58:11 WARN TaskSetManager: Stage 3704 contains a task of very large size (1661 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Best Radnom Forrrest model saved.


### Decision Tree

In [13]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price", seed=42)

# Define the parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15,20]) \
    .addGrid(dt.maxBins, [20, 30, 40,50]) \
    .addGrid(dt.minInstancesPerNode, [1, 2, 4]) \
    .addGrid(dt.minInfoGain, [0.0, 0.1, 0.2]) \
    .build()

# Define the evaluator to minimize RMSE
evaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse"
)

# Set up CrossValidator for hyperparameter tuning
crossval = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5  # 5-fold cross-validation
)

# Perform the hyperparameter tuning
cv_model = crossval.fit(train_data)

# Best model and parameters
best_model = cv_model.bestModel
print("Best Parameters:")
print(f"  MaxDepth: {best_model.getOrDefault('maxDepth')}")
print(f"  MaxBins: {best_model.getOrDefault('maxBins')}")
print(f"  MinInstancesPerNode: {best_model.getOrDefault('minInstancesPerNode')}")
print(f"  MinInfoGain: {best_model.getOrDefault('minInfoGain')}")

# Evaluate the best model on the test data
predictions = best_model.transform(test_data)
rmse = evaluator.evaluate(predictions)
print(f"Best Model RMSE on Test Data: {rmse}")

# Show some predictions
predictions.select("features", "price", "prediction").show()

# Save the best model
best_model.write().overwrite().save("prediction_models/DT_best")
print("Best Decision Tree model saved.")


24/12/16 16:07:49 WARN DAGScheduler: Broadcasting large task binary with size 1293.7 KiB
24/12/16 16:07:49 WARN DAGScheduler: Broadcasting large task binary with size 1947.7 KiB
24/12/16 16:07:50 WARN DAGScheduler: Broadcasting large task binary with size 1663.5 KiB
24/12/16 16:07:51 WARN DAGScheduler: Broadcasting large task binary with size 1293.7 KiB
24/12/16 16:07:51 WARN DAGScheduler: Broadcasting large task binary with size 1947.7 KiB
24/12/16 16:07:52 WARN DAGScheduler: Broadcasting large task binary with size 1663.4 KiB
24/12/16 16:07:53 WARN DAGScheduler: Broadcasting large task binary with size 1293.7 KiB
24/12/16 16:07:53 WARN DAGScheduler: Broadcasting large task binary with size 1947.7 KiB
24/12/16 16:07:54 WARN DAGScheduler: Broadcasting large task binary with size 1663.4 KiB
24/12/16 16:07:55 WARN DAGScheduler: Broadcasting large task binary with size 1178.5 KiB
24/12/16 16:07:56 WARN DAGScheduler: Broadcasting large task binary with size 1700.5 KiB
24/12/16 16:07:56 WAR

Best Parameters:
  MaxDepth: 5
  MaxBins: 30
  MinInstancesPerNode: 1
  MinInfoGain: 0.0


                                                                                

Best Model RMSE on Test Data: 64560.432713721806
+--------------------+------+------------------+
|            features| price|        prediction|
+--------------------+------+------------------+
|(10,[0,1,2,3],[20...|  9995| 16681.45782974251|
|(10,[0,1,2,3],[20...|  6999|26970.651409618575|
|(10,[0,1,2,3],[20...| 18500| 16681.45782974251|
|(10,[0,1,2,3],[20...|  8500| 42923.97003329634|
|(10,[0,1,2,3],[20...| 12500|11849.234396467125|
|(10,[0,1,2,3],[20...| 20997| 16681.45782974251|
|(10,[0,1,2,3],[20...| 11499|11849.234396467125|
|(10,[0,1,2,3],[20...| 28250|21477.972679081147|
|(10,[0,1,2,3],[20...| 12000|11849.234396467125|
|(10,[0,1,2,3],[20...|  9000|11849.234396467125|
|(10,[0,1,2,3],[20...|  3950|11849.234396467125|
|(10,[0,1,2,3],[20...| 10300|26970.651409618575|
|(10,[0,1,2,3],[20...|  6500|11849.234396467125|
|(10,[0,1,2,3],[20...|  8500|11849.234396467125|
|(10,[0,1,2,3],[20...|399950|11849.234396467125|
|(10,[0,1,2,3],[20...| 12900|19908.368576962515|
|(10,[0,1,2,3],[20..

### How to make a prediction on raw numbers

In [63]:
from pyspark.ml.regression import RandomForestRegressionModel

# Load a Random Forest model
model = RandomForestRegressionModel.load("prediction_models/RF_best")

In [64]:
feature_columns = ["model_year", "milage", "engine_capacity", "engine_horsepower", 
                   "brand_numeric", "transmission_numeric", 
                   "fuel_type_numeric", "ext_col_numeric", 
                   "int_col_numeric", "accident_numeric"]

new_data = spark.createDataFrame([(2007, 213000, 1.6, 150.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)], 
                                 feature_columns)
new_vector = assembler.transform(new_data)

prediction = model.transform(new_vector)
prediction.select("features", "prediction").show()


+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|(10,[0,1,2,3],[20...|65358.501385596675|
+--------------------+------------------+



### How to make a prediction using column mappings

In [55]:
import json

# Load mappings from the JSON file
with open("column_mappings.json", "r") as json_file:
    mappings_dict = json.load(json_file)


In [56]:
def map_to_index(column_name, value, mappings):
    """
    Maps a categorical value to its corresponding index.
    :param column_name: Name of the column (e.g., "brand", "transmission").
    :param value: The raw categorical value to map (e.g., "Ford", "Automatic").
    :param mappings: Dictionary containing the mappings for all categorical columns.
    :return: The index of the value in the mapping, or -1 if not found.
    """
    if column_name in mappings:
        try:
            return mappings[column_name].index(value)
        except ValueError:
            return -1  # Return -1 if the value is not found
    else:
        raise KeyError(f"Column '{column_name}' not found in mappings.")


In [61]:
# Example raw input with categorical values
raw_input = {
    "brand": "Ford",
    "transmission": "Automatic",
    "fuel_type": "Gasoline",
    "ext_col": "Black",
    "int_col": "Black",
    "accident": "None reported"
}

# Map raw input to indices
mapped_input = {
    col: map_to_index(col, value, mappings_dict)
    for col, value in raw_input.items()
}

# Add other non-categorical values
numeric_input = {
    "model_year": 2007,
    "milage": 213000,
    "engine_capacity": 1.6,
    "engine_horsepower": 150,
}

numeric_input.update(mapped_input)

print(numeric_input)
print("Features MUST be in correct order. Model input vectors are created from them.")

{'model_year': 2007, 'milage': 213000, 'engine_capacity': 1.6, 'engine_horsepower': 150, 'brand': 0, 'transmission': 0, 'fuel_type': 0, 'ext_col': 0, 'int_col': 0, 'accident': 0}


In [62]:
from pyspark.sql import Row

# Define the feature columns (same as in your pipeline)
feature_columns = ["model_year", "engine_capacity", "engine_horsepower", "milage", "brand_numeric", "transmission_numeric", 
                   "fuel_type_numeric", "ext_col_numeric", "int_col_numeric", "accident_numeric"]

# Convert the mapped input to a Spark DataFrame
new_data = spark.createDataFrame([Row(**numeric_input)], schema=feature_columns)

# Transform data using the same assembler
new_vector = assembler.transform(new_data)

# Make a prediction
prediction = model.transform(new_vector)
prediction.select("features", "prediction").show()


+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|(10,[0,1,2,3],[20...|65358.501385596675|
+--------------------+------------------+

