In [1]:
!pip install fastparquet

[0m

In [2]:
%pip install textblob

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# Modules to read parquet files
import pyarrow
import fastparquet
import pandas as pd

In [4]:
import pyspark
from pyspark.sql.functions import *

In [5]:
# Import pipeline
from pyspark.ml import Pipeline
# Import random forest
from pyspark.ml.regression import RandomForestRegressor

In [6]:
# Import the evaluation module
from pyspark.ml.evaluation import RegressionEvaluator
# Import the model tuning module
from pyspark.ml.tuning import *
import numpy as np

In [7]:
spark = SparkSession.builder.getOrCreate()

In [8]:
folder_path = "gs://yelpfrog/trusted/"

# Modeling

# Read business_review_features File

In [9]:
business_review_features = spark.read.parquet(f"{folder_path}business_review_features.parquet/*", engine='pyarrow')

                                                                                

In [10]:
business_review_features.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_count_buckets: double (nullable = true)
 |-- city_index: double (nullable = true)
 |-- state_index: double (nullable = true)
 |-- encoded_review_count: vector (nullable = true)
 |-- encoded_city: vector (nullable = true)
 |-- encoded_state: vector (nullable = true)
 |-- sentiment_score: double (nullable = true)
 |-- features: vector (nullable = true)



# Train/Test Data

In [11]:
# Split the data into 70% training and 30% test sets  
trainingData, testData = business_review_features.randomSplit([0.7, 0.3], seed=42)

# Random Forest

In [12]:
# Create a Random Forest Estimator
rf = RandomForestRegressor(labelCol="review_stars", featuresCol="features", predictionCol="predicted_stars", seed=42)

In [27]:
# Create a regression evaluator (to get RMSE, R2, RME, etc.)
evaluator = RegressionEvaluator(labelCol='review_stars', predictionCol="predicted_stars")

In [14]:
rf_pipeline = Pipeline(stages=[rf])

In [15]:
grid = ParamGridBuilder()

# Build the parameter grid
grid = grid.build()

# Create the CrossValidator using the hyperparameter grid
cv = CrossValidator(estimator=rf_pipeline, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator, 
                    numFolds=3)

In [16]:
# .tranform would give you new columns based on pipeline specifications
# unless we change the name in Estimator (like prediction to predicted_star)
# rawPrediction, probability, prediction are the default ones
transformed_sdf = rf_pipeline.fit(business_review_features).transform(business_review_features)



# Cross Validation

In [18]:
# Train the models
all_models  = cv.fit(trainingData)

# Show the average performance over the three folds
print(f"Average metric {all_models.avgMetrics}")

24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_25 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_22 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_186_1 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_4 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_10 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_14 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_186_15 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_18 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_186_7 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_149_12 !
24/11/24 02:23:39 WARN BlockManagerMasterEndpoint: No



Average metric [1.0922024601402958]



                                                                                

In [19]:
# Get the best model from all of the models trained
bestModel = all_models.bestModel

# Use the model 'bestModel' to predict the test set
test_results = bestModel.transform(testData)

In [28]:
# Show the prediction
test_results.select('name','review_stars', 'predicted_stars').show(truncate=False)


[Stage 115:>                                                        (0 + 1) / 1]

+-------------------------+------------+------------------+
|name                     |review_stars|predicted_stars   |
+-------------------------+------------+------------------+
|1-800-GOT-JUNK? Reno     |1.0         |2.180508235564684 |
|1-800-GOT-JUNK? Reno     |5.0         |2.7254295781106532|
|1-800-GOT-JUNK? Reno     |5.0         |4.2481245437219775|
|1-800-GOT-JUNK? Reno     |5.0         |4.220801253690862 |
|1-800-GOT-JUNK? Reno     |1.0         |3.9217610480174327|
|1-800-GOT-JUNK? Reno     |5.0         |3.666732095465368 |
|1-800-GOT-JUNK? Reno     |5.0         |4.220801253690862 |
|1-800-GOT-JUNK? Reno     |5.0         |2.64457172682535  |
|1-800-GOT-JUNK? Reno     |1.0         |2.180508235564684 |
|1-800-GOT-JUNK? Reno     |1.0         |3.666732095465368 |
|1-800-GOT-JUNK? Reno     |5.0         |4.2481245437219775|
|1-800-PACK-RAT           |1.0         |1.9906241942446936|
|1-800-PACK-RAT           |1.0         |1.9906241942446936|
|1-800-PACK-RAT           |1.0         |


                                                                                

# Evaluation

In [21]:
# Calculate R2, RMSE, and MAE
r2 =evaluator.evaluate(test_results,{evaluator.metricName:'r2'})
rmse = evaluator.evaluate(test_results, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(test_results, {evaluator.metricName: "mae"})

print(f"R-squared:{r2}  RMSE:{rmse}  MAE:{mae}")



R-squared:0.4527255230297541  RMSE:1.093685102290293  MAE:0.8906223640588542



                                                                                

# Confusion Matrix

In [22]:
# Show the confusion matrix
test_results.groupby('review_stars').pivot('predicted_stars').count().sort('review_stars').show()

24/11/24 02:28:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/11/24 02:28:18 WARN DAGScheduler: Broadcasting large task binary with size 1323.5 KiB
                                                                                

+------------+------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------

In [23]:
# review_stars (actual) is columns and predicted_stars (predicted) are rows

# actual: Col1 Col2      
# Row 0 -: tn, fp 
# Row 1 +: fn, tp
#           -   +     


confusion_matrix = test_results.groupby('review_stars').pivot('predicted_stars').count().fillna(0).collect()

def calculate_recall_precision(confusion_matrix):
    tn = confusion_matrix[0][1]  # True Negative 
    fp = confusion_matrix[0][2]  # False Positive
    fn = confusion_matrix[1][1]  # False Negative
    tp = confusion_matrix[1][2]  # True Positive
    precision = tp / (tp + fp)            
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f1_score = 2 * ((precision * recall) / (precision + recall))
    return accuracy, precision, recall, f1_score

24/11/24 02:28:43 WARN DAGScheduler: Broadcasting large task binary with size 1463.2 KiB
                                                                                

In [24]:
print("Accuracy, Precision, Recall, F1 Score")
print(calculate_recall_precision(confusion_matrix))

Accuracy, Precision, Recall, F1 Score


ZeroDivisionError: division by zero

In [34]:
# This is bad model but still save so we can overwrite later
model_path = "gs://yelpfrog/models/review_stars_rf_model.11-15"
bestModel.write().overwrite().save(model_path)

                                                                                

In [52]:
best_rf_model = all_models.bestModel.stages[-1]
importances = best_rf_model.featureImportances
feature_list = ["review_count_buckets", 
                "city_index", "state_index", 
                "encoded_review_count", "encoded_city", "encoded_state", 
                "sentiment_score"]

print("Feature Importances:")
for feature, importance in zip(feature_list, importances):
    print(f"{feature}: {importance:.4f}")

Feature Importances:
review_count_buckets: 0.0098
city_index: 0.0049
state_index: 0.0006
encoded_review_count: 0.0025
encoded_city: 0.0009
encoded_state: 0.0017
sentiment_score: 0.0000


In [54]:
potential_feature_list = ["name", "address", "postal_code", "useful"]

print("Feature Importances:")
for feature, importance in zip(potential_feature_list, importances):
    print(f"{feature}: {importance:.4f}")

Feature Importances:
name: 0.0098
address: 0.0049
postal_code: 0.0006
useful: 0.0025
