In [1]:
!pip install fastparquet
%pip install textblob
# Modules to read parquet files
import pyarrow
import fastparquet
import pandas as pd
import pyspark
from pyspark.sql.functions import *
spark = SparkSession.builder.getOrCreate()
folder_path = "gs://yelpfrog/cleaned/cleaned_"
business = spark.read.parquet(f"{folder_path}business.parquet/*", engine='pyarrow')
review = spark.read.parquet(f"{folder_path}review.parquet/*", engine='pyarrow')
# Join review to business while rearranging columns to make it look better
business_review = business.join(review, on='business_id').select(
    'name', 'address', 'city', 'state', 'postal_code',
    'text', 'review_count', 'useful', 
    business.stars.alias('business_stars')
)
business_review.printSchema()
# Feature Engineering
from pyspark.ml.feature import Bucketizer, StringIndexer, OneHotEncoder, VectorAssembler
from textblob import TextBlob
from pyspark.sql.types import DoubleType
# from pyspark.sql.functions import col, isnan, when, count, udf
# Create buckets based on percentile
percentiles = business_review.select(
    percentile_approx('review_count', [0.2, 0.4, 0.6, 0.8]).alias('percentiles')
).collect()[0][0]
# 5 buckets 
bucket_review_count = Bucketizer(splits= [0.0] + list(percentiles) + [float('inf')], 
                                     inputCol='review_count', 
                                     outputCol='review_count_buckets')
indexer = StringIndexer(inputCols=["city", "state"], 
                        outputCols=["city_index", "state_index"])
encoder = OneHotEncoder(inputCols=['review_count_buckets', 'city_index', 'state_index'], 
                        outputCols=['encoded_review_count', 'encoded_city', 'encoded_state']
                       )
# Create a function to perform sentiment analysis on some text
def sentiment_analysis(text):
    sentiment = TextBlob(text).sentiment.polarity
    return sentiment
# Turn function into a UDF
sentiment_analysis_udf = udf(sentiment_analysis, DoubleType())
from pyspark.ml import Transformer
# Allows sentiment_score to integrate into pipeline
class SentimentAnalysisTransformer(Transformer):
    def __init__(self, inputCol="text", outputCol="sentiment_score"):
        super().__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
    def _transform(self, df):
        return df.withColumn(self.outputCol, sentiment_analysis_udf(df[self.inputCol]))
sentiment_transformer = SentimentAnalysisTransformer(inputCol="text", outputCol="sentiment_score")
# Want to include useful to the Vector
assembler = VectorAssembler(inputCols=['encoded_review_count',
                                       'encoded_city', 'encoded_state',
                                       'sentiment_score'
                                      ],
                                      outputCol='features')
# Show Features In A Copy
# Make copy of sdf to show features while keeping original the same
business_review_features = business_review.select("*")
business_review_features = bucket_review_count.transform(business_review_features)
business_review_features = indexer.fit(business_review_features).transform(business_review_features)
business_review_features = encoder.fit(business_review_features).transform(business_review_features)
# Apply the sentiment analysis function to the text column
# and create a new column sentiment_score
business_review_features = business_review_features.withColumn('sentiment_score',
                                             sentiment_analysis_udf(business_review['text'])
                                                              )
business_review_features = assembler.transform(business_review_features)
business_review_features.printSchema()
business_review_features.select(['encoded_review_count',
                                 'encoded_city', 'encoded_state',
                                 'sentiment_score', 
                                 'features'
                                 ]).show(truncate=False)
# Checking On Original Data
business_review.printSchema()
# Check for missing values
for c in business_review.columns:
    print(c, business_review.where(col(c).isNull()).count())

[0mNote: you may need to restart the kernel to use updated packages.


24/12/09 17:06:21 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
24/12/09 17:06:36 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
24/12/09 17:06:51 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- business_stars: double (nullable = true)



                                                                                

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- business_stars: double (nullable = true)
 |-- review_count_buckets: double (nullable = true)
 |-- city_index: double (nullable = false)
 |-- state_index: double (nullable = false)
 |-- encoded_review_count: vector (nullable = true)
 |-- encoded_city: vector (nullable = true)
 |-- encoded_state: vector (nullable = true)
 |-- sentiment_score: double (nullable = true)
 |-- features: vector (nullable = true)



                                                                                

+--------------------+------------------+---------------+--------------------+-----------------------------------------------------------+
|encoded_review_count|encoded_city      |encoded_state  |sentiment_score     |features                                                   |
+--------------------+------------------+---------------+--------------------+-----------------------------------------------------------+
|(4,[3],[1.0])       |(1415,[186],[1.0])|(26,[1],[1.0]) |0.3680555555555556  |(1446,[3,190,1420,1445],[1.0,1.0,1.0,0.3680555555555556])  |
|(4,[0],[1.0])       |(1415,[482],[1.0])|(26,[0],[1.0]) |0.2507575757575758  |(1446,[0,486,1419,1445],[1.0,1.0,1.0,0.2507575757575758])  |
|(4,[2],[1.0])       |(1415,[164],[1.0])|(26,[0],[1.0]) |0.4200892857142857  |(1446,[2,168,1419,1445],[1.0,1.0,1.0,0.4200892857142857])  |
|(4,[1],[1.0])       |(1415,[6],[1.0])  |(26,[7],[1.0]) |0.35666666666666663 |(1446,[1,10,1426,1445],[1.0,1.0,1.0,0.35666666666666663])  |
|(4,[0],[1.0])       |(1415

                                                                                

text 0
review_count 0


                                                                                

useful 0
business_stars 0


In [2]:
# To Parquet
trusted_folder="gs://yelpfrog/trusted/"
feature_engineer = f"{trusted_folder}business_review_features2.01.parquet"
business_review_features.write.parquet(feature_engineer)

                                                                                

In [3]:
!pip install fastparquet
%pip install textblob
# Modules to read parquet files
import pyarrow
import fastparquet
import pandas as pd
import pyspark
from pyspark.sql.functions import *
# Import pipeline
from pyspark.ml import Pipeline
# Import random forest
from pyspark.ml.regression import RandomForestRegressor
# Import the evaluation module
from pyspark.ml.evaluation import RegressionEvaluator
# Import the model tuning module
from pyspark.ml.tuning import *
import numpy as np
spark = SparkSession.builder.getOrCreate()
folder_path = "gs://yelpfrog/trusted/"
# Modeling
# Read business_review_features File
business_review_features = spark.read.parquet(f"{folder_path}business_review_features2.01.parquet/*", engine='pyarrow')
business_review_features.printSchema()
# Train/Test Data
# Split the data into 70% training and 30% test sets  
trainingData, testData = business_review_features.randomSplit([0.7, 0.3], seed=42)
# Random Forest
# Create a Random Forest Estimator
rf = RandomForestRegressor(labelCol="business_stars", featuresCol="features", predictionCol="predicted_stars", seed=42)
# Create a regression evaluator (to get RMSE, R2, RME, etc.)
evaluator = RegressionEvaluator(labelCol='business_stars', predictionCol="predicted_stars")
rf_pipeline = Pipeline(stages=[rf])

[0mNote: you may need to restart the kernel to use updated packages.
root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- business_stars: double (nullable = true)
 |-- review_count_buckets: double (nullable = true)
 |-- city_index: double (nullable = true)
 |-- state_index: double (nullable = true)
 |-- encoded_review_count: vector (nullable = true)
 |-- encoded_city: vector (nullable = true)
 |-- encoded_state: vector (nullable = true)
 |-- sentiment_score: double (nullable = true)
 |-- features: vector (nullable = true)




[Stage 45:>                                                         (0 + 1) / 1]

                                                                                

In [4]:
# Build the parameter grid
grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10]) \
    .addGrid(rf.maxDepth, [3]) \
    .build()

# Create the CrossValidator using the hyperparameter grid
cv = CrossValidator(estimator=rf_pipeline, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator, 
                    numFolds=3)
# .tranform would give you new columns based on pipeline specifications
# unless we change the name in Estimator (like prediction to predicted_star)
# rawPrediction, probability, prediction are the default ones
transformed_sdf = rf_pipeline.fit(business_review_features).transform(business_review_features)

# Cross Validation
# Train the models
all_models  = cv.fit(trainingData)

# Show the average performance over the three folds
print(f"Average metric {all_models.avgMetrics}")
# Get the best model from all of the models trained
bestModel = all_models.bestModel

# Use the model 'bestModel' to predict the test set
test_results = bestModel.transform(testData)
# Show the prediction
test_results.select('name','business_stars', 'predicted_stars').show(truncate=False)

                                                                                

Average metric [0.6961178526024475]



[Stage 112:>                                                        (0 + 1) / 1]

+-------------------------+--------------+------------------+
|name                     |business_stars|predicted_stars   |
+-------------------------+--------------+------------------+
|1-800-GOT-JUNK? Reno     |4.5           |3.3553510955394   |
|1-800-GOT-JUNK? Reno     |4.5           |3.4398889122605603|
|1-800-GOT-JUNK? Reno     |4.5           |3.7881922630238614|
|1-800-GOT-JUNK? Reno     |4.5           |3.7881922630238614|
|1-800-GOT-JUNK? Reno     |4.5           |3.7632192431568066|
|1-800-GOT-JUNK? Reno     |4.5           |3.7632192431568066|
|1-800-GOT-JUNK? Reno     |4.5           |3.7881922630238614|
|1-800-GOT-JUNK? Reno     |4.5           |3.4398889122605603|
|1-800-GOT-JUNK? Reno     |4.5           |3.3553510955394   |
|1-800-GOT-JUNK? Reno     |4.5           |3.7632192431568066|
|1-800-GOT-JUNK? Reno     |4.5           |3.7881922630238614|
|1-800-PACK-RAT           |1.5           |3.1678885955954166|
|1-800-PACK-RAT           |1.5           |3.1678885955954166|
|1-800-P


                                                                                

In [5]:
from pyspark.sql.functions import col

In [6]:
# Create a consildated list of unique businesses instead of multiple of the same ones
consolidated_results = test_results.groupBy("name", "business_stars").agg({"predicted_stars": "avg"})
consolidated_results = consolidated_results.select('name','business_stars',
                            col("avg(predicted_stars)").alias("predicted_stars"))

consolidated_results.show(truncate=False)



+-----------------------------------------------------+--------------+------------------+
|name                                                 |business_stars|predicted_stars   |
+-----------------------------------------------------+--------------+------------------+
|Big Ray's Fish Camp                                  |4.5           |3.8414893871252676|
|Bowl of Heaven Cafe                                  |4.5           |3.741532716695067 |
|Canyon ridge apartments                              |1.5           |3.4184965650772603|
|Cutting Edge Salon and Boutique                      |4.5           |3.642720114627044 |
|NYPD New York Pizza Department                       |3.5           |3.7829597220209883|
|Napa Sonoma Grocery Company                          |4.0           |3.8014345182335707|
|Rentz of Clearwater                                  |4.0           |3.693028173765031 |
|Resellers Consignment Gallery                        |3.0           |3.8509242753693407|
|The Butch


                                                                                

In [7]:
# Checking to make sure grouping and aggregation was correctly used
consolidated_results.filter(consolidated_results["name"] == "1-800-GOT-JUNK? Reno").show()



+--------------------+--------------+-----------------+
|                name|business_stars|  predicted_stars|
+--------------------+--------------+-----------------+
|1-800-GOT-JUNK? Reno|           4.5|3.677199372989091|
+--------------------+--------------+-----------------+




                                                                                

In [8]:
# Evaluation
# Calculate R2, RMSE, and MAE
r2 =evaluator.evaluate(consolidated_results,{evaluator.metricName:'r2'})
rmse = evaluator.evaluate(consolidated_results, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(consolidated_results, {evaluator.metricName: "mae"})

print(f"R-squared:{r2}  RMSE:{rmse}  MAE:{mae}")



R-squared:0.20234340852099342  RMSE:0.8380488594865366  MAE:0.6817354983882484



                                                                                

In [9]:
# business_stars (actual) is columns and predicted_stars (predicted) are rows

# actual: Col1 Col2      
# Row 0 -: tn, fp 
# Row 1 +: fn, tp
#           -   +  
confusion_matrix = test_results.groupby('business_stars').pivot('predicted_stars').count().fillna(0).collect()
tn = confusion_matrix[0][1]  # True Negative 
fp = confusion_matrix[0][2]  # False Positive
fn = confusion_matrix[1][1]  # False Negative
tp = confusion_matrix[1][2]  # True Positive
precision = tp / (tp + fp)            
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f1_score = 2 * ((precision * recall) / (precision + recall))

24/12/09 17:29:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
accuracy

0.815988351937877

In [11]:
precision

0.8195567326456649

In [12]:
recall

0.9945027063599459

In [13]:
f1_score

0.8985939171633807

In [14]:
best_rf_model = all_models.bestModel.stages[-1]
importances = best_rf_model.featureImportances
feature_list = ["review_count_buckets", 
                "city_index", "state_index", 
                "encoded_review_count", "encoded_city", "encoded_state", 
                "sentiment_score"]

print("Feature Importances:")
for feature, importance in zip(feature_list, importances):
    print(f"{feature}: {importance:.4f}")
potential_feature_list = ["name", "address", "postal_code", "useful"]

print("Feature Importances:")
for feature, importance in zip(potential_feature_list, importances):
    print(f"{feature}: {importance:.4f}")

Feature Importances:
review_count_buckets: 0.1341
city_index: 0.0411
state_index: 0.0015
encoded_review_count: 0.0109
encoded_city: 0.0000
encoded_state: 0.0116
sentiment_score: 0.0000
Feature Importances:
name: 0.1341
address: 0.0411
postal_code: 0.0015
useful: 0.0109


In [None]:
# Save this as an additional file without overwriting previous one
model_path = "gs://yelpfrog/models/review_stars_rf_model.12-09"
bestModel.write().save(model_path)

                                                                                