In [1]:
!pip install fastparquet

[0m

In [2]:
# TextBlob is imported further down and supplies the polarity score used as a feature.
%pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Modules to read parquet files
import pyarrow
import fastparquet
import pandas as pd

In [4]:
import pyspark
from pyspark.sql.functions import *

In [5]:
# Reuse the active SparkSession if there is one, otherwise create it.
# SparkSession is reached through the already-imported `pyspark` package:
# it is not one of the documented names of pyspark.sql.functions, so the
# wildcard import above is not guaranteed to put it in scope on a fresh kernel.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [6]:
# Shared GCS prefix of the cleaned datasets; "<dataset>.parquet/*" is appended when reading.
folder_path = "gs://yelpfrog/cleaned/cleaned_"

In [7]:
# Load the cleaned business dataset from its Parquet part files.
# `engine` is a pandas.read_parquet argument; Spark's Parquet reader has no
# such option, so the misleading keyword was removed.
business = spark.read.parquet(f"{folder_path}business.parquet/*")

                                                                                

In [8]:
# Load the cleaned review dataset from its Parquet part files.
# `engine` is a pandas.read_parquet argument; Spark's Parquet reader has no
# such option, so the misleading keyword was removed.
review = spark.read.parquet(f"{folder_path}review.parquet/*")

In [9]:
# Attach each review to its business via business_id and keep only the
# columns used downstream, in a reader-friendly order. The review's `stars`
# column is exposed as `review_stars`.
business_review = (
    business
    .join(review, on='business_id')
    .select(
        'name',
        'address',
        'city',
        'state',
        'postal_code',
        'text',
        'review_count',
        'useful',
        review['stars'].alias('review_stars'),
    )
)

In [10]:
# Inspect the column names and types of the joined frame.
business_review.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- review_stars: double (nullable = true)



# Feature Engineering

In [11]:
from pyspark.ml.feature import Bucketizer, StringIndexer, OneHotEncoder, VectorAssembler

In [12]:
from textblob import TextBlob
from pyspark.sql.types import DoubleType
# from pyspark.sql.functions import col, isnan, when, count, udf

In [13]:
# Approximate the 20th/40th/60th/80th percentiles of review_count; they are
# used below as the inner bucket boundaries.
percentiles = (
    business_review
    .agg(percentile_approx('review_count', [0.2, 0.4, 0.6, 0.8]).alias('percentiles'))
    .collect()[0]['percentiles']
)

                                                                                

In [14]:
# Five buckets: 0.0 up to the first percentile, one bucket between each pair
# of consecutive percentiles, and the last percentile up to +infinity.
bucket_review_count = Bucketizer(
    inputCol='review_count',
    outputCol='review_count_buckets',
    splits=[0.0, *percentiles, float('inf')],
)

In [15]:
# Map each city / state string to a numeric index column.
indexer = StringIndexer(
    inputCols=["city", "state"],
    outputCols=["city_index", "state_index"],
)

In [16]:
# One-hot encode the bucket id and the two index columns into vector columns.
encoder = OneHotEncoder(
    inputCols=[
        'review_count_buckets',
        'city_index',
        'state_index',
    ],
    outputCols=[
        'encoded_review_count',
        'encoded_city',
        'encoded_state',
    ],
)

In [17]:
# Create a function to perform sentiment analysis on some text
def sentiment_analysis(text):
    """Return the TextBlob polarity score of a piece of text.

    Args:
        text: The text to score, or ``None`` for a missing value.

    Returns:
        ``TextBlob(text).sentiment.polarity``, or ``None`` when ``text`` is
        ``None``.
    """
    # A null cell reaches a Python UDF as None; TextBlob rejects non-string
    # input, so propagate the null instead of failing the whole Spark task.
    if text is None:
        return None
    return TextBlob(text).sentiment.polarity

In [18]:
# Wrap the Python scorer as a Spark UDF that yields a double column.
sentiment_analysis_udf = udf(sentiment_analysis, returnType=DoubleType())

In [19]:
from pyspark.ml import Transformer

In [None]:
# Allows sentiment_score to integrate into pipeline
class SentimentAnalysisTransformer(Transformer):
    """Transformer stage that appends a sentiment score column.

    Wraps ``sentiment_analysis_udf`` so the scoring step exposes the same
    ``transform`` interface as the other ``pyspark.ml`` stages.
    """

    def __init__(self, inputCol="text", outputCol="sentiment_score"):
        """Store the name of the text column to read and the column to write."""
        super().__init__()
        self.inputCol, self.outputCol = inputCol, outputCol

    def _transform(self, df):
        """Return ``df`` with ``outputCol`` set to the UDF applied to ``inputCol``."""
        score = sentiment_analysis_udf(df[self.inputCol])
        return df.withColumn(self.outputCol, score)

In [21]:
# Instantiate the custom stage with its column names spelled out.
# NOTE(review): the feature-building cells further down add the score with
# withColumn rather than through this instance — confirm it is still needed.
sentiment_transformer = SentimentAnalysisTransformer(inputCol="text", outputCol="sentiment_score")

In [22]:
# Combine the encoded categorical columns and the sentiment score into a
# single `features` vector.
# TODO: `useful` is meant to be part of the vector but is not listed yet.
assembler = VectorAssembler(
    inputCols=[
        'encoded_review_count',
        'encoded_city',
        'encoded_state',
        'sentiment_score',
    ],
    outputCol='features',
)

# Show Features In A Copy

In [23]:
# Start a separate DataFrame variable for the feature columns so that
# business_review itself keeps its original set of columns.
business_review_features = business_review.select("*")

In [24]:
# Add review_count_buckets using the percentile-based Bucketizer.
business_review_features = bucket_review_count.transform(business_review_features)

In [25]:
# Fit the StringIndexer on the data, then add city_index and state_index.
business_review_features = indexer.fit(business_review_features).transform(business_review_features)

                                                                                

In [26]:
# Fit the OneHotEncoder on the indexed columns, then add the encoded_* vector columns.
business_review_features = encoder.fit(business_review_features).transform(business_review_features)

In [27]:
# Apply the sentiment analysis function to the text column
# and create a new column sentiment_score.
# The text column is taken from the frame being extended rather than from
# business_review, so this cell does not rely on both frames sharing lineage.
business_review_features = business_review_features.withColumn(
    'sentiment_score',
    sentiment_analysis_udf(business_review_features['text']),
)

In [28]:
# Assemble the encoded columns and sentiment_score into the `features` vector column.
business_review_features = assembler.transform(business_review_features)

In [29]:
# Confirm that the index, encoded, sentiment and features columns were added.
business_review_features.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_count_buckets: double (nullable = true)
 |-- city_index: double (nullable = false)
 |-- state_index: double (nullable = false)
 |-- encoded_review_count: vector (nullable = true)
 |-- encoded_city: vector (nullable = true)
 |-- encoded_state: vector (nullable = true)
 |-- sentiment_score: double (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
# Preview the encoded inputs next to the assembled vector, without truncation.
business_review_features.select(
    'encoded_review_count',
    'encoded_city',
    'encoded_state',
    'sentiment_score',
    'features',
).show(truncate=False)


[Stage 12:>                                                         (0 + 1) / 1]

+--------------------+------------------+---------------+--------------------+-----------------------------------------------------------+
|encoded_review_count|encoded_city      |encoded_state  |sentiment_score     |features                                                   |
+--------------------+------------------+---------------+--------------------+-----------------------------------------------------------+
|(4,[3],[1.0])       |(1415,[186],[1.0])|(26,[1],[1.0]) |0.3680555555555556  |(1446,[3,190,1420,1445],[1.0,1.0,1.0,0.3680555555555556])  |
|(4,[0],[1.0])       |(1415,[482],[1.0])|(26,[0],[1.0]) |0.2507575757575758  |(1446,[0,486,1419,1445],[1.0,1.0,1.0,0.2507575757575758])  |
|(4,[2],[1.0])       |(1415,[164],[1.0])|(26,[0],[1.0]) |0.4200892857142857  |(1446,[2,168,1419,1445],[1.0,1.0,1.0,0.4200892857142857])  |
|(4,[1],[1.0])       |(1415,[6],[1.0])  |(26,[7],[1.0]) |0.35666666666666663 |(1446,[1,10,1426,1445],[1.0,1.0,1.0,0.35666666666666663])  |
|(4,[0],[1.0])       |(1415


                                                                                

# Checking On Original Data

In [31]:
# Verify that the original joined frame did not gain any of the feature columns.
business_review.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- review_stars: double (nullable = true)



In [32]:
# Check for missing values.
# All per-column null counts are computed in one aggregation, so the data is
# scanned once instead of once per column; the printed lines are unchanged.
null_counts = business_review.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in business_review.columns]
).first()
for c in business_review.columns:
    print(c, null_counts[c])

name 0
address 0
city 0
state 0
postal_code 0


                                                                                

text 0
review_count 0


                                                                                

useful 0





review_stars 0




                                                                                

# To Parquet

In [33]:
# GCS prefix under which the engineered output is stored.
trusted_folder="gs://yelpfrog/trusted/"

In [34]:
# Destination path of the feature-engineered dataset inside the trusted folder.
feature_engineer = f"{trusted_folder}business_review_features.parquet"

In [None]:
# Persist the engineered features. "overwrite" replaces a previous export so
# the notebook can be re-run from a fresh kernel; with the default save mode
# the write raises once the destination path already exists.
business_review_features.write.mode("overwrite").parquet(feature_engineer)

