# Modeling Building

### Importing Required Packages

In [77]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, MapType
from pyspark.sql.functions import col, expr, when
from pyspark.sql import functions as F
import pandas as pd
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.sql.functions import isnan, when, count, col
import pyspark.sql.functions as f
from functools import reduce
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel



# spark session
spark = SparkSession \
    .builder \
    .appName("Modeling") \
    .getOrCreate()
sc = spark.sparkContext

### Outline of Model Building 

#### 1) Explore and choose potential predictors, put them into a separate dataframe

#### 2) Convert data from categorical to numerical

#### 3) Try 2 different strategies for dealing with NULL/missing values
- Strategy 1: Impute NULL values with the mode of column
- Strategy 2: Drop all rows that contain NULL values

#### 4) Construct 3 models using the data with imputed values (LogisticRegressionWithLBFGS, RandomForestClassifer, NaiveBayes)

#### 5) Construct 3 models using the data with the dropped NULLs (LogisticRegressionWithLBFGS, RandomForestClassifer, NaiveBayes)

In [64]:
# read data from parquet file
dineSafe_yelp_data = spark.read.parquet('data.parquet')

In [65]:
# examine schema 
dineSafe_yelp_data.printSchema()

root
 |-- ds_address: string (nullable = true)
 |-- ds_id: string (nullable = true)
 |-- ds_latitude: double (nullable = true)
 |-- ds_longitude: double (nullable = true)
 |-- ds_name: string (nullable = true)
 |-- ds_status: string (nullable = true)
 |-- ds_type: string (nullable = true)
 |-- ds_inspection: string (nullable = true)
 |-- ds_id_ins: string (nullable = true)
 |-- ds_pos_ins: string (nullable = true)
 |-- ds_date: string (nullable = true)
 |-- ds_status_ins: string (nullable = true)
 |-- ds_severity: string (nullable = true)
 |-- ds_action: string (nullable = true)
 |-- ds_subdatabase: string (nullable = true)
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-

### Explore Possible Predictors to Include

ds_severity, ds_status_ins, ds_action, review_count, RestaurantsDelivery, RestaurantsTakeOut, OutdoorSeating, BikeParking, RestaurantsPriceRange

In [66]:
dineSafe_yelp_data.groupby('ds_severity').count().show(truncate=False)
dineSafe_yelp_data.groupby('ds_status_ins').count().show(truncate=False)
dineSafe_yelp_data.groupby('ds_action').count().show(truncate=False)
dineSafe_yelp_data.groupby('review_count').count().show(10, truncate=False)
dineSafe_yelp_data.groupby('attributes.RestaurantsDelivery').count().show(truncate=False)
dineSafe_yelp_data.groupby('attributes.RestaurantsTakeOut').count().show(truncate=False)
dineSafe_yelp_data.groupby('attributes.OutdoorSeating').count().show(truncate=False)
dineSafe_yelp_data.groupby('attributes.BikeParking').count().show(truncate=False)
dineSafe_yelp_data.groupby('attributes.RestaurantsPriceRange2').count().show(truncate=False)

+-------------------+-----+
|ds_severity        |count|
+-------------------+-----+
|null               |613  |
|C - Crucial        |28   |
|NA - Not Applicable|57   |
|M - Minor          |523  |
|S - Significant    |180  |
+-------------------+-----+

+----------------+-----+
|ds_status_ins   |count|
+----------------+-----+
|null            |14   |
|Conditional Pass|97   |
|Closed          |1    |
|Pass            |1289 |
+----------------+-----+

+---------------------------+-----+
|ds_action                  |count|
+---------------------------+-----+
|null                       |613  |
|Ticket                     |14   |
|Corrected During Inspection|156  |
|Notice to Comply           |617  |
|Summons                    |1    |
+---------------------------+-----+

+------------+-----+
|review_count|count|
+------------+-----+
|26          |11   |
|29          |11   |
|65          |14   |
|19          |18   |
|54          |4    |
|22          |22   |
|7           |54   |
|34        

### Create New Dataframe with Selected Predictors and Response Variable

Use spark.sql to create a new dataframe that only contains the predictors we want to train the model with:

In [67]:
dineSafe_yelp_data.createOrReplaceTempView("dineSafe_yelp_data")
ratings_df = spark.sql("SELECT rating, ds_severity, ds_status_ins, ds_action, review_count, attributes.RestaurantsDelivery, attributes.RestaurantsTakeOut, attributes.OutdoorSeating, attributes.BikeParking, attributes.RestaurantsPriceRange2 FROM dineSafe_yelp_data")
ratings_df.show(5)

+------+-----------+-------------+----------------+------------+-------------------+------------------+--------------+-----------+----------------------+
|rating|ds_severity|ds_status_ins|       ds_action|review_count|RestaurantsDelivery|RestaurantsTakeOut|OutdoorSeating|BikeParking|RestaurantsPriceRange2|
+------+-----------+-------------+----------------+------------+-------------------+------------------+--------------+-----------+----------------------+
|   bad|  M - Minor|         Pass|Notice to Comply|          41|               null|              True|          null|       True|                     3|
|   bad|       null|         Pass|            null|          41|               null|              True|          null|       True|                     3|
|   bad|       null|         Pass|            null|          41|               null|              True|          null|       True|                     3|
|   bad|       null|         Pass|            null|          41|            

### Exploring Missing/NULL Entries

In [68]:
print('There are {} missing values in ds_severity.'.format(ratings_df.filter(ratings_df['ds_severity'].isNull()).count()))
print('There are {} missing values in ds_status_ins.'.format(ratings_df.filter(ratings_df['ds_status_ins'].isNull()).count()))
print('There are {} missing values in ds_action.'.format(ratings_df.filter(ratings_df['ds_action'].isNull()).count()))
print('There are {} missing values in review_count.'.format(ratings_df.filter(ratings_df['review_count'].isNull()).count()))
print('There are {} missing values in RestaurantsDelivery.'.format(ratings_df.filter(ratings_df['RestaurantsDelivery'].isNull()).count()))
print('There are {} missing values in RestaurantsTakeOut.'.format(ratings_df.filter(ratings_df['RestaurantsTakeOut'].isNull()).count()))
print('There are {} missing values in OutdoorSeating.'.format(ratings_df.filter(ratings_df['OutdoorSeating'].isNull()).count()))
print('There are {} missing values in BikeParking.'.format(ratings_df.filter(ratings_df['BikeParking'].isNull()).count()))
print('There are {} missing values in RestaurantsPriceRange2.'.format(ratings_df.filter(ratings_df['RestaurantsPriceRange2'].isNull()).count()))

There are 613 missing values in ds_severity.
There are 14 missing values in ds_status_ins.
There are 613 missing values in ds_action.
There are 0 missing values in review_count.
There are 499 missing values in RestaurantsDelivery.
There are 340 missing values in RestaurantsTakeOut.
There are 446 missing values in OutdoorSeating.
There are 519 missing values in BikeParking.
There are 327 missing values in RestaurantsPriceRange2.


When examining missing values, we will only drop entire columns if more than half of their values are null or missing. Since each column has 1401 entries, we won't outright drop any column entirely.

### Relabel Data from Categorical to Numerical

We will relabel all the predictors with categorical variables into numerical ones as follows:

- rating: bad = 0, neutral = 1, good = 2
- ds_severity: NA = 0, minor = 1, significant = 2, crucial = 3
- ds_status_ins: closed = 0, conditional pass = 1, pass = 2
- ds_action: Ticket = 0, Corrected During Inspection = 1, Notice to Comply = 2, Summons = 3
- review_count: keep
- RestaurantsDeliery: false = 0, true = 1
- RestaurantsTakeOut: false = 0, true = 1
- OutdoorSeating: false = 0, true = 1
- BikeParking: false = 0, true = 1
- RestaurantPriceRange2: keep

In [69]:
# create new features that match the labels above 
ratings_num = when(
    col('rating')=='bad',0).when(
    col('rating')=='neutral',1).when(
    col('rating')=='good',2)

ds_severity_num = when(
    col('ds_severity')=='NA - Not Applicable',0).when(
    col('ds_severity')=='M - Minor',1).when(
    col('ds_severity')=='S - Significant',2).when(
    col('ds_severity')=='C - Crucial',3)

ds_status_ins_num = when(
    col('ds_status_ins')=='Closed',0).when(
    col('ds_status_ins')=='Conditional Pass',1).when(
    col('ds_status_ins')=='Pass',2)

ds_action_num = when(
    col('ds_action') == 'Ticket', 0).when(
    col('ds_action') == 'Corrected During Inspection', 1).when(
    col('ds_action') == 'Notice to Comply', 2).when(
    col('ds_action') == 'Summons', 3)

In [70]:
# add columns for each numerical predictor
ratings_df = ratings_df.withColumn('rating_num', ratings_num)
ratings_df = ratings_df.withColumn('ds_severity_num', ds_severity_num)
ratings_df = ratings_df.withColumn('ds_status_ins_num', ds_status_ins_num)
ratings_df = ratings_df.withColumn('ds_action_num', ds_action_num)
ratings_df = ratings_df.withColumn('restaurantsDelivery_num', F.when(ratings_df['RestaurantsDelivery'] == 'False', 0).when(ratings_df['RestaurantsDelivery'] == 'True', 1))
ratings_df = ratings_df.withColumn('restaurantsTakeOut_num', F.when(ratings_df['RestaurantsTakeOut'] == 'False', 0).when(ratings_df['RestaurantsTakeout'] == 'True', 1))
ratings_df = ratings_df.withColumn('outdoorSeating_num', F.when(ratings_df['OutdoorSeating'] == 'False', 0).when(ratings_df['OutdoorSeating'] == 'True', 1))
ratings_df = ratings_df.withColumn('bikeParking_num', F.when(ratings_df['BikeParking'] == 'False', 0).when(ratings_df['BikeParking'] == 'True', 1))
ratings_df = ratings_df.withColumn('restaurantsPriceRange_num', ratings_df['RestaurantsPriceRange2'].cast('double')) 

# print out updated dataframe (there should still be null values)
ratings_df.show(3)

+------+-----------+-------------+----------------+------------+-------------------+------------------+--------------+-----------+----------------------+----------+---------------+-----------------+-------------+-----------------------+----------------------+------------------+---------------+-------------------------+
|rating|ds_severity|ds_status_ins|       ds_action|review_count|RestaurantsDelivery|RestaurantsTakeOut|OutdoorSeating|BikeParking|RestaurantsPriceRange2|rating_num|ds_severity_num|ds_status_ins_num|ds_action_num|restaurantsDelivery_num|restaurantsTakeOut_num|outdoorSeating_num|bikeParking_num|restaurantsPriceRange_num|
+------+-----------+-------------+----------------+------------+-------------------+------------------+--------------+-----------+----------------------+----------+---------------+-----------------+-------------+-----------------------+----------------------+------------------+---------------+-------------------------+
|   bad|  M - Minor|         Pass|Not

In [71]:
# create new dataframe that only contains the numerically represented predictors
pred_keep = ['rating_num','ds_severity_num','ds_status_ins_num', 'ds_action_num','restaurantsDelivery_num',
             'restaurantsTakeOut_num','outdoorSeating_num','bikeParking_num','review_count','restaurantsPriceRange_num']

ratings_final = ratings_df.select(pred_keep)
ratings_final.show(5)

+----------+---------------+-----------------+-------------+-----------------------+----------------------+------------------+---------------+------------+-------------------------+
|rating_num|ds_severity_num|ds_status_ins_num|ds_action_num|restaurantsDelivery_num|restaurantsTakeOut_num|outdoorSeating_num|bikeParking_num|review_count|restaurantsPriceRange_num|
+----------+---------------+-----------------+-------------+-----------------------+----------------------+------------------+---------------+------------+-------------------------+
|         0|              1|                2|            2|                   null|                     1|              null|              1|          41|                      3.0|
|         0|           null|                2|         null|                   null|                     1|              null|              1|          41|                      3.0|
|         0|           null|                2|         null|                   null|      

### Strategy 1: Impute Missing Values with Mode
First we are going to try imputing missing values. Since the variables are categorical, we will use the mode of each predictor:

In [9]:
# find the mode of each column and store in a dataframe

feature_modes = reduce(
    lambda ratingcol, modecol: ratingcol.union(modecol),
    [
        ratings_final.dropna().groupBy(i)\
            .count()\
             .sort(f.col("count").desc())\
             .limit(1)\
             .select(
                f.lit(i).alias("Column"),
                f.col(i).alias("Mode")
            ) 
        for i in ratings_final.columns
    ]
)

feature_modes.show()

+--------------------+----+
|              Column|Mode|
+--------------------+----+
|          rating_num| 1.0|
|     ds_severity_num| 1.0|
|   ds_status_ins_num| 2.0|
|       ds_action_num| 2.0|
|restaurantsDelive...| 0.0|
|restaurantsTakeOu...| 1.0|
|  outdoorSeating_num| 0.0|
|     bikeParking_num| 1.0|
|        review_count|12.0|
|restaurantsPriceR...| 2.0|
+--------------------+----+



In [73]:
# Now we can replace the missing values in each column with that column's mode (there are no missing vals in reviewcount)
ratings_final = ratings_final.fillna({'ds_severity_num':1.0})
ratings_final = ratings_final.fillna({'ds_status_ins_num':2.0})
ratings_final = ratings_final.fillna({'ds_action_num':2.0})
ratings_final = ratings_final.fillna({'restaurantsDelivery_num':0.0})
ratings_final = ratings_final.fillna({'restaurantsTakeOut_num':1.0})
ratings_final = ratings_final.fillna({'outdoorSeating_num':0.0})
ratings_final = ratings_final.fillna({'bikeParking_num':1.0})
ratings_final = ratings_final.fillna({'restaurantsPriceRange_num':2.0})

In [74]:
# Check missing values
print('There are {} missing values in ds_severity_num.'.format(ratings_final.filter(ratings_final['ds_severity_num'].isNull()).count()))
print('There are {} missing values in ds_status_ins_num.'.format(ratings_final.filter(ratings_final['ds_status_ins_num'].isNull()).count()))
print('There are {} missing values in ds_action_num.'.format(ratings_final.filter(ratings_final['ds_action_num'].isNull()).count()))
print('There are {} missing values in review_count.'.format(ratings_final.filter(ratings_final['review_count'].isNull()).count()))
print('There are {} missing values in restaurantsDelivery_num.'.format(ratings_final.filter(ratings_final['restaurantsDelivery_num'].isNull()).count()))
print('There are {} missing values in restaurantsTakeOut_num.'.format(ratings_final.filter(ratings_final['restaurantsTakeOut_num'].isNull()).count()))
print('There are {} missing values in outdoorSeating_num.'.format(ratings_final.filter(ratings_final['outdoorSeating_num'].isNull()).count()))
print('There are {} missing values in bikeParking_num.'.format(ratings_final.filter(ratings_final['bikeParking_num'].isNull()).count()))
print('There are {} missing values in restaurantsPriceRange_num.'.format(ratings_final.filter(ratings_final['restaurantsPriceRange_num'].isNull()).count()))

There are 0 missing values in ds_severity_num.
There are 0 missing values in ds_status_ins_num.
There are 0 missing values in ds_action_num.
There are 0 missing values in review_count.
There are 0 missing values in restaurantsDelivery_num.
There are 0 missing values in restaurantsTakeOut_num.
There are 0 missing values in outdoorSeating_num.
There are 0 missing values in bikeParking_num.
There are 0 missing values in restaurantsPriceRange_num.


Now we can proceed with the model building.

### Scaling Features into 1 Vector

In [75]:
# create a new df with label and features columns
assembler = VectorAssembler(inputCols=ratings_final.columns[1:],
                           outputCol="features")

ratings_scaled = assembler.transform(ratings_final)
ratings_scaled = ratings_scaled.select(['rating_num','features'])
ratings_scaled.show(5, truncate=False)

+----------+--------------------------------------+
|rating_num|features                              |
+----------+--------------------------------------+
|0         |[1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]|
|0         |[1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]|
|0         |[1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]|
|0         |[1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]|
|2         |[1.0,2.0,2.0,0.0,0.0,0.0,1.0,4.0,3.0] |
+----------+--------------------------------------+
only showing top 5 rows



## 1) LogisticRegressionWithLBFGS Model (Imputed Data)
First we are going to use LogisticRegressionWithLBFGS because it includes feature scaling and L2 regularization.

In order to use LogisticRegressionWithLBFGS, we have to convert our dataframe to an RDD and use the labeledpoint format.

In [78]:
# select which features to use (all)
assembler_1 = VectorAssembler(inputCols=['ds_severity_num','ds_status_ins_num',
                                          'ds_action_num','restaurantsDelivery_num',
                                          'restaurantsTakeOut_num','outdoorSeating_num',
                                         'bikeParking_num','review_count','restaurantsPriceRange_num'], outputCol="features") 

transformed_1 = assembler_1.transform(ratings_final)


In [79]:
# convert to RDD, since MLlib uses RDDs not dataframe
ratings_final_rdd = transformed_1.select(col("rating_num"), col("features")).rdd.map(tuple)
ratings_final_rdd.take(5)

[(0, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 0.0, 1.0, 41.0, 3.0])),
 (0, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 0.0, 1.0, 41.0, 3.0])),
 (0, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 0.0, 1.0, 41.0, 3.0])),
 (0, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 0.0, 1.0, 41.0, 3.0])),
 (2, DenseVector([1.0, 2.0, 2.0, 0.0, 0.0, 0.0, 1.0, 4.0, 3.0]))]

In [80]:
# convert to LabeledPoint type
ratings_final_lp = ratings_final_rdd.map(lambda row:(row[0],Vectors.dense(row[1]))).map(lambda row:LabeledPoint(row[0],row[1]))
ratings_final_lp.take(5)

[LabeledPoint(0.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]),
 LabeledPoint(0.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]),
 LabeledPoint(0.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]),
 LabeledPoint(0.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,41.0,3.0]),
 LabeledPoint(2.0, [1.0,2.0,2.0,0.0,0.0,0.0,1.0,4.0,3.0])]

In [81]:
# split RDD into 80% train, 20% test
ratings_train_lp, ratings_test_lp = ratings_final_lp.randomSplit([0.8, 0.2], seed=314)

# check if split is approximately 80-20 train, test
(ratings_train_lp.count()/ratings_final_lp.count(), ratings_test_lp.count()/ratings_final_lp.count(), ratings_final_lp.count()/ratings_final_lp.count())

(0.7880085653104925, 0.21199143468950749, 1.0)

In [82]:
# build the model
ratings_model = LogisticRegressionWithLBFGS.train(ratings_train_lp,numClasses=3)
ratings_test_lp.take(5)

[LabeledPoint(2.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,4.0,2.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,0.0,1.0,1.0,1.0,107.0,2.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,71.0,2.0]),
 LabeledPoint(2.0, [1.0,2.0,2.0,0.0,1.0,1.0,1.0,40.0,2.0])]

### Evaluation of LogisticRegressionWithLBFGS Model (Imputed Data)

In [83]:
# map the prediction and label for each entry 
predictionAndLabels = ratings_test_lp.map(lambda lp: (float(ratings_model.predict(lp.features)), lp.label))                                 

# create a metrics object based on predictions
metrics = MulticlassMetrics(predictionAndLabels)

# create a dataframe to store model metrics for each label
evaldata = {'Label = 0.0':[metrics.precision(label=0.0),metrics.recall(label=0.0),metrics.fMeasure(label=0.0)],
            'Label = 1.0':[metrics.precision(label=1.0),metrics.recall(label=1.0),metrics.fMeasure(label=1.0)],
            'Label = 2.0':[metrics.precision(label=2.0),metrics.recall(label=2.0),metrics.fMeasure(label=2.0)]}
lbfgs_df = pd.DataFrame(evaldata,index=['precision','recall','f1 score'])
labelsAndPreds_test = ratings_test_lp.map(lambda p: (p.label, ratings_model.predict(p.features)))
accuracy_te = 1.0 * labelsAndPreds_test.filter(lambda pl: pl[0] == pl[1]).count() / ratings_test_lp.count()

print(lbfgs_df)
print('=================================================')
print('Overall Model Accuracy (test): {}'.format(accuracy_te))


           Label = 0.0  Label = 1.0  Label = 2.0
precision     0.439024     0.489083     0.370370
recall        0.236842     0.842105     0.113636
f1 score      0.307692     0.618785     0.173913
Overall Model Accuracy (test): 0.4713804713804714


### Split Data into 80% Training and 20% Testing Sets
This split will be used for the RandomForestClassifier and NaiveBayes models.

In [85]:
# split data 
seed=314
train_test = [0.80, 0.20]
ratings_train, ratings_test = ratings_scaled.randomSplit(train_test,seed=seed)

print('Training set has {} observations.'.format(ratings_train.count()))
print('Testing set has {} observations.'.format(ratings_test.count()))

Training set has 1106 observations.
Testing set has 295 observations.


## 2) Random Forest Classifier Model (Imputed Data)

In [86]:
# initialize model
ratings_rfc1 = RandomForestClassifier(numTrees=9, featuresCol='features',labelCol='rating_num')

# fit model on training data
ratings_rfc_model1 = ratings_rfc1.fit(ratings_train)

In [87]:
# make predictions on test data using Random Forest classifier
ratings_rfc_pred1 = ratings_rfc_model1.transform(ratings_test)
ratings_rfc_pred1.show(1)

+----------+--------------------+--------------------+--------------------+----------+
|rating_num|            features|       rawPrediction|         probability|prediction|
+----------+--------------------+--------------------+--------------------+----------+
|         0|[0.0,2.0,2.0,0.0,...|[3.63063380677546...|[0.40340375630838...|       0.0|
+----------+--------------------+--------------------+--------------------+----------+
only showing top 1 row



### Evaluation of Random Forest Model (Imputed Data)

In [91]:
ratings_rfc_pred1.select('rating_num','prediction','probability').show(5)

+----------+----------+--------------------+
|rating_num|prediction|         probability|
+----------+----------+--------------------+
|         0|       0.0|[0.40340375630838...|
|         0|       0.0|[0.43357633755891...|
|         0|       2.0|[0.22378854620293...|
|         0|       2.0|[0.32913569194417...|
|         0|       0.0|[0.56408288601058...|
+----------+----------+--------------------+
only showing top 5 rows



In [90]:
eval = MulticlassClassificationEvaluator(labelCol='rating_num',predictionCol='prediction',metricName="accuracy")
acc_rfc_im = eval.evaluate(ratings_rfc_pred1)
print('Accuracy using RandomForestClassifier with Imputed Values= %g' % (acc_rfc_im))

Accuracy using RandomForestClassifier with Imputed Values= 0.555932


## 3) Naive Bayes Classifier (Imputed Data)

In [92]:
# initialize model
ratings_nb1 = NaiveBayes(smoothing=0.5, modelType='multinomial',labelCol='rating_num',featuresCol='features')

# fit on training data
ratings_nb_model1 = ratings_nb1.fit(ratings_train)

In [93]:
# make predictions on test data using Naive Bayes classifier
ratings_nb_pred1 =  ratings_nb_model1.transform(ratings_test)
ratings_nb_pred1.show(5)

+----------+--------------------+--------------------+--------------------+----------+
|rating_num|            features|       rawPrediction|         probability|prediction|
+----------+--------------------+--------------------+--------------------+----------+
|         0|[0.0,2.0,2.0,0.0,...|[-27.562421848768...|[0.65156041508465...|       0.0|
|         0|[0.0,2.0,2.0,0.0,...|[-25.345883237094...|[0.58543790393481...|       0.0|
|         0|[0.0,2.0,2.0,0.0,...|[-32.444180692090...|[0.61587495829703...|       0.0|
|         0|[1.0,2.0,1.0,0.0,...|[-28.904190675738...|[0.63934097509873...|       0.0|
|         0|[1.0,2.0,1.0,0.0,...|[-26.978824177823...|[0.54955311034679...|       0.0|
+----------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



### Evaluation of Naive Bayes Model (Imputed Data)

In [95]:
# accuracy
eval = MulticlassClassificationEvaluator(labelCol='rating_num',predictionCol='prediction',metricName="accuracy")
acc_nb_im = eval.evaluate(ratings_nb_pred1)
print('Accuracy using Naive Bayes Classifier with Imputed Values=  = %g' % (acc_nb_im))

Accuracy using Naive Bayes Classifier with Imputed Values=  = 0.427119


## Strategy 2: Drop Rows that Contain Null/Missing values
We can see how the model performs if we were to drop all the null values instead of imputing them with the mode.

In [96]:
# create new dataframe
dineSafe_yelp_data.createOrReplaceTempView("dineSafe_yelp_data")
ratings_df_2 = spark.sql("SELECT rating, ds_severity, ds_status_ins, ds_action, review_count, attributes.RestaurantsDelivery, attributes.RestaurantsTakeOut, attributes.OutdoorSeating, attributes.BikeParking, attributes.RestaurantsPriceRange2 FROM dineSafe_yelp_data")
ratings_df_2.show(5)

+------+-----------+-------------+----------------+------------+-------------------+------------------+--------------+-----------+----------------------+
|rating|ds_severity|ds_status_ins|       ds_action|review_count|RestaurantsDelivery|RestaurantsTakeOut|OutdoorSeating|BikeParking|RestaurantsPriceRange2|
+------+-----------+-------------+----------------+------------+-------------------+------------------+--------------+-----------+----------------------+
|   bad|  M - Minor|         Pass|Notice to Comply|          41|               null|              True|          null|       True|                     3|
|   bad|       null|         Pass|            null|          41|               null|              True|          null|       True|                     3|
|   bad|       null|         Pass|            null|          41|               null|              True|          null|       True|                     3|
|   bad|       null|         Pass|            null|          41|            

In [97]:
# drop missing values
print('There are {} rows that contain NULL/missing values in ratings_df_2.'.format(ratings_df_2.subtract(ratings_df_2.na.drop()).count()))
ratings_df_2_fixed=ratings_df_2.na.drop()
print('After .na.drop(), there are {} rows that contain NULL/missing values in ratings_df_2_fixed.'.format(ratings_df_2_fixed.subtract(ratings_df_2_fixed.dropna()).count()))

There are 592 rows that contain NULL/missing values in ratings_df_2.
After .na.drop(), there are 0 rows that contain NULL/missing values in ratings_df_2_fixed.


In [98]:
# create new columns that represent the predictors numerically
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('rating_num', ratings_num)
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('ds_severity_num', ds_severity_num)
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('ds_status_ins_num', ds_status_ins_num)
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('ds_action_num', ds_action_num)
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('restaurantsDelivery_num', F.when(ratings_df_2_fixed['RestaurantsDelivery'] == 'False', 0).when(ratings_df_2_fixed['RestaurantsDelivery'] == 'True', 1))
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('restaurantsTakeOut_num', F.when(ratings_df_2_fixed['RestaurantsTakeOut'] == 'False', 0).when(ratings_df_2_fixed['RestaurantsTakeout'] == 'True', 1))
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('outdoorSeating_num', F.when(ratings_df_2_fixed['OutdoorSeating'] == 'False', 0).when(ratings_df_2_fixed['OutdoorSeating'] == 'True', 1))
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('bikeParking_num', F.when(ratings_df_2_fixed['BikeParking'] == 'False', 0).when(ratings_df_2_fixed['BikeParking'] == 'True', 1))
ratings_df_2_fixed = ratings_df_2_fixed.withColumn('restaurantsPriceRange_num', ratings_df_2_fixed['RestaurantsPriceRange2'].cast('double')) 

In [99]:
# create new dataframe that only contains the numerically represented predictors
pred_keep_2 = ['rating_num','ds_severity_num','ds_status_ins_num', 'ds_action_num','restaurantsDelivery_num',
             'restaurantsTakeOut_num','outdoorSeating_num','bikeParking_num','review_count','restaurantsPriceRange_num']
ratings_2_final = ratings_df_2_fixed.select(pred_keep_2)
ratings_2_final.show(3)

+----------+---------------+-----------------+-------------+-----------------------+----------------------+------------------+---------------+------------+-------------------------+
|rating_num|ds_severity_num|ds_status_ins_num|ds_action_num|restaurantsDelivery_num|restaurantsTakeOut_num|outdoorSeating_num|bikeParking_num|review_count|restaurantsPriceRange_num|
+----------+---------------+-----------------+-------------+-----------------------+----------------------+------------------+---------------+------------+-------------------------+
|         2|              1|                2|            2|                      0|                     1|                 1|              1|          13|                      2.0|
|         2|              2|                1|            2|                      0|                     1|                 1|              1|          13|                      2.0|
|         1|              2|                1|            2|                      0|      

### Scaling Features into 1 Vector

In [100]:
# create a new df with label and features columns
assembler = VectorAssembler(inputCols=ratings_2_final.columns[1:],
                           outputCol="features")

ratings_2_scaled = assembler.transform(ratings_2_final)
ratings_2_scaled = ratings_2_scaled.select(['rating_num','features'])
ratings_2_scaled.show(5, truncate=False)

+----------+---------------------------------------+
|rating_num|features                               |
+----------+---------------------------------------+
|2         |[1.0,2.0,2.0,0.0,1.0,1.0,1.0,13.0,2.0] |
|2         |[2.0,1.0,2.0,0.0,1.0,1.0,1.0,13.0,2.0] |
|1         |[2.0,1.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0]|
|1         |[1.0,2.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0]|
|1         |[1.0,2.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0]|
+----------+---------------------------------------+
only showing top 5 rows



## 1) LogisticRegressionWithLBFGS Model (with dropped NULL values)


In [101]:
# select which features to use (all)
assembler_2 = VectorAssembler(inputCols=['ds_severity_num','ds_status_ins_num',
                                          'ds_action_num','restaurantsDelivery_num',
                                          'restaurantsTakeOut_num','outdoorSeating_num',
                                         'bikeParking_num','review_count','restaurantsPriceRange_num'], outputCol="features") 

transformed_2 = assembler_2.transform(ratings_2_final)

In [102]:
# convert to RDD, since MLlib uses RDDs not dataframe
ratings_final_rdd_2 = transformed_2.select(col("rating_num"), col("features")).rdd.map(tuple)
ratings_final_rdd_2.take(5)

[(2, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 1.0, 1.0, 13.0, 2.0])),
 (2, DenseVector([2.0, 1.0, 2.0, 0.0, 1.0, 1.0, 1.0, 13.0, 2.0])),
 (1, DenseVector([2.0, 1.0, 2.0, 0.0, 1.0, 0.0, 1.0, 201.0, 2.0])),
 (1, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 0.0, 1.0, 201.0, 2.0])),
 (1, DenseVector([1.0, 2.0, 2.0, 0.0, 1.0, 0.0, 1.0, 201.0, 2.0]))]

In [103]:
# convert to LabeledPoint type
ratings_final_lp_2 = ratings_final_rdd_2.map(lambda row:(row[0],Vectors.dense(row[1]))).map(lambda row:LabeledPoint(row[0],row[1]))
ratings_final_lp_2.take(5)

[LabeledPoint(2.0, [1.0,2.0,2.0,0.0,1.0,1.0,1.0,13.0,2.0]),
 LabeledPoint(2.0, [2.0,1.0,2.0,0.0,1.0,1.0,1.0,13.0,2.0]),
 LabeledPoint(1.0, [2.0,1.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,0.0,1.0,0.0,1.0,201.0,2.0])]

In [104]:
# split RDD into 80% train, 20% test
ratings_train_lp_2, ratings_test_lp_2 = ratings_final_lp_2.randomSplit([0.8, 0.2], seed=314)

# check if split is approximately 80-20 train, test
(ratings_train_lp_2.count()/ratings_final_lp_2.count(), ratings_test_lp_2.count()/ratings_final_lp_2.count(), ratings_final_lp_2.count()/ratings_final_lp_2.count())

(0.7945205479452054, 0.2054794520547945, 1.0)

In [105]:
# build the model 
ratings_model_2 = LogisticRegressionWithLBFGS.train(ratings_train_lp_2,numClasses=3)
ratings_test_lp_2.take(5)

[LabeledPoint(1.0, [2.0,2.0,1.0,0.0,1.0,0.0,1.0,71.0,2.0]),
 LabeledPoint(2.0, [1.0,2.0,2.0,0.0,1.0,0.0,0.0,119.0,2.0]),
 LabeledPoint(1.0, [0.0,2.0,2.0,0.0,1.0,1.0,1.0,49.0,1.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,1.0,1.0,0.0,1.0,27.0,1.0]),
 LabeledPoint(1.0, [1.0,2.0,2.0,1.0,1.0,0.0,1.0,12.0,1.0])]

### Evaluation of LogisticRegressionWithLBFGS Model (with dropped NULL values)

In [108]:
# map the prediction and label for each entry 
predictionAndLabels2 = ratings_test_lp_2.map(lambda lp: (float(ratings_model_2.predict(lp.features)), lp.label))                                 

# create a metrics object based on predictions
metrics2 = MulticlassMetrics(predictionAndLabels2)

# create a dataframe to store model metrics for each label
evaldata2 = {'Label = 0.0':[metrics2.precision(label=0.0),metrics2.recall(label=0.0),metrics2.fMeasure(label=0.0)],
            'Label = 1.0':[metrics2.precision(label=1.0),metrics2.recall(label=1.0),metrics2.fMeasure(label=1.0)],
            'Label = 2.0':[metrics2.precision(label=2.0),metrics2.recall(label=2.0),metrics2.fMeasure(label=2.0)]}
lbfgs_df2 = pd.DataFrame(evaldata2,index=['precision','recall','f1 score'])
labelsAndPreds_test2 = ratings_test_lp_2.map(lambda p: (p.label, ratings_model_2.predict(p.features)))
accuracy_te2 = 1.0 * labelsAndPreds_test2.filter(lambda pl: pl[0] == pl[1]).count() / ratings_test_lp_2.count()

print(lbfgs_df2)
print('=================================================')
print('Overall Model Accuracy (test): {}'.format(accuracy_te2))


           Label = 0.0  Label = 1.0  Label = 2.0
precision     0.333333     0.532258          0.0
recall        0.210526     0.868421          0.0
f1 score      0.258065     0.660000          0.0
Overall Model Accuracy (test): 0.49333333333333335


### Split Data into 80% Training and 20% Testing Sets (with dropped NULL values)
This split will be used for the RandomForestClassifier and NaiveBayes models.

In [109]:
seed=314
train_test = [0.80, 0.20]
ratings_train, ratings_test = ratings_2_scaled.randomSplit(train_test,seed=seed)

print('Training set has {} observations.'.format(ratings_train.count()))
print('Testing set has {} observations.'.format(ratings_test.count()))

Training set has 289 observations.
Testing set has 76 observations.


## 2) Random Forest Classifier (with dropped NULL values)

In [34]:
# initialize model
ratings_rfc2 = RandomForestClassifier(numTrees=9, featuresCol='features',labelCol='rating_num')

# fit model on training data
ratings_rfc_model2 = ratings_rfc2.fit(ratings_train)

In [35]:
# make predictions on test data using Random Forest classifier
ratings_rfc_pred2 = ratings_rfc_model2.transform(ratings_test)
ratings_rfc_pred2.show(3)

+----------+--------------------+--------------------+--------------------+----------+
|rating_num|            features|       rawPrediction|         probability|prediction|
+----------+--------------------+--------------------+--------------------+----------+
|         0|[1.0,1.0,2.0,0.0,...|[1.70215657019686...|[0.18912850779965...|       1.0|
|         0|[1.0,2.0,1.0,0.0,...|[3.57669495635402...|[0.39741055070600...|       1.0|
|         0|[1.0,2.0,2.0,0.0,...|[3.70560109289617...|[0.41173345476624...|       0.0|
+----------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



### Evaluation of Random Forest Classifier Model (with dropped NULL values)

In [110]:
# accuracy
eval = MulticlassClassificationEvaluator(labelCol='rating_num',predictionCol='prediction',metricName="accuracy")
acc_rfc = eval.evaluate(ratings_rfc_pred2)
print('Accuracy for Random Forest Classifier Model (with dropped NULL values) = %g' % (acc_rfc))

Accuracy for Random Forest Classifier Model (with dropped NULL values) = 0.671053


## 3) Naive Bayes Classifier (with dropped NULL values)

In [111]:
# initialize model
ratings_nb2 = NaiveBayes(smoothing=0.5, modelType='multinomial',labelCol='rating_num',featuresCol='features')

# fit on training data
ratings_nb_model2 = ratings_nb2.fit(ratings_train)

In [113]:
# make predictions on test data using Naive Bayes classifier
ratings_nb_pred2 =  ratings_nb_model2.transform(ratings_test)
ratings_nb_pred2.show(3)

+----------+--------------------+--------------------+--------------------+----------+
|rating_num|            features|       rawPrediction|         probability|prediction|
+----------+--------------------+--------------------+--------------------+----------+
|         0|[1.0,1.0,2.0,0.0,...|[-37.916116355028...|[0.20661779230043...|       1.0|
|         0|[1.0,2.0,1.0,0.0,...|[-30.335501127766...|[0.25104515800018...|       1.0|
|         0|[1.0,2.0,2.0,0.0,...|[-30.433046029020...|[0.28394162720222...|       1.0|
+----------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



### Evaluation of Naive Bayes Classifer (with dropped NULL values)

In [114]:
# accuracy
eval = MulticlassClassificationEvaluator(labelCol='rating_num',predictionCol='prediction',metricName="accuracy")
acc_nb = eval.evaluate(ratings_nb_pred2)
print('Accuracy for Naive Bayes Classifier Model (with dropped NULL values) = %g' % (acc_nb))

Accuracy for Naive Bayes Classifier Model (with dropped NULL values) = 0.552632


## Comparing Accuracy of 3 models With/Without Imputed Data:


In [121]:
# construct dataframe of accuracy for each model
acc_noimpute_arr = [acc_lr,acc_rfc,acc_nb]
acc_impute_arr = [acc_lr_im,acc_rfc_im,acc_nb_im]
acc_labels = ['LogisticRegressionWithLBGFS','RandomForestClassifier','NaiveBayes']
acc_df = pd.DataFrame(acc_labels)
acc_df.columns=['Accuracy']
acc_df['Dropped NULL Values']=acc_noimpute_arr
acc_df['Using Imputed Data']=acc_impute_arr

acc_df

Unnamed: 0,Accuracy,Dropped NULL Values,Using Imputed Data
0,LogisticRegressionWithLBGFS,0.618421,0.447458
1,RandomForestClassifier,0.671053,0.555932
2,NaiveBayes,0.552632,0.427119
