In [0]:
%fs ls /FileStore/tables

path,name,size
dbfs:/FileStore/tables/EDA.ipynb,EDA.ipynb,3946
dbfs:/FileStore/tables/Zomato_reduced.csv,Zomato_reduced.csv,126883203
dbfs:/FileStore/tables/lab/,lab/,0
dbfs:/FileStore/tables/restaurants_in_India-1.csv,restaurants_in_India-1.csv,113670506
dbfs:/FileStore/tables/restaurants_in_India.csv,restaurants_in_India.csv,113670506


In [0]:
# Read both restaurant CSV exports from DBFS. The first row of each file is
# used as the header and Spark infers the column types.
res_df1 = spark.read.csv("/FileStore/tables/Zomato_reduced.csv", header=True, inferSchema=True)
res_df2 = spark.read.csv("/FileStore/tables/restaurants_in_India.csv", header=True, inferSchema=True)

In [0]:
# Show the column names of the frame loaded from Zomato_reduced.csv.
res_df1.columns

In [0]:
# Show the column names of the frame loaded from restaurants_in_India.csv.
res_df2.columns

In [0]:
# DataFrame.summary is a method: it has to be called (and the resulting frame
# shown) to get the statistics. A bare `res_df2.summary` only displays the
# bound-method repr.
res_df2.summary().show()

In [0]:
# Cast the four numeric fields used in the rest of the notebook to double.
for numeric_col in ("average_cost_for_two", "price_range", "aggregate_rating", "votes"):
    res_df2 = res_df2.withColumn(numeric_col, res_df2[numeric_col].cast("double"))

In [0]:
# Count, for every column of the restaurant data, the rows whose value is
# NaN or null, and show the counts as a single-row frame.
from pyspark.sql.functions import isnan, when, count, col

missing_counts = []
for name in res_df2.columns:
    is_missing = isnan(name) | col(name).isNull()
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so only the missing rows are counted.
    missing_counts.append(count(when(is_missing, name)).alias(name))
res_df2.select(missing_counts).show()

In [0]:
# Summary statistics for the four numeric fields. Selecting the columns
# *before* describe() keeps the "summary" label column in the output, so each
# row is identifiable, and avoids computing statistics for every other column.
res_df2.select('average_cost_for_two','price_range','aggregate_rating','votes').describe().show()

In [0]:
# average_cost_for_two: rows whose value is exactly 0 or exactly 1 are
# overwritten with NaN so that they count as missing; every other row keeps
# its value. (price_range and votes are handled in the following cells.)
import numpy as np
from pyspark.sql.functions import when

cost = res_df2.average_cost_for_two
res_df2 = res_df2.withColumn(
    "average_cost_for_two",
    when((cost == 0) | (cost == 1), np.nan).otherwise(cost),
)

In [0]:
# price_range: a value of 0 is replaced with NaN so it counts as missing.
# aggregate_rating is not modified here (its replacement is disabled).
price = res_df2.price_range
res_df2 = res_df2.withColumn("price_range", when(price == 0, np.nan).otherwise(price))

In [0]:
# A vote count cannot be negative, so every negative value is replaced with
# NaN and treated as missing. Comparing with `< 0` covers the -18 and -3
# values without hard-coding them, and also catches any other negative value.
res_df2 = res_df2.withColumn("votes", when(res_df2.votes < 0, np.nan).otherwise(res_df2.votes))

In [0]:
# Re-check the statistics of the four numeric fields after the replacements.
# Selecting the columns *before* describe() keeps the "summary" label column
# in the output and avoids computing statistics for every other column.
res_df2.select('average_cost_for_two','price_range','aggregate_rating','votes').describe().show()

In [0]:
# DataFrame.summary is a method: call it and show the result. A bare
# `res_df2.summary` only displays the bound-method repr.
res_df2.summary().show()

In [0]:
# The placeholder values in the numeric fields are now NaN, so they can be
# filled by an Imputer. It is used with its default settings and writes each
# imputed column back under the same name.
# NOTE(review): the imputer is fitted on the full frame, before the
# train/test split further down - confirm that this is intended.
from pyspark.ml.feature import Imputer

impute_cols = ["average_cost_for_two", "price_range", "aggregate_rating", "votes"]
imputer = Imputer(inputCols=impute_cols, outputCols=impute_cols)
model = imputer.fit(res_df2)
res_df2 = model.transform(res_df2)
res_df2.select(*impute_cols).show(5)

In [0]:
# Combine the four numeric columns into a single vector column, "features2".
# NOTE(review): aggregate_rating is part of this feature vector and is also
# used as the label column by the ChiSqSelector cell further down - confirm
# that the label is really meant to be one of the features.
from pyspark.ml.feature import VectorAssembler

cols = ["average_cost_for_two", "price_range", "aggregate_rating", "votes"]
assembler = VectorAssembler(inputCols=cols, outputCol="features2")
# transform() appends the assembled vector column to the frame.
res_df2 = assembler.transform(res_df2)
res_df2.select("features2").show(truncate=False)

In [0]:
# Scale the assembled "features2" vector with a StandardScaler (default
# settings) and store the result in "Scaled_features".
from pyspark.ml.feature import StandardScaler

standardscaler = StandardScaler(inputCol="features2", outputCol="Scaled_features")
scaler_model = standardscaler.fit(res_df2)
res_df2 = scaler_model.transform(res_df2)
res_df2.select("features2", "Scaled_features").show(5)

In [0]:
# Random 80/20 split into a training and a test set, using a fixed seed.
train, test = res_df2.randomSplit(weights=[0.8, 0.2], seed=12345)

In [0]:
# Check for class imbalance: how many training rows have aggregate_rating == 0.
# NOTE: despite their names, numPositives holds the number of zero-rated rows
# and numNegatives holds the number of remaining rows; per_ones is the
# percentage of zero-rated rows.
dataset_size = float(train.count())
numPositives = train.where(train.aggregate_rating == 0).count()
numNegatives = float(dataset_size - numPositives)
per_ones = (float(numPositives) / float(dataset_size)) * 100
print('The number of zeros are {}'.format(numPositives))
print('Percentage of zeros are {}'.format(per_ones))

In [0]:
#Handling of imbalance class handling
# Since the percentage of ones in the dataset is just 11.1 % ,their is slight imbalance in the dataset.

In [0]:
#In the dataset (train) we have 11.1 % negatives and 88.9 % positives. Therefore,logistic loss objective function should treat the positive class (aggregate_rating > 0) with slightly higher weight. For this purpose we calculate the BalancingRatio as follows:

#BalancingRatio= numNegatives/dataset_size

#Then against every aggregate_rating > 0, we put BalancingRatio in column "classWeights", and against every aggregate_rating = 0, we put 1-BalancingRatio in column "classWeights"

#In this way, we assign higher weightage to the minority class (i.e. positive class)

In [0]:
BalancingRatio= numNegatives/dataset_size
print('BalancingRatio = {}'.format(BalancingRatio))

In [0]:
train=train.withColumn("classWeights", when(train.aggregate_rating > 0,BalancingRatio).otherwise(1-BalancingRatio))
train.select("classWeights").show(5)

In [0]:
# Feature selection with a chi-square selector: reads "Scaled_features",
# uses aggregate_rating as the label and writes the selected features to
# "Aspect".
from pyspark.ml.feature import ChiSqSelector

# NOTE(review): fpr is presumably only honoured when selectorType="fpr";
# selectorType is not set here, so confirm which selection mode is intended.
css = ChiSqSelector(featuresCol='Scaled_features',outputCol='Aspect',labelCol='aggregate_rating',fpr=0.05)

# Fit the selector on the training data only and apply that one fitted model
# to both splits. Fitting a second selector on the test set could pick a
# different set of features than for train and would learn from test data.
css_model = css.fit(train)
train = css_model.transform(train)
test = css_model.transform(test)
test.select("Aspect").show(5,truncate=False)