<h3> Create a spark session and load the Incident Management Data set

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('Desicion-Tree').getOrCreate()

In [0]:
# File Upload for all the models

file_location = "/FileStore/tables/airbnb.csv"
file_type = "csv"
 
# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
 
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

In [0]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint



In [0]:
data = df

In [0]:
data=data.dropna()

In [0]:
display(data)

host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Number_of_amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,1,Roslindale,42.28624082,-71.13437396,Apartment,Private room,2,1.0,1,1,Real Bed,20,0,0,2,36,804,94,moderate,65,0,lte_$75
1,1,Roslindale,42.29243789,-71.13576525,Apartment,Private room,2,1.0,1,1,Real Bed,17,1,20,3,41,2574,98,moderate,65,0,lte_$75
0,0,Roslindale,42.28110619,-71.12102117,House,Private room,4,1.0,1,2,Real Bed,22,2,25,1,1,0,100,moderate,75,0,lte_$75
1,1,Roslindale,42.28451221,-71.13625805,House,Private room,2,1.5,1,2,Real Bed,13,1,0,2,29,380,99,flexible,79,0,btw_$75-$150
1,1,Roslindale,42.2916898,-71.13189277,Condominium,Private room,2,1.0,1,1,Real Bed,12,1,0,2,8,130,100,flexible,75,0,lte_$75
0,1,Roslindale,42.28138963,-71.13119042,Apartment,Entire home/apt,3,1.0,1,2,Real Bed,12,1,25,1,57,421,90,strict,100,0,btw_$75-$150
1,1,Roslindale,42.2819461,-71.14102161,House,Private room,2,2.0,1,1,Real Bed,22,1,15,1,67,840,96,moderate,75,0,lte_$75
1,1,Roslindale,42.28587764,-71.12490956,Condominium,Private room,2,1.0,1,2,Real Bed,9,2,0,2,65,355,96,moderate,58,0,lte_$75
1,1,Roslindale,42.28882028,-71.1395101,Apartment,Entire home/apt,5,1.0,2,2,Real Bed,21,4,25,4,33,876,94,strict,229,1,gte_226
0,0,Roslindale,42.2864482,-71.13932539,House,Private room,2,1.0,1,1,Real Bed,15,1,10,1,1,0,80,flexible,60,0,lte_$75


In [0]:
# Create a 70-30 train test split
 
train_data,test_data=data.randomSplit([0.7,0.3])

In [0]:
df.printSchema()

root
 |-- host_is_superhost: integer (nullable = true)
 |-- host_identity_verified: integer (nullable = true)
 |-- neighbourhood_cleansed: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- beds: integer (nullable = true)
 |-- bed_type: string (nullable = true)
 |-- Number_of_amenities: integer (nullable = true)
 |-- guests_included: integer (nullable = true)
 |-- price_per_extra_person: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- number_days_btw_first_last_review: integer (nullable = true)
 |-- review_scores_rating: integer (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- 

<h3> Data transformation

In [0]:
# Use StringIndexer to convert the categorical columns to hold numerical data
 
host_is_superhost_indexer = StringIndexer(inputCol='host_is_superhost',outputCol='host_is_superhost_index',handleInvalid='keep')
host_identity_verified_indexer = StringIndexer(inputCol='host_identity_verified',outputCol='host_identity_verified_index',handleInvalid='keep')
neighbourhood_cleansed_indexer = StringIndexer(inputCol='neighbourhood_cleansed',outputCol='neighbourhood_cleansed_index',handleInvalid='keep')
property_type_indexer = StringIndexer(inputCol='property_type',outputCol='property_type_index',handleInvalid='keep')
room_type_indexer = StringIndexer(inputCol='room_type',outputCol='room_type_index',handleInvalid='keep')
bed_type_indexer = StringIndexer(inputCol='bed_type',outputCol='bed_type_index',handleInvalid='keep')
cancellation_policy_group_indexer = StringIndexer(inputCol='cancellation_policy',outputCol='cancellation_policy_index',handleInvalid='keep')
price_category_indexer = StringIndexer(inputCol='price_category',outputCol='price_category_index',handleInvalid='keep')

In [0]:
# Vector assembler is used to create a vector of input features
 
assembler = VectorAssembler(inputCols=['host_is_superhost_index','host_identity_verified_index','neighbourhood_cleansed_index',
                                       'property_type_index','room_type_index','bed_type_index','cancellation_policy_index',
                                       'price_category_index','latitude','longitude','accommodates','bathrooms',
                                       'bedrooms','beds','Number_of_amenities','guests_included','price_per_extra_person',
                                       'minimum_nights','number_of_reviews','number_days_btw_first_last_review',
                                       'review_scores_rating','price','price_gte_150'],
                            outputCol="unscaled_features")

In [0]:
# Standard scaler is used to scale the data for the linear SVC to perform well on the training data
 
scaler = StandardScaler(inputCol="unscaled_features",outputCol="features")

In [0]:
# Create SVM model
svm = LinearSVC(labelCol='price_gte_150',maxIter=10, regParam=0.1)

In [0]:
# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data.
 
pipe = Pipeline(stages=[host_is_superhost_indexer,host_identity_verified_indexer,neighbourhood_cleansed_indexer,property_type_indexer,room_type_indexer,bed_type_indexer,cancellation_policy_group_indexer,price_category_indexer, assembler,scaler, svm])

In [0]:
fit_model=pipe.fit(train_data)

In [0]:
# Store the results in a dataframe
 
results = fit_model.transform(test_data)
display(results)

host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Number_of_amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category,host_is_superhost_index,host_identity_verified_index,neighbourhood_cleansed_index,property_type_index,room_type_index,bed_type_index,cancellation_policy_index,price_category_index,unscaled_features,features,rawPrediction,prediction
0,0,Allston,42.34772212,-71.13526355,Apartment,Private room,1,1.0,1,1,Real Bed,17,1,0,1,1,0,100,flexible,90,0,btw_$75-$150,0.0,1.0,4.0,0.0,1.0,0.0,2.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 1.0, 0.0, 2.0, 0.0, 42.34772212, -71.13526355, 1.0, 1.0, 1.0, 1.0, 17.0, 1.0, 0.0, 1.0, 1.0, 0.0, 100.0, 90.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 1.8959610382989898, 0.0, 2.3377264216190516, 0.0, 1685.0309277438566, -2207.417794003932, 0.5554042335982887, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 3.517895729560787, 0.8860737243233501, 0.0, 0.09746278482615306, 0.02631219862403453, 0.0, 10.10479168494852, 0.8929937130958328, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1524175289739493, -1.1524175289739493))",0.0
0,0,Allston,42.34910682,-71.12858566,Apartment,Private room,2,1.0,1,1,Real Bed,16,1,0,1,2,3,100,flexible,89,0,btw_$75-$150,0.0,1.0,4.0,0.0,1.0,0.0,2.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 1.0, 0.0, 2.0, 0.0, 42.34910682, -71.12858566, 2.0, 1.0, 1.0, 1.0, 16.0, 1.0, 0.0, 1.0, 2.0, 3.0, 100.0, 89.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 1.8959610382989898, 0.0, 2.3377264216190516, 0.0, 1685.0860254494437, -2207.2105705752588, 1.1108084671965774, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 3.310960686645447, 0.8860737243233501, 0.0, 0.09746278482615306, 0.05262439724806906, 0.007039089726767791, 10.10479168494852, 0.8830715607281013, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1543583991915431, -1.1543583991915431))",0.0
0,0,Allston,42.3504619,-71.12950478,Apartment,Entire home/apt,2,1.0,1,1,Real Bed,10,0,0,1,5,15,96,strict,110,0,btw_$75-$150,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,"Map(vectorType -> sparse, length -> 23, indices -> List(1, 2, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21), values -> List(1.0, 4.0, 42.3504619, -71.12950478, 2.0, 1.0, 1.0, 1.0, 10.0, 1.0, 5.0, 15.0, 96.0, 110.0))","Map(vectorType -> sparse, length -> 23, indices -> List(1, 2, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21), values -> List(2.365320236036661, 0.751063684911178, 1685.1399445646937, -2207.239092039039, 1.1108084671965774, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 2.0693504291534044, 0.09746278482615306, 0.13156099312017266, 0.03519544863383896, 9.700600017550578, 1.0914367604504622))","Map(vectorType -> dense, length -> 2, values -> List(1.1041561544421996, -1.1041561544421996))",0.0
0,0,Allston,42.35053057,-71.12820937,Apartment,Entire home/apt,2,1.0,1,1,Real Bed,11,1,0,2,2,5,90,moderate,66,0,lte_$75,0.0,1.0,4.0,0.0,0.0,0.0,1.0,3.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 0.0, 0.0, 1.0, 3.0, 42.35053057, -71.12820937, 2.0, 1.0, 1.0, 1.0, 11.0, 1.0, 0.0, 2.0, 2.0, 5.0, 90.0, 66.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 0.0, 0.0, 1.1688632108095258, 2.610995615430097, 1685.142676968422, -2207.198893817484, 1.1108084671965774, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 2.2762854720687447, 0.8860737243233501, 0.0, 0.19492556965230612, 0.05262439724806906, 0.011731816211279653, 9.094312516453668, 0.6548620562702774, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.0819118234030896, -1.0819118234030896))",0.0
0,0,Allston,42.35122488,-71.13161809,Apartment,Entire home/apt,2,1.0,0,1,Pull-out Sofa,11,1,0,1,4,19,75,flexible,150,1,btw_$75-$150,0.0,1.0,4.0,0.0,0.0,2.0,2.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 0.0, 2.0, 2.0, 0.0, 42.35122488, -71.13161809, 2.0, 1.0, 0.0, 1.0, 11.0, 1.0, 0.0, 1.0, 4.0, 19.0, 75.0, 150.0, 1.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 0.0, 4.878548790311268, 2.3377264216190516, 0.0, 1685.1703038103128, -2207.304670738905, 1.1108084671965774, 2.018619235736524, 0.0, 0.9380813281757762, 2.2762854720687447, 0.8860737243233501, 0.0, 0.09746278482615306, 0.10524879449613812, 0.044580901602862676, 7.57859376371139, 1.4883228551597214, 1.9996606896278362))","Map(vectorType -> dense, length -> 2, values -> List(-1.0162468235144275, 1.0162468235144275))",1.0
0,0,Allston,42.35158931,-71.13346641,Apartment,Entire home/apt,4,1.0,1,2,Real Bed,11,1,15,3,19,522,94,moderate,90,0,btw_$75-$150,0.0,1.0,4.0,0.0,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 0.0, 0.0, 1.0, 0.0, 42.35158931, -71.13346641, 4.0, 1.0, 1.0, 2.0, 11.0, 1.0, 15.0, 3.0, 19.0, 522.0, 94.0, 90.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 0.0, 0.0, 1.1688632108095258, 0.0, 1685.1848046096536, -2207.3620264616984, 2.221616934393155, 2.018619235736524, 1.3247571465350756, 1.8761626563515523, 2.2762854720687447, 0.8860737243233501, 0.751295340464255, 0.2923883544784592, 0.4999317738566561, 1.2248016124575956, 9.498504183851608, 0.8929937130958328, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1303438867058793, -1.1303438867058793))",0.0
0,0,Allston,42.35268367,-71.12124797,Apartment,Private room,2,1.0,1,1,Real Bed,16,1,0,3,1,0,100,flexible,50,0,lte_$75,0.0,1.0,4.0,0.0,1.0,0.0,2.0,3.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 1.0, 0.0, 2.0, 3.0, 42.35268367, -71.12124797, 2.0, 1.0, 1.0, 1.0, 16.0, 1.0, 0.0, 3.0, 1.0, 0.0, 100.0, 50.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 1.8959610382989898, 0.0, 2.3377264216190516, 2.610995615430097, 1685.2283495833562, -2206.9828727125596, 1.1108084671965774, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 3.310960686645447, 0.8860737243233501, 0.0, 0.2923883544784592, 0.02631219862403453, 0.0, 10.10479168494852, 0.4961076183865738, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1353165287797364, -1.1353165287797364))",0.0
0,0,Allston,42.35315202,-71.12966786,Apartment,Private room,2,1.0,1,1,Real Bed,12,1,20,2,2,5,90,flexible,77,0,btw_$75-$150,0.0,1.0,4.0,0.0,1.0,0.0,2.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 1.0, 0.0, 2.0, 0.0, 42.35315202, -71.12966786, 2.0, 1.0, 1.0, 1.0, 12.0, 1.0, 20.0, 2.0, 2.0, 5.0, 90.0, 77.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 1.8959610382989898, 0.0, 2.3377264216190516, 0.0, 1685.246985396465, -2207.2441526190646, 1.1108084671965774, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 2.483220514984085, 0.8860737243233501, 1.0017271206190066, 0.19492556965230612, 0.05262439724806906, 0.011731816211279653, 9.094312516453668, 0.7640057323153236, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.2019760550808845, -1.2019760550808845))",0.0
0,0,Allston,42.35412286,-71.13048573,House,Private room,2,1.0,1,1,Real Bed,0,1,0,1,1,0,60,strict,80,0,btw_$75-$150,0.0,1.0,4.0,1.0,1.0,0.0,0.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 42.35412286, -71.13048573, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 60.0, 80.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.9125109258282667, 1.8959610382989898, 0.0, 0.0, 0.0, 1685.285615465427, -2207.269532166438, 1.1108084671965774, 2.018619235736524, 1.3247571465350756, 0.9380813281757762, 0.0, 0.8860737243233501, 0.0, 0.09746278482615306, 0.02631219862403453, 0.0, 6.062875010969112, 0.793772189418518, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.2353143305217684, -1.2353143305217684))",0.0
0,0,Allston,42.35849966,-71.12822625,Apartment,Entire home/apt,4,1.0,2,2,Real Bed,7,1,0,2,18,426,94,strict,145,0,btw_$75-$150,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,"Map(vectorType -> dense, length -> 23, values -> List(0.0, 1.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 42.35849966, -71.12822625, 4.0, 1.0, 2.0, 2.0, 7.0, 1.0, 0.0, 2.0, 18.0, 426.0, 94.0, 145.0, 0.0))","Map(vectorType -> dense, length -> 23, values -> List(0.0, 2.365320236036661, 0.751063684911178, 0.0, 0.0, 0.0, 0.0, 0.0, 1685.4597698944106, -2207.1994176253747, 2.221616934393155, 2.018619235736524, 2.649514293070151, 1.8761626563515523, 1.448545300407383, 0.8860737243233501, 0.0, 0.19492556965230612, 0.47361957523262155, 0.9995507412010263, 9.498504183851608, 1.438712093321064, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.0086303313374447, -1.0086303313374447))",0.0


<h3> Evaluating the model

In [0]:
results.select(['price_gte_150','prediction']).show()

+-------------+----------+
|price_gte_150|prediction|
+-------------+----------+
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            1|       1.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            1|       1.0|
|            1|       1.0|
|            1|       1.0|
|            1|       1.0|
|            0|       0.0|
|            0|       0.0|
|            1|       1.0|
+-------------+----------+
only showing top 20 rows



<h4> 1. Area under the ROC

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
AUC_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='price_gte_150',metricName='areaUnderROC')

In [0]:
AUC = AUC_evaluator.evaluate(results)

In [0]:
print("The area under the curve is {}".format(AUC))


The area under the curve is 1.0


<h4> 2. Area under the PR

In [0]:
PR_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='price_gte_150',metricName='areaUnderPR')

In [0]:
PR = PR_evaluator.evaluate(results)

In [0]:
print("The area under the PR curve is {}".format(PR))


The area under the PR curve is 1.0


<h4> 3. Accuracy

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_gte_150", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)

In [0]:
print("The accuracy of the model is {}".format(accuracy))


The accuracy of the model is 1.0


<h4> 4. Confusion Matrix

In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
y_true = results.select("price_gte_150")
y_true = y_true.toPandas()
 
y_pred = results.select("prediction")
y_pred = y_pred.toPandas()
 
cnf_matrix = confusion_matrix(y_true, y_pred)
print("Below is the confusion matrix: \n {}".format(cnf_matrix))

Below is the confusion matrix: 
 [[441   0]
 [  0 393]]


<h4> Plot

In [0]:
# extract coefficients
coefficients = fit_model.coefficients.toArray()

# plot the SVM graph
x = [-coefficients[1]/coefficients[0], 0]
y = [0, -coefficients[2]/coefficients[0]]
plt.plot(x, y)
plt.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
[0;32m<command-2248947621643674>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0;31m# extract coefficients[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mcoefficients[0m [0;34m=[0m [0mfit_model[0m[0;34m.[0m[0mcoefficients[0m[0;34m.[0m[0mtoArray[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m [0;34m[0m[0m
[1;32m      4[0m [0;31m# plot the SVM graph[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0mx[0m [0;34m=[0m [0;34m[[0m[0;34m-[0m[0mcoefficients[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m/[0m[0mcoefficients[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m

[0;31mAttributeError[0m: 'PipelineModel' object has no attribute 'coefficients'