In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, IndexToString, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
# Source file shared by every model in this notebook
file_location = "/FileStore/tables/airbnb.csv"
file_type = "csv"

# CSV reader settings (passed to Spark as string-valued options)
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

csv_reader = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
)
df = csv_reader.load(file_location)

# Discard every row that contains a null in any column
df = df.dropna()


def _make_indexer(column):
    """Build a StringIndexer that maps `column` to `<column>_index`.

    handleInvalid='keep' is set on every indexer; per the Spark docs this
    assigns invalid/unseen values to an extra index instead of raising.
    """
    return StringIndexer(inputCol=column, outputCol=column + '_index', handleInvalid='keep')


# Spark ML needs categorical columns encoded as numbers, so each string
# column gets its own StringIndexer producing a numeric `*_index` column.
neighbourhood_cleansed_indexer = _make_indexer('neighbourhood_cleansed')
property_type_indexer = _make_indexer('property_type')
room_type_indexer = _make_indexer('room_type')
bed_type_indexer = _make_indexer('bed_type')
cancellation_policy_group_indexer = _make_indexer('cancellation_policy')
price_category_indexer = _make_indexer('price_category')

# Columns merged into the single 'features' vector: the indexed categorical
# columns first, then the numeric columns as-is.
# price_category / price_category_index and the other price columns are left
# out on purpose: with the target available as an input the classifier would
# simply key on it and predict perfectly. The model must rely on the OTHER fields.
feature_columns = [
    'neighbourhood_cleansed_index',
    'property_type_index',
    'room_type_index',
    'bed_type_index',
    'cancellation_policy_index',
    'host_is_superhost',
    'host_identity_verified',
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'Number_of_amenities',
    'guests_included',
    'price_per_extra_person',
    'minimum_nights',
    'number_of_reviews',
    'number_days_btw_first_last_review',
    'review_scores_rating',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Preparation pipeline: the six StringIndexers followed by the assembler.
# Fitting it fits each indexer in order; the assembler is a pure transformer.
# (No classifier is part of this pipeline - it only prepares the data.)
prep_stages = [
    neighbourhood_cleansed_indexer,
    property_type_indexer,
    room_type_indexer,
    bed_type_indexer,
    cancellation_policy_group_indexer,
    price_category_indexer,
    assembler,
]
pipe = Pipeline(stages=prep_stages)
indexed_data = pipe.fit(df)  # despite the name, this is the fitted PipelineModel
final_data = indexed_data.transform(df)

# Final prepped data, including the `*_index` columns and the features vector
display(final_data)

host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Number_of_amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category,neighbourhood_cleansed_index,property_type_index,room_type_index,bed_type_index,cancellation_policy_index,price_category_index,features
0,1,Roslindale,42.28624082,-71.13437396,Apartment,Private room,2,1.0,1,1,Real Bed,20,0,0,2,36,804,94,moderate,65,0,lte_$75,16.0,0.0,1.0,0.0,1.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 42.28624082, -71.13437396, 2.0, 1.0, 1.0, 1.0, 20.0, 0.0, 0.0, 2.0, 36.0, 804.0, 94.0))"
1,1,Roslindale,42.29243789,-71.13576525,Apartment,Private room,2,1.0,1,1,Real Bed,17,1,20,3,41,2574,98,moderate,65,0,lte_$75,16.0,0.0,1.0,0.0,1.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 42.29243789, -71.13576525, 2.0, 1.0, 1.0, 1.0, 17.0, 1.0, 20.0, 3.0, 41.0, 2574.0, 98.0))"
0,0,Roslindale,42.28110619,-71.12102117,House,Private room,4,1.0,1,2,Real Bed,22,2,25,1,1,0,100,moderate,75,0,lte_$75,16.0,1.0,1.0,0.0,1.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 42.28110619, -71.12102117, 4.0, 1.0, 1.0, 2.0, 22.0, 2.0, 25.0, 1.0, 1.0, 0.0, 100.0))"
1,1,Roslindale,42.28451221,-71.13625805,House,Private room,2,1.5,1,2,Real Bed,13,1,0,2,29,380,99,flexible,79,0,btw_$75-$150,16.0,1.0,1.0,0.0,2.0,0.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 1.0, 1.0, 0.0, 2.0, 1.0, 1.0, 42.28451221, -71.13625805, 2.0, 1.5, 1.0, 2.0, 13.0, 1.0, 0.0, 2.0, 29.0, 380.0, 99.0))"
1,1,Roslindale,42.2916898,-71.13189277,Condominium,Private room,2,1.0,1,1,Real Bed,12,1,0,2,8,130,100,flexible,75,0,lte_$75,16.0,2.0,1.0,0.0,2.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 2.0, 1.0, 0.0, 2.0, 1.0, 1.0, 42.2916898, -71.13189277, 2.0, 1.0, 1.0, 1.0, 12.0, 1.0, 0.0, 2.0, 8.0, 130.0, 100.0))"
0,1,Roslindale,42.28138963,-71.13119042,Apartment,Entire home/apt,3,1.0,1,2,Real Bed,12,1,25,1,57,421,90,strict,100,0,btw_$75-$150,16.0,0.0,0.0,0.0,0.0,0.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 42.28138963, -71.13119042, 3.0, 1.0, 1.0, 2.0, 12.0, 1.0, 25.0, 1.0, 57.0, 421.0, 90.0))"
1,1,Roslindale,42.2819461,-71.14102161,House,Private room,2,2.0,1,1,Real Bed,22,1,15,1,67,840,96,moderate,75,0,lte_$75,16.0,1.0,1.0,0.0,1.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 42.2819461, -71.14102161, 2.0, 2.0, 1.0, 1.0, 22.0, 1.0, 15.0, 1.0, 67.0, 840.0, 96.0))"
1,1,Roslindale,42.28587764,-71.12490956,Condominium,Private room,2,1.0,1,2,Real Bed,9,2,0,2,65,355,96,moderate,58,0,lte_$75,16.0,2.0,1.0,0.0,1.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 2.0, 1.0, 0.0, 1.0, 1.0, 1.0, 42.28587764, -71.12490956, 2.0, 1.0, 1.0, 2.0, 9.0, 2.0, 0.0, 2.0, 65.0, 355.0, 96.0))"
1,1,Roslindale,42.28882028,-71.1395101,Apartment,Entire home/apt,5,1.0,2,2,Real Bed,21,4,25,4,33,876,94,strict,229,1,gte_226,16.0,0.0,0.0,0.0,0.0,2.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 42.28882028, -71.1395101, 5.0, 1.0, 2.0, 2.0, 21.0, 4.0, 25.0, 4.0, 33.0, 876.0, 94.0))"
0,0,Roslindale,42.2864482,-71.13932539,House,Private room,2,1.0,1,1,Real Bed,15,1,10,1,1,0,80,flexible,60,0,lte_$75,16.0,1.0,1.0,0.0,2.0,3.0,"Map(vectorType -> dense, length -> 20, values -> List(16.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 42.2864482, -71.13932539, 2.0, 1.0, 1.0, 1.0, 15.0, 1.0, 10.0, 1.0, 1.0, 0.0, 80.0))"


In [0]:
# Keep only what the Decision Tree Classifier code needs:
# the indexed label column and the assembled features vector.
model_input_columns = ["price_category_index", "features"]
data = final_data.select(model_input_columns)
data.show()

+--------------------+--------------------+
|price_category_index|            features|
+--------------------+--------------------+
|                 3.0|[16.0,0.0,1.0,0.0...|
|                 3.0|[16.0,0.0,1.0,0.0...|
|                 3.0|[16.0,1.0,1.0,0.0...|
|                 0.0|[16.0,1.0,1.0,0.0...|
|                 3.0|[16.0,2.0,1.0,0.0...|
|                 0.0|[16.0,0.0,0.0,0.0...|
|                 3.0|[16.0,1.0,1.0,0.0...|
|                 3.0|[16.0,2.0,1.0,0.0...|
|                 2.0|[16.0,0.0,0.0,0.0...|
|                 3.0|[16.0,1.0,1.0,0.0...|
|                 3.0|[16.0,1.0,1.0,0.0...|
|                 0.0|[16.0,2.0,0.0,0.0...|
|                 0.0|[16.0,0.0,0.0,0.0...|
|                 0.0|[16.0,1.0,0.0,0.0...|
|                 3.0|[16.0,0.0,1.0,0.0...|
|                 1.0|[16.0,1.0,1.0,0.0...|
|                 3.0|[16.0,2.0,1.0,0.0...|
|                 3.0|[16.0,1.0,1.0,0.0...|
|                 3.0|[16.0,9.0,1.0,0.0...|
|                 0.0|[16.0,1.0,

In [0]:

# Fixed seed for the train/test split so that re-running the notebook on the
# same input yields the same partition (and therefore a comparable test error).
SPLIT_SEED = 42

# Index labels, adding metadata to the label column.
# Fit on the whole dataset so that every label is included in the index.
# NOTE(review): the "indexedLabel" column produced here is not read by the
# classifier or the evaluator below - both use "price_category_index" directly.
# The stage is kept so the layout of model.stages stays unchanged.
labelIndexer = StringIndexer(inputCol="price_category_index", outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
# maxCategories=26: features with more than 26 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=26
).fit(data)

# Split the data into training and test sets (30% held out for testing).
# The seed makes the split reproducible instead of changing on every run.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Configure the DecisionTree estimator (training happens in pipeline.fit below).
dt = DecisionTreeClassifier(labelCol="price_category_index", featuresCol="indexedFeatures")

# Chain the already-fitted indexers and the tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train the model on the training split.
model = pipeline.fit(trainingData)

# Make predictions on the held-out split.
predictions = model.transform(testData)

# Compare predictions with the true labels and report test error = 1 - accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="price_category_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

# The fitted tree is the last stage of the pipeline model
treeModel = model.stages[-1]
# summary only
print(treeModel)

Test Error = 0.449489 
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0d51a2c5b218, depth=5, numNodes=61, numClasses=5, numFeatures=20


In [0]:
# Dump the full if/else structure of the fitted decision tree
tree_debug_text = treeModel.toDebugString
print(tree_debug_text)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0d51a2c5b218, depth=5, numNodes=61, numClasses=5, numFeatures=20
  If (feature 2 in {0.0})
   If (feature 10 in {3.0,4.0,5.0,6.0,7.0,8.0})
    If (feature 0 in {1.0,2.0,4.0,6.0,7.0,10.0,12.0,14.0,15.0,17.0,21.0,22.0})
     If (feature 12 in {2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0})
      If (feature 19 <= 70.5)
       Predict: 1.0
      Else (feature 19 > 70.5)
       Predict: 2.0
     Else (feature 12 not in {2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0})
      If (feature 7 <= 42.33406298)
       Predict: 0.0
      Else (feature 7 > 42.33406298)
       Predict: 1.0
    Else (feature 0 not in {1.0,2.0,4.0,6.0,7.0,10.0,12.0,14.0,15.0,17.0,21.0,22.0})
     If (feature 12 in {2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0})
      If (feature 18 <= 316.0)
       Predict: 1.0
      Else (feature 18 > 316.0)
       Predict: 2.0
     Else (feature 12 not in {2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0})
      If (feature 17 <= 21.5)
       Predict: 0.0
      Else (feature 17 > 2