In [54]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, Bucketizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, OneHotEncoder

def loadMongoDF(db, collection, host="10.4.41.48"):
    '''
    Load a MongoDB collection into a Spark DataFrame.

    Parameters
    ----------
    db : str
        Name of the MongoDB database to read from.
    collection : str
        Name of the collection inside `db`.
    host : str, optional
        Host (optionally `host:port`) of the MongoDB server. The default is
        the server this notebook was originally written against.

    Returns
    -------
    tuple
        `(dataDF, spark)`: the DataFrame read from MongoDB and the
        SparkSession that was used to read it.
    '''
    # getOrCreate() hands back the already-active session when there is one,
    # so calling this function from several cells reuses the same session.
    spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("myApp") \
        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1') \
        .getOrCreate()

    dataDF = spark.read.format("mongo") \
        .option('uri', f"mongodb://{host}/{db}.{collection}") \
        .load()

    return dataDF, spark


In [None]:
## --------------- KPI 3 DF ---------------
## --> predict number of rooms given a price and neighborhood id
## All preprocessing steps are chained with the classifier in a single Pipeline.
dataDF, spark = loadMongoDF(db='formatted', collection='data')
subsetDF = dataDF.select('Neighborhood Id', 'Price', 'Rooms') \
                .withColumnRenamed("Neighborhood Id","Neighborhood_ID")

# Index 'Neighborhood_ID'. Fitted on the whole dataset so that every
# neighborhood has an index, including ones that only land in the test split.
# This is a single fitted model (not a list) so it is a valid Pipeline stage.
featureIndexer = StringIndexer(inputCol="Neighborhood_ID", outputCol="Neighborhood_ID_INDEX").fit(subsetDF)

# One-hot encode the neighborhood index. Left unfitted: its input column only
# exists after featureIndexer has run, so the Pipeline must fit it.
single_col_ohe = OneHotEncoder(inputCol="Neighborhood_ID_INDEX", outputCol="Neighborhood_ID_OneHot")

# Label index built from 'Rooms'; fitted on the whole dataset to include all labels.
labelIndexer = StringIndexer(inputCol="Rooms", outputCol="indexedRooms").fit(subsetDF)

# Assemble the one-hot neighborhood vector and the price into one feature vector.
labelVector = VectorAssembler(inputCols=['Neighborhood_ID_OneHot', 'Price'], outputCol="indexedFeatures")

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split identical across re-runs.
(trainingDataDF, testDataDF) = subsetDF.randomSplit([0.7, 0.3], seed=42)

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedRooms", featuresCol="indexedFeatures", numTrees=10, seed=42)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

# Chain indexers, encoder, assembler and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, single_col_ohe, labelIndexer, labelVector, rf, labelConverter])

# Train model.  This also fits the one-hot encoder.
model = pipeline.fit(trainingDataDF)

# Make predictions.
predictions = model.transform(testDataDF)

# Select example rows to display: predicted rooms, true rooms, feature vector.
predictions.select("predictedLabel", "Rooms", "indexedFeatures").show(5)

In [69]:
## --------------- KPI 3 DF ---------------
## --> predict number of rooms given a price and neighborhood id
## Preprocessing is applied step by step to the DataFrame; only the classifier
## and the label converter are run through the Pipeline.
dataDF, spark = loadMongoDF(db='formatted', collection='data')
subsetDF = dataDF.select('Neighborhood Id', 'Price', 'Rooms') \
                .withColumnRenamed("Neighborhood Id","Neighborhood_ID")

# Indexing 'Neighborhood_ID'. The indexers are left unfitted; the pipeline
# below fits them on subsetDF (the frame loaded in this cell).
indexers = [StringIndexer(inputCol=column, outputCol=column+"_INDEX") for column in ['Neighborhood_ID']]

pipeline = Pipeline(stages=indexers)
modelDF = pipeline.fit(subsetDF).transform(subsetDF)

# One-Hot Encoding 'Neighborhood_ID'
single_col_ohe = OneHotEncoder(inputCol="Neighborhood_ID_INDEX", outputCol="Neighborhood_ID_OneHot")
modelDF_OneHot = single_col_ohe.fit(modelDF).transform(modelDF)

# Creating the label index from 'Rooms'. The fitted model is kept so its
# labels can be used to map predictions back to room counts.
labelIndexerModel = StringIndexer(inputCol="Rooms", outputCol="indexedRooms").fit(modelDF_OneHot)
modelDF = labelIndexerModel.transform(modelDF_OneHot)

# Feature vector from the one-hot neighborhood vector and the price.
modelDF = VectorAssembler(inputCols=['Neighborhood_ID_OneHot', 'Price'], outputCol="indexedFeatures") \
                .transform(modelDF)

modelDF = modelDF.select('Price', 'Rooms', 'Neighborhood_ID_OneHot', 'indexedRooms', 'indexedFeatures')

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split identical across re-runs.
(trainingDataDF, testDataDF) = modelDF.randomSplit([0.7, 0.3], seed=42)

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedRooms", featuresCol="indexedFeatures", numTrees=10, seed=42)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexerModel.labels)

# Chain forest and label converter in a Pipeline
pipeline = Pipeline(stages=[rf, labelConverter])

# Train model.
model = pipeline.fit(trainingDataDF)

# Make predictions.
predictions = model.transform(testDataDF)

# Select example rows to display: predicted rooms, true rooms, feature vector.
predictions.select("predictedLabel", "Rooms", "indexedFeatures").show(5)


Py4JJavaError: An error occurred while calling o2024.load.
: com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=10.4.41.48:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.execute(MongoClientDelegate.java:182)
	at com.mongodb.client.internal.MongoDatabaseImpl.executeCommand(MongoDatabaseImpl.java:194)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:163)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:158)
	at com.mongodb.spark.MongoConnector.$anonfun$hasSampleAggregateOperator$1(MongoConnector.scala:234)
	at com.mongodb.spark.MongoConnector.$anonfun$withDatabaseDo$1(MongoConnector.scala:171)
	at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
	at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
	at com.mongodb.spark.MongoConnector.hasSampleAggregateOperator(MongoConnector.scala:234)
	at com.mongodb.spark.rdd.MongoRDD.hasSampleAggregateOperator$lzycompute(MongoRDD.scala:221)
	at com.mongodb.spark.rdd.MongoRDD.hasSampleAggregateOperator(MongoRDD.scala:221)
	at com.mongodb.spark.sql.MongoInferSchema$.apply(MongoInferSchema.scala:68)
	at com.mongodb.spark.sql.DefaultSource.constructRelation(DefaultSource.scala:97)
	at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:50)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:221)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [35]:
## ---------- 'Price' Numeric to Categorical ----------
## only needed when classifying price from neighborhood and rooms (which would make more sense)

# Bucket edges: everything below 0, then 100,000-wide bands from 0 up to
# 1,500,000, then everything above. handleInvalid="keep" puts invalid
# values in an extra bucket instead of raising.
bucketizer = Bucketizer(
    splits=[-float("inf"), *range(0, 1_500_001, 100_000), float("inf")],
    inputCol="Price",
    outputCol="Price_bucket",
    handleInvalid="keep",
)
bucketed = bucketizer.transform(subsetDF)
bucketed.show(truncate=False)

+---------------+---------+-----+------------+
|Neighborhood_ID|Price    |Rooms|Price_bucket|
+---------------+---------+-----+------------+
|Q3320722       |115000.0 |3    |2.0         |
|Q3320722       |110000.0 |3    |2.0         |
|Q3320722       |115000.0 |3    |2.0         |
|Q3320722       |129000.0 |3    |2.0         |
|Q3294602       |320000.0 |3    |4.0         |
|Q6746220       |139500.0 |4    |2.0         |
|Q6746220       |215000.0 |4    |3.0         |
|Q6746220       |149000.0 |3    |2.0         |
|Q6746220       |176000.0 |3    |2.0         |
|Q6746220       |90000.0  |3    |1.0         |
|Q6746220       |39000.0  |1    |1.0         |
|Q1758503       |490000.0 |2    |5.0         |
|Q1758503       |495000.0 |0    |5.0         |
|Q1758503       |1125000.0|2    |12.0        |
|Q1758503       |250000.0 |2    |3.0         |
|Q1758503       |209000.0 |3    |3.0         |
|Q1758503       |190000.0 |0    |2.0         |
|Q1758503       |725000.0 |6    |8.0         |
|Q1758503    

In [None]:
# Train and evaluate a RandomForest that predicts 'Rooms' from
# 'Neighborhood_ID' and 'Price'. Expects `subsetDF` from an earlier cell.

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="Rooms", outputCol="indexedRooms").fit(subsetDF)

# 'Neighborhood_ID' is a string column, so it is indexed with StringIndexer
# (VectorIndexer only works on vector columns) and then one-hot encoded.
# Fit on whole dataset so neighborhoods seen only in the test split are known.
featureIndexer = StringIndexer(inputCol="Neighborhood_ID", outputCol="Neighborhood_ID_INDEX").fit(subsetDF)
featureEncoder = OneHotEncoder(inputCol="Neighborhood_ID_INDEX", outputCol="Neighborhood_ID_OneHot")

# Assemble the one-hot neighborhood vector and the price into one feature vector.
featureAssembler = VectorAssembler(inputCols=['Neighborhood_ID_OneHot', 'Price'], outputCol="indexedFeatures")

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split identical across re-runs.
(trainingDataDF, testDataDF) = subsetDF.randomSplit([0.7, 0.3], seed=42)

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedRooms", featuresCol="indexedFeatures", numTrees=10, seed=42)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

# Chain indexers, encoder, assembler and forest in a Pipeline
stages = [labelIndexer, featureIndexer, featureEncoder, featureAssembler, rf, labelConverter]
pipeline = Pipeline(stages=stages)

# Train model.  This also fits the one-hot encoder.
model = pipeline.fit(trainingDataDF)

# Make predictions.
predictions = model.transform(testDataDF)

# Select example rows to display: predicted rooms, true rooms, feature vector.
predictions.select("predictedLabel", "Rooms", "indexedFeatures").show(5)

# Select (prediction, true label) and compute test error.
# The label column is the one produced by labelIndexer.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedRooms", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Look the forest up by its position in `stages` rather than a hard-coded index.
rfModel = model.stages[stages.index(rf)]
print(rfModel)  # summary only