### Merging all parquet files at start of job...


In [3]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint


def loadMongoRDD(spark, db, collection, host="10.4.41.48"):
    '''
    Read a MongoDB collection through the "mongo" Spark data source and
    return it as an RDD on which ``.cache()`` has been called.

    Parameters
    ----------
    spark : session object; only its ``read`` attribute is used here.
    db : database name, interpolated into the connection URI.
    collection : collection name, interpolated into the connection URI.
    host : MongoDB host (``host`` or ``host:port``) placed in the URI.
        Defaults to the address that used to be hard-coded, so existing
        calls keep working unchanged.

    Returns
    -------
    The ``.rdd`` of the loaded DataFrame, after ``.cache()``.
    '''

    # Built once so the target of the read is easy to log or inspect.
    uri = f"mongodb://{host}/{db}.{collection}"

    dataRDD = spark.read.format("mongo") \
        .option('uri', uri) \
        .load() \
        .rdd \
        .cache()

    return dataRDD


# Build (or reuse) the Spark session for this notebook. The MongoDB connector
# is requested through 'spark.jars.packages'.
spark = (
    SparkSession.builder
    .master("local[*]")  # plain literal: the previous f-string had no placeholders
    .appName("myApp")
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
    .getOrCreate()
)

# Load the 'formatted.data' collection and peek at one record to see the schema.
rdd = loadMongoRDD(spark, db='formatted', collection='data')
rdd.take(1)

                                                                                

[Row(Bedrooms=1, Date='2020_09_12', District Code='10', District Name='Sant Martí', Floor='7', Increased POP=119.0, Increased RFD=-1.0, Latitude=41.4126507, Longitude=2.1788903, Monthly Price (€/month)=820.75, Monthly Price Increase=-42.5, Neighborhood="El Camp de l'Arpa del Clot", Neighborhood Code='64', Operation='sale', POP=38371.0, Price=320000.0, Property Code='88750888', PropertyType='flat', RFD=81.7, Rooms=3, Size=88.0, Surface Price (€/m2)=13.0, Surface Price Increase=-0.6, _id=Row(oid='6299e809d1aef32510fb7487'))]

In [4]:
# Number of rows per distinct 'Floor' value, as (value, count) pairs
floorRDD = (
    rdd
    .map(lambda row: (row['Floor'], 1))
    .reduceByKey(lambda left, right: left + right)
)
floorRDD.take(100)

[('7', 100),
 ('5', 293),
 ('2', 525),
 (None, 507),
 ('3', 500),
 ('bj', 389),
 ('4', 360),
 ('1', 779),
 ('en', 150),
 ('6', 143),
 ('st', 6),
 ('8', 90),
 ('10', 24),
 ('9', 25),
 ('16', 1),
 ('11', 4),
 ('13', 4),
 ('43', 1),
 ('-1', 2),
 ('12', 2)]

In [6]:
# Number of rows per distinct 'Operation' value, as (value, count) pairs
operationRDD = (
    rdd
    .map(lambda row: (row['Operation'], 1))
    .reduceByKey(lambda left, right: left + right)
)
operationRDD.take(100)

[('sale', 3905)]

In [7]:
# Number of rows per distinct 'Bedrooms' value, as (value, count) pairs
bedroomsRDD = (
    rdd
    .map(lambda row: (row['Bedrooms'], 1))
    .reduceByKey(lambda left, right: left + right)
)
bedroomsRDD.take(100)

[(1, 1811),
 (0, 2),
 (3, 335),
 (2, 1550),
 (4, 121),
 (6, 21),
 (7, 14),
 (8, 6),
 (5, 44),
 (9, 1)]

In [8]:
# Analyzing 'District Name' (the grouping key is the district name, not its code)
districtRDD = rdd.map(lambda x: (x['District Name'], 1)).reduceByKey(lambda a,b: a+b)
districtRDD.take(100)

[('Sant Martí', 1),
 ('Ciutat Vella', 347),
 ('Nou Barris', 32),
 ('Sant Andreu', 10),
 ('Sants-Montjuïc', 1567),
 ('Sarrià-Sant Gervasi', 352),
 ('Horta-Guinardó', 111),
 ('Les Corts', 495),
 ('Eixample', 896),
 ('Gràcia', 94)]

In [9]:
# Number of rows per distinct 'Neighborhood Code' value, as (value, count) pairs
neighborhoodRDD = (
    rdd
    .map(lambda row: (row['Neighborhood Code'], 1))
    .reduceByKey(lambda left, right: left + right)
)
neighborhoodRDD.take(100)

[('64', 1),
 ('1', 129),
 ('55', 4),
 ('57', 6),
 ('13', 231),
 ('63', 1),
 ('27', 53),
 ('15', 126),
 ('62', 1),
 ('36', 5),
 ('60', 2),
 ('20', 230),
 ('17', 226),
 ('37', 11),
 ('19', 213),
 ('33', 18),
 ('34', 8),
 ('21', 52),
 ('51', 4),
 ('52', 6),
 ('45', 10),
 ('6', 5),
 ('25', 107),
 ('18', 346),
 ('35', 10),
 ('38', 13),
 ('9', 298),
 ('2', 218),
 ('31', 31),
 ('10', 197),
 ('30', 10),
 ('26', 82),
 ('32', 28),
 ('50', 3),
 ('14', 137),
 ('8', 43),
 ('29', 2),
 ('49', 1),
 ('53', 1),
 ('28', 23),
 ('41', 3),
 ('11', 300),
 ('44', 3),
 ('5', 1),
 ('24', 32),
 ('23', 78),
 ('43', 14),
 ('16', 201),
 ('39', 29),
 ('7', 352)]

22/06/03 19:12:08 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 382256 ms exceeds timeout 120000 ms
22/06/03 19:12:08 WARN SparkContext: Killing executors is not supported by current scheduler.


In [234]:
# Build one LabeledPoint per record: label = 'Price', features = the fields below.
# Fields are read by name instead of by position, so the mapping no longer
# depends on the column order of the Row. The order of this list defines the
# feature indices (0..6) that the trained model reports.
FEATURE_FIELDS = [
    'District Code',             # previously x[2]
    'Neighborhood Code',         # previously x[12]
    'Bedrooms',                  # previously x[0]
    'Monthly Price (€/month)',   # previously x[9]
    'Monthly Price Increase',    # previously x[10]
    'Rooms',                     # previously x[19]
    'Size',                      # previously x[20]
]

labelRDD = rdd.map(
    lambda x: LabeledPoint(x['Price'], [x[field] for field in FEATURE_FIELDS])
)
labelRDD.take(1)

[LabeledPoint(320000.0, [10.0,64.0,1.0,820.75,-42.5,3.0,88.0])]

In [235]:
rdd.count()

3905

In [240]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.util import MLUtils

# Fixed seed so that the train/test split and the trained forest (and therefore
# the reported error) are the same on every run of this cell.
SEED = 42

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = labelRDD.randomSplit([0.7, 0.3], seed=SEED)

# Train a RandomForest model.
# NOTE(review): categoricalFeaturesInfo is empty, so features 0 and 1 (district
# and neighborhood codes) are presumably treated as continuous values - confirm
# this is intended.
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, maxDepth=5, maxBins=64,
                                    seed=SEED)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
# Pair each true label with its prediction: (label, prediction)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Mean of the squared differences over the held-out set
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
    float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())


Test Mean Squared Error = 75107496572.07039
Learned regression forest model:
TreeEnsembleModel regressor with 5 trees

  Tree 0:
    If (feature 2 <= 3.5)
     If (feature 3 <= 1132.54)
      If (feature 2 <= 1.5)
       If (feature 6 <= 70.5)
        If (feature 3 <= 1009.55)
         Predict: 214376.17199391173
        Else (feature 3 > 1009.55)
         Predict: 380735.29411764705
       Else (feature 6 > 70.5)
        If (feature 6 <= 165.5)
         Predict: 310982.6396226415
        Else (feature 6 > 165.5)
         Predict: 660937.5
      Else (feature 2 > 1.5)
       If (feature 6 <= 114.5)
        If (feature 3 <= 942.0999999999999)
         Predict: 367645.03673469386
        Else (feature 3 > 942.0999999999999)
         Predict: 506149.06832298136
       Else (feature 6 > 114.5)
        If (feature 2 <= 2.5)
         Predict: 604659.1875
        Else (feature 2 > 2.5)
         Predict: 867655.9139784946
     Else (feature 3 > 1132.54)
      If (feature 2 <= 2.5)
       If (f