### Apartment price regression testing...


In [18]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint


def loadMongoRDD(spark, db, collection):
    '''
    Download data from mongodb and store it in RDD format
    '''

    dataRDD = spark.read.format("mongo") \
        .option('uri', f"mongodb://10.4.41.48/{db}.{collection}") \
        .load() \
        .rdd \
        .cache()

    return dataRDD


spark = SparkSession \
    .builder \
    .master(f"local[*]") \
    .appName("myApp") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1') \
    .getOrCreate()

rdd = loadMongoRDD(spark, db='formatted', collection='data')
rdd.take(1)
rdd.count()

                                                                                

3905

### Cheating with pyspark DF

In [113]:
idealistaDF = rdd.toDF().select('Price', 
                                'Floor',
                                'Bedrooms',
                                'Rooms',
                                'Size',
                                'PropertyType',
                                'District Code',
                                'Neighborhood Code').fillna('unavailable')
idealistaDF.printSchema()

root
 |-- Price: double (nullable = true)
 |-- Floor: string (nullable = false)
 |-- Bedrooms: long (nullable = true)
 |-- Rooms: long (nullable = true)
 |-- Size: double (nullable = true)
 |-- PropertyType: string (nullable = false)
 |-- District Code: string (nullable = false)
 |-- Neighborhood Code: string (nullable = false)



In [114]:
#   ##  import the required libraries
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

#   ##  gather the distinct values
distinct_values = idealistaDF.select("Floor")\
                    .distinct()\
                    .rdd\
                    .flatMap(lambda x: x).collect()

#   ##  for each of the gathered values create a new column 
for distinct_value in distinct_values:
    function = udf(lambda item: 
                   1 if item == distinct_value else 0, 
                   IntegerType())
    new_column_name = "Floor"+'_'+distinct_value
    idealistaDF = idealistaDF.withColumn(new_column_name, function(col("Floor")))

#   ##  gather the distinct values
distinct_values = idealistaDF.select("PropertyType")\
                    .distinct()\
                    .rdd\
                    .flatMap(lambda x: x).collect()

#   ##  for each of the gathered values create a new column 
for distinct_value in distinct_values:
    function = udf(lambda item: 
                   1 if item == distinct_value else 0, 
                   IntegerType())
    new_column_name = "PropertyType"+'_'+distinct_value
    idealistaDF = idealistaDF.withColumn(new_column_name, function(col("PropertyType")))

idealistaDF.printSchema()



root
 |-- Price: double (nullable = true)
 |-- Floor: string (nullable = false)
 |-- Bedrooms: long (nullable = true)
 |-- Rooms: long (nullable = true)
 |-- Size: double (nullable = true)
 |-- PropertyType: string (nullable = false)
 |-- District Code: string (nullable = false)
 |-- Neighborhood Code: string (nullable = false)
 |-- Floor_7: integer (nullable = true)
 |-- Floor_en: integer (nullable = true)
 |-- Floor_-1: integer (nullable = true)
 |-- Floor_11: integer (nullable = true)
 |-- Floor_3: integer (nullable = true)
 |-- Floor_8: integer (nullable = true)
 |-- Floor_st: integer (nullable = true)
 |-- Floor_16: integer (nullable = true)
 |-- Floor_43: integer (nullable = true)
 |-- Floor_5: integer (nullable = true)
 |-- Floor_6: integer (nullable = true)
 |-- Floor_9: integer (nullable = true)
 |-- Floor_bj: integer (nullable = true)
 |-- Floor_1: integer (nullable = true)
 |-- Floor_10: integer (nullable = true)
 |-- Floor_4: integer (nullable = true)
 |-- Floor_12: integer

                                                                                

In [115]:
len(idealistaDF.columns)

34

In [20]:
# Analyzing 'Floor'
floorRDD = rdd.map(lambda x: (x['Floor'], 1)).reduceByKey(lambda a,b: a+b)
floorRDD.take(100)

[('7', 100),
 ('5', 293),
 ('2', 525),
 (None, 507),
 ('3', 500),
 ('bj', 389),
 ('4', 360),
 ('1', 779),
 ('en', 150),
 ('6', 143),
 ('st', 6),
 ('8', 90),
 ('10', 24),
 ('9', 25),
 ('16', 1),
 ('11', 4),
 ('13', 4),
 ('43', 1),
 ('-1', 2),
 ('12', 2)]

In [21]:
# Analyzing 'Operation'
operationRDD = rdd.map(lambda x: (x['Operation'], 1)).reduceByKey(lambda a,b: a+b)
operationRDD.take(100)

[('sale', 3905)]

In [22]:
# Analyzing 'Bedrooms'
bedroomsRDD = rdd.map(lambda x: (x['Bedrooms'], 1)).reduceByKey(lambda a,b: a+b)
bedroomsRDD.take(100)

[(1, 1811),
 (0, 2),
 (3, 335),
 (2, 1550),
 (4, 121),
 (6, 21),
 (7, 14),
 (8, 6),
 (5, 44),
 (9, 1)]

In [78]:
# Analyzing 'Rooms'
roomsRDD = rdd.map(lambda x: (x['Rooms'], 1)).reduceByKey(lambda a,b: a+b)
roomsRDD.take(100)

[(3, 1450),
 (2, 953),
 (0, 68),
 (6, 67),
 (5, 248),
 (1, 271),
 (4, 789),
 (7, 22),
 (8, 14),
 (10, 13),
 (11, 2),
 (12, 1),
 (13, 2),
 (9, 4),
 (15, 1)]

In [23]:
# Analyzing 'District Code'
districtRDD = rdd.map(lambda x: (x['District Name'], 1)).reduceByKey(lambda a,b: a+b)
districtRDD.take(100)

[('Sant Martí', 1),
 ('Ciutat Vella', 347),
 ('Nou Barris', 32),
 ('Sant Andreu', 10),
 ('Sants-Montjuïc', 1567),
 ('Sarrià-Sant Gervasi', 352),
 ('Horta-Guinardó', 111),
 ('Les Corts', 495),
 ('Eixample', 896),
 ('Gràcia', 94)]

In [88]:
# Analyzing 'propertyType'
propertyRDD = rdd.map(lambda x: (x['PropertyType'], 1)).reduceByKey(lambda a,b: a+b)
propertyRDD.take(50)

[('flat', 3340),
 ('studio', 62),
 ('penthouse', 224),
 ('chalet', 155),
 ('duplex', 123),
 ('countryHouse', 1)]

In [116]:

labelRDD = idealistaDF.rdd.map(lambda x: LabeledPoint(x['Price'], [x["Bedrooms"], x["Rooms"], x["Size"], x["District Code"], 
                                                        x["Neighborhood Code"], x[8], x[9],
                                                        x[10], x[11], x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[19], 
                                                        x[20], x[21], x[22], x[23], x[24], x[25], x[26], x[27], x[28], x[29], 
                                                        x[30], x[31], x[32], x[33]]))


labelRDD.take(1)

[LabeledPoint(320000.0, [1.0,3.0,88.0,10.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0])]

In [103]:
labelRDD.count()

3905

In [117]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = labelRDD.randomSplit([0.7, 0.3], seed=42)

# Train a RandomForest model.
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, maxDepth=7, maxBins=300, seed=42)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())


Test Mean Squared Error = 208891452731.2823
Learned regression forest model:
TreeEnsembleModel regressor with 5 trees

  Tree 0:
    If (feature 0 <= 3.5)
     If (feature 2 <= 132.5)
      If (feature 0 <= 1.5)
       If (feature 3 <= 2.5)
        If (feature 27 <= 0.5)
         If (feature 29 <= 0.5)
          If (feature 2 <= 81.5)
           Predict: 324083.3333333333
          Else (feature 2 > 81.5)
           Predict: 522500.0
         Else (feature 29 > 0.5)
          If (feature 20 <= 0.5)
           Predict: 343681.2921348315
          Else (feature 20 > 0.5)
           Predict: 299730.0
        Else (feature 27 > 0.5)
         If (feature 18 <= 0.5)
          Predict: 143545.45454545456
         Else (feature 18 > 0.5)
          Predict: 349000.0
       Else (feature 3 > 2.5)
        If (feature 4 <= 18.5)
         If (feature 1 <= 1.5)
          If (feature 27 <= 0.5)
           Predict: 176583.75
          Else (feature 27 > 0.5)
           Predict: 143026.66666666666
    