In [82]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
import os
import sys
import csv
import time

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

conf = SparkConf().setAppName("SparkTraining").setMaster("local[*]")
ctx = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("myApp") \
    .getOrCreate()

In [2]:
# Establish the connection. If this doesn't work, uncomment this and use local files (snippit below)
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("myApp") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1') \
    .getOrCreate()

opendata_leisureRDD = spark.read.format("mongo") \
    .option('uri', f"mongodb://10.4.41.97:27017/persistent.opendatabcn-leisure") \
    .load() \
    .rdd

opendata_incomeRDD = spark.read.format("mongo") \
    .option('uri', f"mongodb://10.4.41.97:27017/persistent.opendatabcn-income") \
    .load() \
    .rdd

idealista = spark.read.format("mongo") \
    .option('uri', f"mongodb://10.4.41.97:27017/persistent.idealista") \
    .load() \

lookupRDD = spark.read.format("mongo") \
    .option('uri', f"mongodb://10.4.41.97:27017/persistent.lookup_tables") \
    .load() \
    .rdd

In [83]:
# Use the local files if the connection doesn't work
idealista = ctx.read.json('./json/idealista.json')
lookupRDD = ctx.read.json('./json/lookup.json').rdd
opendata_incomeRDD = ctx.read.json('./json/opendata_income.json').rdd
opendata_leisureRDD = ctx.read.json('./json/opendata_leisure.json').rdd

In [4]:
# Save the schema of idealista to be later used when building a df out of the transformed RDD
idealistaSchema = idealista.schema

In [84]:
def flatten(t):
    """
    Transforms a nested tuple into a single flattened tuple, reorganizing the data for a specific structure.
    
    The resulting tuple is organized as follows:
    (propertyID, neighbourhoodID, idealistaData (with each feature appended), leisureDict, IncomeDict).
    
    Parameters:
    t (tuple): The nested tuple to be flattened.
    
    Returns:
    tuple: A flattened tuple with values ordered as specified.
    """
    val = [t[1][0][0], t[0][0], t[0][1]]
    for v in t[1][0][1]:
        val.append(v)
    return tuple(val + [t[1][1][0], t[1][1][1]])


In [85]:
def partition_hash_neighbourhood_id(id):
    """
    Computes a hash partition based on the first character of the given identifier, dividing the data into two partitions.
    
    The function extracts the first character of the identifier, treats it as an integer, and calculates its modulo 2 
    to determine which partition it belongs to. This way, it categorizes the data into two partitions, facilitating
    parallel processing or grouped operations later on.
    
    Parameters:
    id (str): The identifier whose first character is to be used for partitioning.
    
    Returns:
    int: The partition number (0 or 1) the identifier belongs to.
    """
    val = int(id[:1])
    return val % 2


In [86]:
def x_later_than_y(x_date, y_date):
    xy, xm, xd = x_date.split("_")
    yy, ym, yd = y_date.split("_")
    if yy > xy: return False
    elif ym > xm: return False
    elif yd > xd: return False
    else: return True


def reconcile_idealista(x,y):
    if x_later_than_y(x[1], y[1]):
        return x
    else: return y

In [112]:
def get_partition_id(id, n=2):
    val = hash(id)
    return val % n

In [113]:
def merge_income_dict_count(x,y):
    out = {}
    xk = list(x.keys())
    yk = list(y.keys())
    for key in set(xk + yk):
        if key in xk and key in yk:
            out[key] = x[key] + y[key]
        elif key in yk:
            out[key] = y[key]
        elif key in xk:
            out[key] = x[key]
    return out

In [120]:
luRDD = lookupRDD.map(lambda t: (t['neighborhood'], (t['neighborhood_id'], t['neighborhood_n_reconciled'])))\
    .reduceByKey(lambda x, y: x)
    ##select the neighbourhood, the reconciled neighbourhood namme, and the numerical identifier.
    ##Remove all duplicates from the lookup table

openIRDD = opendata_incomeRDD.map(lambda t: (t['Nom_Barri'], (t['Any'], t['Índex RFD Barcelona = 100'])))\
    .filter(lambda t: t[0] != "No consta")\
    .partitionBy(2, lambda k: get_partition_id(k[0]))\
    .join(luRDD)\
    .map(lambda t: (t[1][1], {t[1][0][0]: t[1][0][1]}))\
    .reduceByKey(lambda x, y: {**x, **y})\
    .partitionBy(2, lambda k: get_partition_id(k[0]))
    ## Select the name of the neighbourhood, the year and, the measured income level
    ## Filter out all the "No consta" neighbourhoods as they do not contain data.
    ## Join the data with the lookup table to get the neighborhood ID.
    ## Make the neighborhood ID and the reconciled name the key of the tuple, while also keeping the year and income as value as dict entries.
    ## mapValues not possible as key is being changed
    ## Combine all dictionairies for each neigbhorhood.


openLRDD = opendata_leisureRDD.map(lambda t: (t['addresses_neighborhood_name'],
                                              (t['secondary_filters_name'], 1)))\
    .filter(lambda t: t[0] != '')\
    .partitionBy(2, lambda k: get_partition_id(k[0]))\
    .join(luRDD)\
    .map(lambda t: ((t[1][1][0], t[1][1][1]), {str(t[1][0][0]): t[1][0][1]}))\
    .reduceByKey(lambda x, y: merge_income_dict_count(x, y))\
    .partitionBy(2, lambda k: get_partition_id(k[0]))
    ## It doesn't make sense to filter for 'created' as it does not reflect the true moment of when the amenity was built.
    ## Select the neigbhorhood, the type of amenity as a key, and count 1 for that amenity.
    ## filter out all neighborhoods with values ''
    ## Immediately join with the lookup table so the later operations are easier
    ## filter out all the neighborhoods in the lookup table that were not in the leisure data. Opposite does not have to happen
    ## sice we are doing a leftOuterJoin.
    ## Rearrange the data so that the key is (ID, reconciled name, amenity) and the value is 1.
    ## mapValues not used as we are changing keys
    ## Count the amount of amenities per neighborhood by doing a reduceKey on (neighborhood, amenity) as key.

joinOpenRDD = openLRDD.fullOuterJoin(openIRDD)\
    .mapValues(lambda t: ({} if t[0] == None else t[0], t[1]))\
    .mapValues(lambda t: (t[0], {} if t[1] == None else t[1]))\
    .partitionBy(2, lambda k: get_partition_id(k[0]))\
    .cache()
    ## Join both the open data sources on the neighbourhood ID
    ## As we are doing an FullJoin we have to account for some keys not appearing in either set
    ## Thus we set any None values to empty dictionairies.
    ## Use mapValues as we are not changing which keeps the partition information valid, helping for the join in idealistaRDD

ilRDD = idealista.rdd.map(lambda t: (t['propertyCode'], (t['neighborhood'], t['scrap_date'], t[1:])))\
    .reduceByKey(lambda x, y: reconcile_idealista(x, y))\
    .map(lambda t: (t[1][0], (t[0], t[1][2])))\
    .filter(lambda t: isinstance(t[0], str))\
    .partitionBy(2, lambda k: get_partition_id(k[0]))\
    .join(luRDD)\
    .map(lambda t: (t[1][1], t[1][0]))\
    .partitionBy(2, lambda k: get_partition_id(k[0]))\
    .join(joinOpenRDD)\
    .map(lambda t: flatten(t)).cache()
    ## Create rows with as a key the property ID and save the neighbourhood name
    ## Reduce by key on the property ID to remove duplicates (the one with latest date is kept as reconciliation)
    ## Map the key to be the neighbourhood name so it can be joined with the lookup data
    ## join with the lookup table
    ## Map the key so that it is (neighbourhoodID, neighbourhoodName) reconciled ofcourse...
    ## join with the opendata both Leisure and Income
    ## flatten all values to a single non-nested tuple
## replace duplicate with check on latest scraping data.

In [121]:
##Run first part of the pipeline and cache it. Otherwise, the python worker crashes :3
joinOpenRDD.take(1)

[(('Q3298502', 'Montbau'),
  ({'Biblioteques': 3,
    "Sales d'estudi": 3,
    'Biblioteques municipals': 1,
    'Àrees de jocs infantils': 4,
    "Casals d'avis": 1,
    'Parcs i jardins': 4,
    'WiFi BCN': 1},
   {2007: '85.5',
    2008: '88.6',
    2009: '80.0',
    2010: '82.2',
    2011: '71.1',
    2012: '76.4',
    2013: '71.5',
    2014: '70.0',
    2015: '72.3',
    2016: '82.2',
    2017: '79.8'}))]

In [123]:
from pyspark.sql.types import StringType, StructType, StructField
schema_list = [StructField("propertyCode", StringType(), False)] #Not nullable as it is an ID
schema_list.append(StructField("NeighbourhoodID", StringType(), True)) #Nullable, but should not have any null values due to the pipeline
schema_list.append(StructField("NeighbourhoodName", StringType(), True)) #Nullable, but should not have any null values due to the pipeline
for i, field in enumerate(idealistaSchema):
    if i == 0:
        pass
    else:
        schema_list.append(field)
schema_list.append(StructField("LeisureDict", StringType(), True))#Nullable, but should not have any null values due to the pipeline (at most an empty dict)
schema_list.append(StructField("IncomeDict", StringType(), True))#Nullable, but should not have any null values due to the pipeline (at most an empty dict)
schema = StructType(schema_list)
df = ctx.createDataFrame(data=ilRDD.collect(), schema=schema)

1 --->  StructField(address,StringType,true)
2 --->  StructField(bathrooms,LongType,true)
3 --->  StructField(country,StringType,true)
4 --->  StructField(detailedType,StructType(List(StructField(subTypology,StringType,true),StructField(typology,StringType,true))),true)
5 --->  StructField(distance,StringType,true)
6 --->  StructField(district,StringType,true)
7 --->  StructField(exterior,BooleanType,true)
8 --->  StructField(externalReference,StringType,true)
9 --->  StructField(floor,StringType,true)
10 --->  StructField(has360,BooleanType,true)
11 --->  StructField(has3DTour,BooleanType,true)
12 --->  StructField(hasLift,BooleanType,true)
13 --->  StructField(hasPlan,BooleanType,true)
14 --->  StructField(hasStaging,BooleanType,true)
15 --->  StructField(hasVideo,BooleanType,true)
16 --->  StructField(latitude,DoubleType,true)
17 --->  StructField(longitude,DoubleType,true)
18 --->  StructField(municipality,StringType,true)
19 --->  StructField(neighborhood,StringType,true)
20 ---> 

In [None]:
## KPI 1; information rating of the listing, defined as the summed "commonness" of the provided information: sum(hasVideo, has360, hasPlan, has3Dtour, hasStaging, numPhotos, showAdress).
## Where hasVideo: if the listing has a video / by the total amount of listings with a video
## has360: if the listing has a video / by the total amount of listings with a 360
## hasPlan: if the listing has a video / by the total amount of listings with a plan
## has3Dtour: if the listing has a video / by the total amount of listings with a 3Dtour
## hasStaging: if the listing has a video / by the total amount of listings with a Staging
## numPhotos: numPhotos of the listing / by the average amount of photos for all listings
## showAdress: if the listing shows the adress / by the total amount of listings that show the adress

## also capture the neighbourhood name and the price in order to join them and make meaningful plots in Tableau

def set_value(avg, t, feature):
    if t[feature] == True:
        avg['total_' + feature] = 1
    else: avg['total_' + feature] = 0
    return avg

def init_averages(t):
    avg = {}
    avg = set_value(avg, t, "hasVideo")
    avg = set_value(avg, t, "has360")
    avg = set_value(avg, t, "hasPlan")
    avg = set_value(avg, t, "has3DTour")
    avg = set_value(avg, t, "hasStaging")
    avg = set_value(avg, t, "showAddress")
    avg["avgNumPhotos"] = t["numPhotos"]
    avg["count"] = 1
    return avg

def calc_totals(x, y):
    out = {}
    for key in x.keys():
        out[key] = x[key] + y[key]
    return out

def calc_averages(t):
    out = t
    out["avgNumPhotos"] = out['avgNumPhotos'] / out['count']
    return out

def calc_kpi1(t):
    kpi1 = 0
    for key in t[1][0][0].keys():
        kpi1 += t[1][0][0][key]/t[1][1][key]
    return (t[1][0][1], kpi1, t[1][0][2], t[1][0][3])

KPI1rdd = df.rdd.map(lambda t: ('key', (init_averages(t), t['propertyCode'], t['NeighbourhoodName'], t['price'])))\
    .cache()

averages = KPI1rdd.mapValues(lambda t: t[0])\
    .reduceByKey(lambda x, y: calc_totals(x, y))\
    .mapValues(lambda t: calc_averages(t)).cache()

KPI1rdd = KPI1rdd.join(averages)\
    .map(lambda t: calc_kpi1(t))

kpi1 = KPI1rdd.collect()

features = ['PropertyID', 'InformationScore', 'District', 'Price']
kpi1.sort()
with open('KPIs/kpi1.csv', 'w') as f:
    write = csv.writer(f, lineterminator = '\n')
    write.writerow(features)
    write.writerows(kpi1)

In [None]:
##KPI 2; amenities in the neighbourhood divided by the average listing price, both for selling and renting ... if it would exist D;

def leisure_str2amount(lstr):
    amount = 0
    if lstr != '{}':
        splitted = lstr.split(',')
        for item in splitted:
            item = item[1:]
            if "}" in item:
                item = item[:-1]
            items = item.split("=")
            amount += int(items[1])
    return amount

def init_kpi2(t):
    return ((t['NeighbourhoodName'], t['operation']), (t['price'], leisure_str2amount(t['LeisureDict']), 1))

KPI2rdd = df.rdd.map(lambda t: init_kpi2(t))\
    .reduceByKey(lambda x, y: (x[0]+y[0], x[1], x[2]+y[2]))\
    .map(lambda t: (t[0][0], t[0][1], (t[1][0]/t[1][2])/(t[1][1] + 1)))
## adding 1 to avoid devision by zero

kpi2 = KPI2rdd.collect()

features = ['District', 'Type of offer', 'price to leisure']
kpi2.sort()
with open('KPIs/kpi2.csv', 'w') as f:
    write = csv.writer(f, lineterminator = '\n')
    write.writerow(features)
    write.writerows(kpi2)

In [None]:
##KPI 3; Amount of listings posted per month per municipality

KPI3rdd = df.rdd.map(lambda t: ((t['NeighbourhoodName'], t['scrap_date'][:-3]), 1))\
    .reduceByKey(lambda x, y: x+y)\
    .map(lambda t: (t[0][0], t[0][1], t[1]))

kpi3 = KPI3rdd.collect()

features = ['District', 'month', 'listings']
kpi3.sort()
with open('KPIs/kpi3.csv', 'w') as f:
    write = csv.writer(f, lineterminator = '\n')
    write.writerow(features)
    write.writerows(kpi3)

## Modeling 

In [None]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
train,test=df.randomSplit([0.7,0.3])

indexer = StringIndexer(inputCols=["NeighbourhoodID"], 
                        outputCols=["NeighbourhoodNum"],
                        handleInvalid='keep')

ohe = OneHotEncoder(inputCols=["NeighbourhoodNum"]
                        , outputCols=["NeighbourhoodOhe"],
                        handleInvalid='keep')

vec_assembler = VectorAssembler(inputCols=['NeighbourhoodOhe', 'price'], outputCol='feature', handleInvalid='keep')

model = GeneralizedLinearRegression(featuresCol='feature', labelCol='size', family="gaussian", link="identity", maxIter=50, regParam=0.1)

pipeline = Pipeline(stages=[indexer, ohe, vec_assembler, model])

paramGrid = ParamGridBuilder() \
    .addGrid(model.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]) \
        .build()
    
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="size", predictionCol="prediction", metricName='r2'),
                          numFolds=5)  # use 3+ folds in practice

cvModel = crossval.fit(train)

results = cvModel.transform(test)
evaluator = RegressionEvaluator(labelCol="size", predictionCol="prediction", metricName='r2')
r2 = evaluator.evaluate(results)
print("r2 = %s" % (r2))

cvModel.write().overwrite().save('pipeline_cv')

In [None]:
cvModel.avgMetrics

In [None]:
list(zip(cvModel.avgMetrics, paramGrid))

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'
spark = SparkSession.builder.master("local[*]").appName('test').getOrCreate()

from pyspark.ml.tuning import CrossValidatorModel
cvModel = CrossValidatorModel.load("pipeline_cv")

df = spark \
    .readStream \
    .format('kafka') \
    .option('kafka.bootstrap.servers', "venomoth.fib.upc.edu:9092") \
    .option('subscribe', 'bdm_p2') \
    .load() \
    .selectExpr("CAST(value AS STRING)")

In [None]:
from pyspark.sql.types import DoubleType

value_split = df.withColumn('time', split(df['value'], ',').getItem(0)) \
       .withColumnRenamed('NeighborhoodId', 'NeighbourhoodID') \
       .withColumn('NeighbourhoodID', split(df['value'], ',').getItem(1)) \
       .withColumn('price', split(df['value'], ',').getItem(2).cast(DoubleType()).alias("price")) \
       .select('time', 'NeighbourhoodID', 'price') \
       
predict = cvModel.transform(value_split).select("time","NeighbourhoodId", "price", "prediction")

predict \
    .writeStream \
    .queryName("predict") \
    .format("memory") \
    .start()

In [None]:
spark.sql("select * from predict").show()

In [None]:
spark.streams.active[0].stop()

## Modeling 

In [None]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
df

DataFrame[propertyCode: string, NeighbourhoodID: string, NeighbourhoodName: string, address: string, bathrooms: bigint, country: string, detailedType: struct<subTypology:string,typology:string>, distance: string, district: string, exterior: boolean, externalReference: string, floor: string, has360: boolean, has3DTour: boolean, hasLift: boolean, hasPlan: boolean, hasStaging: boolean, hasVideo: boolean, latitude: double, longitude: double, municipality: string, neighborhood: string, newDevelopment: boolean, newDevelopmentFinished: boolean, numPhotos: bigint, operation: string, parkingSpace: struct<hasParkingSpace:boolean,isParkingSpaceIncludedInPrice:boolean,parkingSpacePrice:double>, price: double, priceByArea: double, propertyCode: string, propertyType: string, province: string, rooms: bigint, scrap_date: string, showAddress: boolean, size: double, status: string, suggestedTexts: struct<subtitle:string,title:string>, thumbnail: string, topNewDevelopment: boolean, url: string, LeisureDi

In [None]:
train,test=df.randomSplit([0.7,0.3])

indexer = StringIndexer(inputCols=["NeighbourhoodID"], 
                        outputCols=["NeighbourhoodNum"],
                        handleInvalid='keep')

ohe = OneHotEncoder(inputCols=["NeighbourhoodNum"]
                        , outputCols=["NeighbourhoodOhe"],
                        handleInvalid='keep')

vec_assembler = VectorAssembler(inputCols=['NeighbourhoodOhe', 'price'], outputCol='feature', handleInvalid='keep')

model = GeneralizedLinearRegression(featuresCol='feature', labelCol='size', family="gaussian", link="identity", maxIter=50, regParam=0.1)

pipeline = Pipeline(stages=[indexer, ohe, vec_assembler, model])

paramGrid = ParamGridBuilder() \
    .addGrid(model.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]) \
        .build()
    
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="size", predictionCol="prediction", metricName='r2'),
                          numFolds=5)  # use 3+ folds in practice

cvModel = crossval.fit(train)

results = cvModel.transform(test)
evaluator = RegressionEvaluator(labelCol="size", predictionCol="prediction", metricName='r2')
r2 = evaluator.evaluate(results)
print("r2 = %s" % (r2))

cvModel.write().overwrite().save('pipeline_cv')

22/06/19 15:58:55 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
22/06/19 15:58:57 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/06/19 15:58:57 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/06/19 15:58:57 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


r2 = 0.6892041110109803


                                                                                

In [None]:
cvModel.avgMetrics

[0.7449440412363395,
 0.7449907324497944,
 0.7450483870901407,
 0.7451613519451592,
 0.7452712258669327,
 0.7453780502380525,
 0.7454818657906924]

In [None]:
list(zip(cvModel.avgMetrics, paramGrid))

[(0.7449440412363395,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularization parameter (>= 0).'): 0.01}),
 (0.7449907324497944,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularization parameter (>= 0).'): 0.05}),
 (0.7450483870901407,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularization parameter (>= 0).'): 0.1}),
 (0.7451613519451592,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularization parameter (>= 0).'): 0.2}),
 (0.7452712258669327,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularization parameter (>= 0).'): 0.3}),
 (0.7453780502380525,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularization parameter (>= 0).'): 0.4}),
 (0.7454818657906924,
  {Param(parent='GeneralizedLinearRegression_a59ed1f74940', name='regParam', doc='regularizati

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'
spark = SparkSession.builder.master("local[*]").appName('test').getOrCreate()

from pyspark.ml.tuning import CrossValidatorModel
cvModel = CrossValidatorModel.load("pipeline_cv")

df = spark \
    .readStream \
    .format('kafka') \
    .option('kafka.bootstrap.servers', "venomoth.fib.upc.edu:9092") \
    .option('subscribe', 'bdm_p2') \
    .load() \
    .selectExpr("CAST(value AS STRING)")

22/06/19 16:00:23 WARN Utils: Your hostname, enricvm resolves to a loopback address: 127.0.1.1; using 192.168.233.128 instead (on interface ens33)
22/06/19 16:00:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/enric/Programs/spark-3.2.1-bin-hadoop2.7/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/enric/.ivy2/cache
The jars for the packages stored in: /home/enric/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-806429d1-316d-40ee-a95a-633ec06c511d;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-loggi

In [None]:
from pyspark.sql.types import DoubleType

value_split = df.withColumn('time', split(df['value'], ',').getItem(0)) \
       .withColumnRenamed('NeighborhoodId', 'NeighbourhoodID') \
       .withColumn('NeighbourhoodID', split(df['value'], ',').getItem(1)) \
       .withColumn('price', split(df['value'], ',').getItem(2).cast(DoubleType()).alias("price")) \
       .select('time', 'NeighbourhoodID', 'price') \
       
predict = cvModel.transform(value_split).select("time","NeighbourhoodId", "price", "prediction")

predict \
    .writeStream \
    .queryName("predict") \
    .format("memory") \
    .start()

22/06/19 16:00:35 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6f5eee0d-fee0-4196-b9b6-457d8b019a84. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/06/19 16:00:35 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f34ff718220>

In [None]:
spark.sql("select * from predict").show()

+--------------------+---------------+--------+------------------+
|                time|NeighbourhoodId|   price|        prediction|
+--------------------+---------------+--------+------------------+
|2022-06-19 14:00:...|        Q980253|349953.0| 83.32995366118769|
|2022-06-19 14:00:...|       Q3296693|234947.0|57.669583005795374|
|2022-06-19 14:00:...|       Q2476184|259050.0|  85.3731262128205|
|2022-06-19 14:00:...|       Q1026658|474970.0| 102.2560562879289|
|2022-06-19 14:00:...|       Q1904302|680012.0|116.49956120502503|
|2022-06-19 14:00:...|       Q3596096|390061.0| 90.72801928422808|
|2022-06-19 14:00:...|       Q3297889|184940.0| 154.5006786817108|
|2022-06-19 14:00:...|       Q1026658|590068.0|116.99798714831985|
|2022-06-19 14:00:...|       Q3310216|198997.0|180.49572901418298|
|2022-06-19 14:00:...|       Q2442135|798907.0| 165.2781210963214|
|2022-06-19 14:00:...|       Q2476184|159007.0| 72.55946316857913|
|2022-06-19 14:01:...|       Q3297056|275066.0| 82.05351756964

In [None]:
spark.streams.active[0].stop()

IndexError: list index out of range