In [1]:
from pyspark.sql import functions as f

In [2]:
countryCode = 'nl'

#### transform new OSM data

In [4]:
list_columns = ['operatorID','osmId','placeIdGoogle','ohubId',\
                            'name','address','postalCode','city','latitude','longitude','businessType','website','phone',\
                            'nameGoogle','addressGoogle','postalCodeGoogle','cityGoogle',\
                            'latitudeGoogle','longitudeGoogle','businessTypeGoogle','websiteGoogle',
                            'type','uid','nameOSM','addressOSM','postalCodeOSM','cityOSM',\
                            'latitudeOSM','longitudeOSM','businessTypeOSM','websiteOSM','cuisineTypeOSM',\
                            'nameOHUB','addressOHUB','postalcodeOhub','cityOHUB','concatID']

In [5]:
nl_osm_new = spark.table('dev_sources_osm.cleaned_total_operator_base').where(f.lower(f.col('countryCode'))== countryCode) \
                          .where(f.col('street').isNotNull()) \
                          .where(f.col('name').isNotNull()) \
                          .withColumn('operatorID', f.concat(f.col('osmId'),f.lit(' /  / '))) \
                          .withColumn('osmId',f.col('osmid').cast('string')) \
                          .withColumn('placeIdGoogle',f.lit(None).cast('string')) \
                          .withColumn('name',f.col('name')) \
                          .withColumn('address',(f.concat(f.when(f.col('street').isNotNull(),f.col('street')).otherwise(f.lit('')),f.lit(" "), \
                                                          f.when(f.col('housenumber').isNotNull(),f.col('housenumber')).otherwise(f.lit(''))))) \
                          .withColumn('postalCode',f.regexp_replace(f.when(f.col('postcode').isNotNull(),f.col('postcode')).otherwise(f.lit('')),'[^a-zA-Z0-9]','').cast('string')) \
                          .withColumn('city',f.col('city')) \
                          .withColumn('latitude',f.col('latitude').cast('float')) \
                          .withColumn('longitude',f.col('longitude').cast('float')) \
                          .withColumn('businessType',f.col('BusinessTypesList').cast('string')) \
                          .withColumn('website',f.col('website')) \
                          .withColumn('phone',f.col('phone')) \
                          .withColumn('nameGoogle',f.lit(None).cast('string')) \
                          .withColumn('addressGoogle',f.lit(None).cast('string')) \
                          .withColumn('postalCodeGoogle',f.lit(None).cast('string')) \
                          .withColumn('cityGoogle',f.lit(None).cast('string')) \
                          .withColumn('latitudeGoogle',f.lit(None).cast('string')) \
                          .withColumn('longitudeGoogle',f.lit(None).cast('string')) \
                          .withColumn('businessTypeGoogle',f.lit(None).cast('string')) \
                          .withColumn('websiteGoogle',f.lit(None).cast('string')) \
                          .withColumn('phoneGoogle',f.lit(None).cast('string')) \
                          .withColumn('ohubId',f.lit(None).cast('string'))\
                          .withColumn('nameOHUB',f.lit(None).cast('string'))\
                          .withColumn('addressOHUB',f.lit(None).cast('string'))\
                          .withColumn('postalcodeOhub',f.lit(None).cast('string'))\
                          .withColumn('cityOHUB',f.lit(None).cast('string'))\
                          .withColumn('concatID',f.lit(None).cast('string'))\
                          .withColumn('channelOHUB',f.lit(None).cast('string')) \
                          .select('operatorID',
                                  'osmId',
                                  'placeIdGoogle',
                                  'ohubid',
                                  'name',
                                  'address',
                                  'postalCode',
                                  'city',
                                  'latitude',
                                  'longitude',
                                  'businessType',
                                  'website',
                                  'phone',
                                  'nameGoogle',
                                  'addressGoogle',
                                  'postalCodeGoogle',
                                  'cityGoogle',
                                  'latitudeGoogle',
                                  'longitudeGoogle',
                                  'businessTypeGoogle',
                                  'websiteGoogle',
                                  'type',                                                            
                                  'uid',
                                  f.col('name').alias('nameOSM'),
                                  f.col('address').alias('addressOSM'),
                                  f.col('postalCode').alias('postalCodeOSM'),
                                  f.col('city').alias('cityOSM'),
                                  f.col('latitude').alias('latitudeOSM'),
                                  f.col('longitude').alias('longitudeOSM'),
                                  f.col('businessType').alias('businessTypeOSM'),
                                  f.col('website').alias('websiteOSM'),
                                  f.col('cuisineCleanList').cast('string').alias('cuisineTypeOSM'),
                                  'nameOHUB','addressOHUB','postalcodeOhub','cityOHUB','concatID')

In [6]:
nl_enriched = spark.table('dev_derived_ouniverse.output_total_universe_nl_enriched')
osm_id_list_in_universe = [str(row.osmid) for row in nl_enriched.where(f.col('osmid').isNotNull()).select("osmid").collect()]

print(len(osm_id_list_in_universe))

In [7]:
nl_osm_new = nl_osm_new.select(list_columns)
for col in nl_enriched.columns:
  if col not in nl_osm_new.columns:
    nl_osm_new = nl_osm_new.withColumn(col,f.lit(None).cast(nl_enriched.schema[col].dataType))
    
nl_osm_new = nl_osm_new.select(nl_enriched.columns)

### Check OSM part in Universe and OSM in current OSM data

In [9]:
checkinUniverse = nl_enriched.where(f.col('osmid').isNotNull()).join(nl_osm_new, on = 'osmId', how = 'left_anti')
checkinUniverse.count()

In [10]:
checkinOSM= nl_osm_new.join(nl_enriched.where(f.col('osmid').isNotNull()), on = 'osmId', how = 'left_anti')
checkinOSM.count()

In [11]:
display(nl_osm_new.where(f.col('name') == "Grand Cafe 't Elfde Gebod"))

operatorId,PlaceIDGoogle,osmId,ohubID,name,address,postalCode,city,Latitude,Longitude,businessType,Website,cuisineType,NameGoogle,AddressGoogle,PostalCodeGoogle,CityGoogle,LatitudeGoogle,LongitudeGoogle,businessTypeGoogle,WebsiteGoogle,type,uid,nameOSM,AddressOSM,PostalcodeOSM,cityOSM,LatitudeOSM,LongitudeOSM,cuisineTypeOSM,websiteOSM,concatID,nameOhub,postalcodeOhub,addressOhub,cityOhub,source,globalChannel,chain,buying,valueTier,valueTierDescription,d_buyer,id,keyword,uniqueProductCount,SAP_DEX_ID,is_visited,SAPID,cp_mobileNumber,phone,countryGoogle,InternationalPhoneNumberGoogle,uniqueID,take_away,isTargetUniverse,menuHasTerras,primary_topcategory,businessTypeOSM,channelOhub,menuDishes,globalListChannels,operatorLifeCycle_Stage,SALESREP


In [12]:
display(checkinUniverse)
# https://www.openstreetmap.org/node/2750544386
# Flora Boskoop


osmId,operatorId,PlaceIDGoogle,ohubID,name,address,postalCode,city,Latitude,Longitude,businessType,Website,cuisineType,NameGoogle,AddressGoogle,PostalCodeGoogle,CityGoogle,LatitudeGoogle,LongitudeGoogle,businessTypeGoogle,WebsiteGoogle,type,uid,nameOSM,AddressOSM,PostalcodeOSM,cityOSM,LatitudeOSM,LongitudeOSM,cuisineTypeOSM,websiteOSM,concatID,nameOhub,postalcodeOhub,addressOhub,cityOhub,source,globalChannel,chain,buying,valueTier,valueTierDescription,d_buyer,id,keyword,uniqueProductCount,SAP_DEX_ID,is_visited,SAPID,cp_mobileNumber,phone,countryGoogle,InternationalPhoneNumberGoogle,uniqueID,take_away,isTargetUniverse,menuHasTerras,primary_topcategory,businessTypeOSM,channelOhub,menuDishes,globalListChannels,operatorLifeCycle_Stage,SALESREP
1085657792,1085657792 / /,,,Grand Cafe 't Elfde Gebod,Koningin Wilhelmina Boulevard,2202GV,Noordwijk,52.247893000000005,4.4333085,pub,http://www.hotelvanoranje.nl/,Unknown,,,,,,,,,node,0,Grand Cafe 't Elfde Gebod,,,,52.247893000000005,4.4333085,,http://www.hotelvanoranje.nl/,,,,,,OSM,Cafe,Unknown,,10.0,,0.0,,,,,,,,,,,,False,1.0,True,,pub,,tonijn,"[Other, Pub, Cafe]",E1B0-low,Ron Quartel
1288186491,1288186491 / /,,,OBS Commissaris Gaarlandt,Rembrandtstraat,7948AT,Meppel,52.7332742,6.1666862,school,,Unknown,,,,,,,,,node,0,OBS Commissaris Gaarlandt,,,,52.7332742,6.1666862,,,,,,,,OSM,Schools - Unknown level,Unknown,,9.0,€0.0 - €36.0 (average value = 20.0),0.0,,,,,,,,,,,,False,1.0,,,school,,,"[Other, Schools - Unknown level]",E1B0-low,Geartsje Stegenga
1345799233,1345799233 / /,,,Bodega Bora Bora,Markt,5554CD,Valkenswaard,51.350611400000005,5.4585947,bar,,Unknown,,,,,,,,,node,0,Bodega Bora Bora,,,,51.350611400000005,5.4585947,,,,,,,,OSM,Other,Unknown,,10.0,,0.0,,,,,,,,,,,,False,0.0,,,bar,,,[Other],E0B0-low,Johan van Wageningen
1392117337,1392117337 / /,,,Paalkampeerterrein Klinkenbelt,Klinkenbeltweg,7448SC,,52.3724479,6.4284192,camp_site,,Unknown,,,,,,,,,node,0,Paalkampeerterrein Klinkenbelt,,,,52.3724479,6.4284192,,,,,,,,OSM,Campground,Unknown,,10.0,,0.0,,,,,,,,,,,,False,0.0,,,camp_site,,,"[Other, Campground]",E0B0-low,Danny van de Put
1476416995,1476416995 / /,,,Aloysius,Meester Postlaan,3155BM,Midden-Delfland,51.9340344,4.2729796,school,,Unknown,,,,,,,,,node,0,Aloysius,,,,51.9340344,4.2729796,,,,,,,,OSM,Schools - Unknown level,Unknown,,10.0,,0.0,,,,,,,,,,,,False,1.0,,,school,,,"[Other, Schools - Unknown level]",E1B0-low,Marc Bontekoe
1547889731,1547889731 / /,,,Polderstraat,Polderstraat,2952AK,,51.860169600000006,4.6611454000000005,fuel,,Unknown,,,,,,,,,node,0,Polderstraat,,,,51.860169600000006,4.6611454000000005,,,,,,,,OSM,Other,Unknown,,9.0,€0.0 - €36.0 (average value = 20.0),0.0,,,,,,,,,,,,False,0.0,,,fuel,,,[Other],E0B0-low,Marc Bontekoe
1551002291,1551002291 / /,,,XL&t,Zwanenburgseweg,4384LW,Vlissingen,51.4556807,3.5496652,restaurant,https://www.tenbzeeland.nl/restaurant,Unknown,,,,,,,,,node,0,XL&t,,,,51.4556807,3.5496652,,https://www.tenbzeeland.nl/restaurant,,,,,,OSM,Other Restaurant,Unknown,,10.0,,0.0,,,,,,,,,,,,False,1.0,,,restaurant,,,"[Other, Other Restaurant]",E1B0-low,Bert Timmerman
1587309820,1587309820 / /,,,Ter Linde,Ommerweg,7921TE,De Wolden,52.645704,6.429333700000001,hotel,,Unknown,,,,,,,,,node,0,Ter Linde,,,,52.645704,6.429333700000001,,,,,,,,OSM,Hotel,Unknown,,9.0,€0.0 - €36.0 (average value = 20.0),0.0,,,,,,,,,,,,False,1.0,,,hotel,,,"[Other, Hotel]",E1B0-low,Geartsje Stegenga
1604093241,1604093241 / /,,,De Samensprong,Dokter Beumerstraat,8433MH,Ooststellingwerf,53.0666784,6.3340373,school,,Unknown,,,,,,,,,node,0,De Samensprong,,,,53.0666784,6.3340373,,,,,,,,OSM,Schools - Unknown level,Unknown,,10.0,,0.0,,,,,,,,,,,,False,1.0,,,school,,,"[Other, Schools - Unknown level]",E1B0-low,Geartsje Stegenga
1659074049,1659074049 / /,,,Texaco,Centraleweg,4931NB,Geertruidenberg,51.697101,4.8549717,fuel,,Unknown,,,,,,,,,node,0,Texaco,Centraleweg,4931NB,Geertruidenberg,51.697101,4.8549717,,,,,,,,OSM,Other,Unknown,,4.0,€282.0 - €304.0 (average value = 294.0),0.0,,,,,,,,,,,,False,0.0,,,fuel,,,[Other],E0B0-low,Niels Behrens


#### update OSM in Universe

In [14]:
nl_osm_new.count()

In [15]:
nl_osm_new_not_in_universe = nl_osm_new.where(~f.col('osmid').isin(osm_id_list_in_universe))

In [16]:
display(checkinOSM.where(f.col('osmId').contains('restaurant')))

osmId,operatorId,PlaceIDGoogle,ohubID,name,address,postalCode,city,Latitude,Longitude,businessType,Website,cuisineType,NameGoogle,AddressGoogle,PostalCodeGoogle,CityGoogle,LatitudeGoogle,LongitudeGoogle,businessTypeGoogle,WebsiteGoogle,type,uid,nameOSM,AddressOSM,PostalcodeOSM,cityOSM,LatitudeOSM,LongitudeOSM,cuisineTypeOSM,websiteOSM,concatID,nameOhub,postalcodeOhub,addressOhub,cityOhub,source,globalChannel,chain,buying,valueTier,valueTierDescription,d_buyer,id,keyword,uniqueProductCount,SAP_DEX_ID,is_visited,SAPID,cp_mobileNumber,phone,countryGoogle,InternationalPhoneNumberGoogle,uniqueID,take_away,isTargetUniverse,menuHasTerras,primary_topcategory,businessTypeOSM,channelOhub,menuDishes,globalListChannels,operatorLifeCycle_Stage,SALESREP
95288389,,,,Meram,Pretoriusstraat 22,,Amsterdam,,,[restaurant],,,,,,,,,,,node,0,Meram,,,Amsterdam,,,[turkish],https://meram.nl/,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
251177434,,,,Wing Kee,Zeedijk 78,1012CR,Amsterdam,,,[restaurant],,,,,,,,,,,node,0,Wing Kee,,,Amsterdam,,,[chinese],,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
303274482,,,,The Mart,High Tech Campus 1,5656AE,Eindhoven,,,[restaurant],,,,,,,,,,,node,0,The Mart,,,Eindhoven,,,,https://www.hightechcampus.com/facilities/the-mart,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
303274485,,,,Alfresco,High Tech Campus 1,5656AE,Eindhoven,,,[restaurant],,,,,,,,,,,node,0,Alfresco,,,Eindhoven,,,,https://www.hightechcampus.com/facilities/alfresco/,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
303274488,,,,The Lounge,High Tech Campus 1,5656AE,Eindhoven,,,[restaurant],,,,,,,,,,,node,0,The Lounge,,,Eindhoven,,,,https://www.hightechcampus.com/facilities/the-lounge,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
303797681,,,,Nieuw Schaijk,Rijksweg 46,5374RB,Schaijk,,,[restaurant],,,,,,,,,,,node,0,Nieuw Schaijk,,,Schaijk,,,,https://www.nieuwschaijk.nl,,,,,,,,,,,,,,,,,,,,+31486461294,,,,,,,,[restaurant],,,,,
344917270,,,,De Oude Maas,Ossestraat 11,5367NE,Macharen,,,[restaurant],,,,,,,,,,,node,0,De Oude Maas,,,Macharen,,,[pancake],https://deoudemaas.nl,,,,,,,,,,,,,,,,,,,,+31412647890,,,,,,,,[restaurant],,,,,
347010429,,,,De Knip,Kniplaan 22,,Voorschoten,,,[restaurant],,,,,,,,,,,node,0,De Knip,,,Voorschoten,,,[regional],,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
349194015,,,,Tong Ah,Panweg 118,3705GG,Zeist,,,[restaurant],,,,,,,,,,,node,0,Tong Ah,,,Zeist,,,[chinese],,,,,,,,,,,,,,,,,,,,,,,,,,,,,[restaurant],,,,,
385126457,,,,Louis Hartloopercomplex,Tolsteegbrug 1,3511ZN,Utrecht,,,[restaurant],,,,,,,,,,,node,0,Louis Hartloopercomplex,,,Utrecht,,,,http://www.louishartloopercomplex.nl/,,,,,,,,,,,,,,,,,,,,+31 30 2320450,,,,,,,,[restaurant],,,,,


In [17]:
display(checkinUniverse.where(f.lower(f.col('name')).isin(['joelia', 'garam masala', 'de eenhoorn', 'het stadhuis', 'restaurant osteria'])))

osmId,operatorId,PlaceIDGoogle,ohubID,name,address,postalCode,city,Latitude,Longitude,businessType,Website,cuisineType,NameGoogle,AddressGoogle,PostalCodeGoogle,CityGoogle,LatitudeGoogle,LongitudeGoogle,businessTypeGoogle,WebsiteGoogle,type,uid,nameOSM,AddressOSM,PostalcodeOSM,cityOSM,LatitudeOSM,LongitudeOSM,cuisineTypeOSM,websiteOSM,concatID,nameOhub,postalcodeOhub,addressOhub,cityOhub,source,globalChannel,chain,buying,valueTier,valueTierDescription,d_buyer,id,keyword,uniqueProductCount,SAP_DEX_ID,is_visited,SAPID,cp_mobileNumber,phone,countryGoogle,InternationalPhoneNumberGoogle,uniqueID,take_away,isTargetUniverse,menuHasTerras,primary_topcategory,businessTypeOSM,channelOhub,menuDishes,globalListChannels,operatorLifeCycle_Stage,SALESREP
471037983,471037983 / /,,,Restaurant Osteria,Markt 2a,5492AB,Meierijstad,51.565047400000005,5.4611972,restaurant,,Unknown,,,,,,,,,node,0,Restaurant Osteria,,,,51.565047400000005,5.4611972,,,,,,,,OSM,Other Restaurant,Unknown,,10,,0,,,,,,,,,,,,False,1,,,restaurant,,,"[Other, Other Restaurant]",E1B0-low,Johan van Wageningen


In [18]:
print('current count from new osm', nl_osm_new.select('osmId').count(),'\n',
     'current count in universe nl', nl_enriched.where(f.col('osmid').isNotNull()).count(),'\n',
     'differentation', nl_osm_new_not_in_universe.count())

In [19]:
final = nl_enriched.union(nl_osm_new_not_in_universe)

#### Check the osm part in new universe

In [21]:
print('osm count after:', final.where(f.col('osmid').isNotNull()).count(), '\n',
     'osm count before', nl_enriched.where(f.col('osmid').isNotNull()).count())