### Mapping Table OHubID - SAP_ID

In [2]:
import uuid
import time
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, monotonically_increasing_id, lit, unix_timestamp, from_unixtime, date_sub, \
    collect_list, substring, col, regexp_replace, lower, when
from pyspark.sql.types import StringType
import pandas as pd


class CreateEnrichedUniverse(object):
    """Build the enriched operator universe for one country.

    The constructor only stores table names and settings. All Spark work
    happens in :meth:`create_universe`, which reads the configured tables
    through the notebook-level ``spark`` session.
    """

    def __init__(self, fn_universe, fn_channel_tier, fn_prod_map, fn_op_sales, fn_reachable, fn_osm,
                 fn_restaurant_details, fn_ohub_operators_df, fn_rex, fn_dex, fn_sap_table, fn_take_away_mapping,
                 fn_sales_rep, country_code, visit_expiration_threshold=730):
        """Store the source table names, the country code and the visit window.

        ``visit_expiration_threshold`` is forwarded unchanged to
        ``VisitedOperators``; every ``fn_*`` argument is a table name passed
        to ``spark.table``.
        """
        self.fn_universe = fn_universe
        self.fn_channel_tier = fn_channel_tier
        self.fn_prod_map = fn_prod_map
        self.fn_op_sales = fn_op_sales
        self.fn_reachable = fn_reachable
        self.fn_osm = fn_osm
        self.fn_restaurant_details = fn_restaurant_details
        self.fn_ohub_operators_df = fn_ohub_operators_df
        self.fn_rex = fn_rex
        self.fn_dex = fn_dex
        self.fn_sap_table = fn_sap_table
        self.fn_take_away_mapping = fn_take_away_mapping
        self.fn_sales_rep = fn_sales_rep
        self.country_code = country_code
        self.visit_expiration_threshold = visit_expiration_threshold

    def create_universe(self):
        """Run the whole pipeline.

        Returns:
            tuple: ``(op_sales_not_in_universe, df)`` - the enriched buying
            operators that are missing from the universe, and the enriched
            universe itself.
        """
        universe, channel_tier, prod_map = self.load_data()
        buying_operators = BuyingOperators(self.fn_ohub_operators_df, self.fn_op_sales, self.country_code)
        universe, op_sales_not_in_universe = buying_operators.get_all_buying_operators(universe)
        op_sales_not_in_universe = self.create_op_not_in_universe(op_sales_not_in_universe)
        df = self.enrich_op_in_universe(universe, channel_tier, prod_map)
        return op_sales_not_in_universe, df

    def create_op_not_in_universe(self, op_sales_not_in_universe):
        """Enrich the buying operators that do not occur in the universe.

        Adds visit information, SAP ids, the take-away flag and the sales rep
        (matched on ``zipCode``), then removes exact duplicate rows.
        """
        op_sales_not_in_universe = VisitedOperators(self.fn_rex, self.fn_dex, self.country_code, self.visit_expiration_threshold).create_visited_column(op_sales_not_in_universe)
        op_sales_not_in_universe = self.combine_with_sap_ids(op_sales_not_in_universe)
        op_sales_not_in_universe = self.make_take_away_boolean(op_sales_not_in_universe)
        op_sales_not_in_universe = self.combine_with_sales_reps(op_sales_not_in_universe, merge_col='zipCode')
        op_sales_not_in_universe = op_sales_not_in_universe.distinct()
        print('op_sales_not_in_universe after duplicates: ' + str(op_sales_not_in_universe.count()))
        return op_sales_not_in_universe

    def enrich_op_in_universe(self, universe, channel_tier, prod_map):
        """Enrich the universe itself.

        Adds value tier and product mapping, visit information, SAP ids,
        phone data, the take-away flag and the sales rep (matched on
        ``postalCode``), then removes exact duplicate rows.
        """
        df = self.combine_tables(universe, channel_tier, prod_map)
        df = VisitedOperators(self.fn_rex, self.fn_dex, self.country_code, self.visit_expiration_threshold).create_visited_column(df)
        df = self.combine_with_sap_ids(df)
        df = PhoneData(self.fn_reachable, self.fn_osm, self.fn_restaurant_details, self.country_code).load_and_combine_phone_data(df)
        df = self.make_take_away_boolean(df)
        df = self.combine_with_sales_reps(df, merge_col='postalCode')
        df = df.distinct()
        print('universe after duplicates: ' + str(df.count()))
        return df

    def load_data(self):
        """Load the universe (fixed column selection), channel/tier and product mapping tables."""
        universe = spark.table(self.fn_universe).select(['osmId','operatorid','PlaceIDGoogle','ohubID','concatID','name','address','postalcode','city','cuisineType','MainBusinessType','AllBusinessTypes','Website','Latitude','Longitude','source','keyword','tagReverseGeocode','rationale'])
        print('whole universe: ' + str(universe.count()))
        channel_tier = spark.table(self.fn_channel_tier)
        prod_map = spark.table(self.fn_prod_map)
        return universe, channel_tier, prod_map

    def combine_tables(self, universe, channel_tier, prod_map):
        """
        Combine the universe, the channel and tier scoring data and the product mapping data into one dataframe
        """
        channel_tier = channel_tier.withColumnRenamed("operatorId", "operatorid").select(
            ["operatorid", "valueTier", "valueTierDescription", "d_buyer"])
        df = universe.join(channel_tier, on=['operatorid'], how='left')
        # The product mapping is keyed by the Google place id, stored in its `id` column.
        df = df.join(prod_map, df.PlaceIDGoogle == prod_map.id, how='left')
        print('universe with value tier and menu data : ' + str(df.count()))
        return df

    def combine_with_sap_ids(self, df):
        """Left-join the ``SAPID`` column of the SAP table onto ``df`` by ``ohubID``."""
        sap_table = spark.table(self.fn_sap_table).select('ohubID', 'SAPID')
        return df.join(sap_table, on='ohubID', how='left')

    def make_take_away_boolean(self, df):
        """Add ``uniqueID`` and a boolean ``take_away`` column to ``df``.

        Operators found in the take-away mapping get ``take_away = True``,
        all others ``False``. Whitespace is stripped from the operator id on
        both sides before matching.
        """
        take_away_df = (
            spark.table(self.fn_take_away_mapping)
            .withColumn('operatorIDClean', regexp_replace(col('operatorID'), r"\s", ""))
            .withColumn('take_away', lit(True))
            .select(['operatorIDClean', 'uniqueID', 'take_away'])
            # De-duplicate after the projection: rows that only differed in
            # dropped columns would otherwise multiply the join result.
            .distinct()
            .withColumnRenamed('operatorIDClean', 'operatorid')
        )
        df = df.withColumn('operatorid', regexp_replace(col('operatorid'), r"\s", ""))
        df = df.join(take_away_df, on='operatorid', how='left')
        df = df.withColumn('take_away', when(col('take_away').isNull(), False).otherwise(col('take_away')))
        # `== True` builds a Spark Column expression; it is not a Python truth test.
        print('operators with true take away: ' + str(df.filter(col('take_away') == True).count()))  # noqa: E712
        return df

    @staticmethod
    def _strip_spaces_and_letters(column_name):
        """Return a Column holding ``column_name`` with spaces and ASCII letters removed."""
        without_spaces = regexp_replace(col(column_name), ' ', '')
        return regexp_replace(without_spaces, '[a-zA-Z]+', '')

    def combine_with_sales_reps(self, df, merge_col):
        """Left-join ``SALESREP`` onto ``df`` using a cleaned postal code.

        Both sides get a ``<merge_col>_cleaned`` column (spaces and letters
        removed), which is the join key and is kept in the result.

        NOTE(review): different postal codes can collapse to the same cleaned
        key; if they carry different sales reps the join still yields one row
        per rep - confirm that this is intended.
        """
        cleaned_col = merge_col + '_cleaned'
        sales_rep_df = (
            spark.table(self.fn_sales_rep)
            .select(['POSTAL CODE11', 'SALESREP'])
            .withColumnRenamed('POSTAL CODE11', merge_col)
            .withColumn(cleaned_col, self._strip_spaces_and_letters(merge_col))
            .select([cleaned_col, 'SALESREP'])
            # De-duplicate after cleaning so identical (key, rep) pairs do not
            # multiply the rows of `df`.
            .distinct()
        )
        df = df.withColumn(cleaned_col, self._strip_spaces_and_letters(merge_col))
        return df.join(sales_rep_df, on=cleaned_col, how='left')


class BuyingOperators(object):
    """Flag buying operators in the universe and collect the buying operators missing from it."""

    def __init__(self, fn_ohub_operators_df, fn_op_sales, country_code):
        """Store the operator and operator-sales table names and the country code to filter on."""
        self.fn_ohub_operators_df = fn_ohub_operators_df
        self.fn_op_sales = fn_op_sales
        self.country_code = country_code

    def get_all_buying_operators(self, universe):
        """Add a ``buying`` column to ``universe`` and build the not-in-universe frame.

        Returns:
            tuple: ``(universe, op_sales_not_in_universe)``. The first is the
            universe left-joined with the buying flag. The second holds the
            buying operators whose ohub id is absent from the universe,
            enriched with operator details and opt-in phone numbers, one row
            per ``ohubId`` before the final operator-id lookup.
        """
        op_sales = (
            self.load_operator_sales()
            .select(['operatorOhubId'])
            .withColumnRenamed('operatorOhubId', 'ohubId')
            .distinct()
            # The flag is stored as the string 'true', not as a boolean.
            .withColumn('buying', lit('true'))
        )
        universe = universe.join(op_sales, on='ohubID', how='left')
        op_sales_not_in_universe = self.get_operators_not_in_universe(universe, op_sales)
        ohub_operator_df, all_info_no_phone = self.get_operator_details()
        all_ohub_info = self.get_optin_phonenumbers(ohub_operator_df, all_info_no_phone)
        op_sales_not_in_universe = op_sales_not_in_universe.join(
            all_ohub_info, on='ohubId', how='left').dropDuplicates(['ohubId'])
        op_sales_not_in_universe = self.match_operator_id(universe, op_sales_not_in_universe)
        return universe, op_sales_not_in_universe

    def load_operator_sales(self):
        """Load the operator sales table restricted to ``self.country_code`` (column lower-cased before comparing)."""
        op_sales_df = spark.table(self.fn_op_sales)
        op_sales_df = op_sales_df.where(lower(col('countryCode')) == self.country_code)
        print('sales : ' + str(op_sales_df.count()))
        return op_sales_df

    def get_operators_not_in_universe(self, universe, op_sales):
        """Return the rows of ``op_sales`` whose ohub id does not occur in ``universe``.

        Implemented as a distributed anti-join instead of collecting every
        universe id to the driver. Rows of ``op_sales`` without an ohub id
        are dropped explicitly, which is what the earlier ``~isin(...)``
        filter did implicitly (a null comparison never evaluates to true).
        """
        universe_ids = universe.select('ohubID').withColumnRenamed('ohubID', 'ohubId').distinct()
        identified_sales = op_sales.filter(col('ohubId').isNotNull())
        return identified_sales.join(universe_ids, on='ohubId', how='left_anti')

    def get_operator_details(self):
        """Load the operators of ``self.country_code``.

        Returns:
            tuple: ``(ohub_operator_df, all_info_no_phone)`` - the filtered
            operator table with all its columns, and a projection of it
            without the phone columns.
        """
        ohub_operator_df = spark.table(self.fn_ohub_operators_df)
        ohub_operator_df = ohub_operator_df.filter(lower(col('countryCode')) == self.country_code)
        all_info_no_phone = ohub_operator_df.select(
            ['ohubId', 'name', 'street', 'city', 'isActive', 'chainId', 'website', 'zipCode',
             'countryCode', 'customerType', 'kitchenType', 'menuKeywords', 'potentialSalesValue',
             'globalChannel'])
        return ohub_operator_df, all_info_no_phone

    def get_optin_phonenumbers(self, ohub_operator_df, all_info_no_phone):
        """Attach phone columns to the operator details, only for operators with a mobile opt-in."""
        opt_in_phones = ohub_operator_df.select(
            ['ohubId', 'hasMobileOptIn', 'phoneNumber', 'mobileNumber']
        ).filter(lower(col('hasMobileOptIn')) == 'true')
        return all_info_no_phone.join(opt_in_phones, on='ohubId', how='left')

    def match_operator_id(self, universe, op_sales_not_in_universe):
        """Left-join the universe's ``operatorID`` onto the not-in-universe frame by ``ohubId``."""
        operator_id_match = universe.select(['operatorID', 'ohubID']).distinct().withColumnRenamed('ohubID', 'ohubId')
        return op_sales_not_in_universe.join(operator_id_match, on='ohubId', how='left')


class VisitedOperators(object):
    """Collect recently visited operators from the REX, DEX and ohub activity tables."""

    def __init__(self, fn_rex, fn_dex, country_code, visit_expiration_threshold):
        """Store the REX/DEX table names, the country code and the visit window in days."""
        self.fn_rex = fn_rex
        self.fn_dex = fn_dex
        self.visit_expiration_threshold = visit_expiration_threshold
        self.country_code = country_code

    def create_visited_column(self, df):
        """
        Mark the businesses that have been visited in the last 'visit_expiration_threshold' days.

        Left-joins the visited ohub ids onto ``df`` by ``ohubId``, which adds
        the columns ``SAP_DEX_ID`` and ``is_visited``. No rows are removed;
        operators without a recent visit get nulls in both columns.
        """
        visited_ohub_ids = self.__get_visited_ohubids()
        df = df.join(visited_ohub_ids, on='ohubId', how='left')
        print('universe with visited operators : ' + str(df.count()))
        return df

    def __get_visited_ohubids(self):
        """Return distinct ``(ohubId, SAP_DEX_ID, is_visited)`` rows from all three visit sources."""
        rex_data = self.__load_rex()
        dex_data, account_ohub_table = self.__load_dex()
        # REX has no ohub id of its own; borrow it from DEX through the account id.
        rex_data = rex_data.join(account_ohub_table, on='AccountID', how='left')
        # union() matches columns by position: both frames are
        # (AccountID, visited_date, ohubId) at this point.
        sf_data = rex_data.union(dex_data)
        visited_in_ohub = self.__get_visited_ohub()
        sf_data = sf_data.select(['ohubId', 'AccountID']).union(visited_in_ohub)
        # The flag is stored as the string 'true', not as a boolean.
        visited_ohub_ids = sf_data.withColumn('is_visited', lit('true')).distinct()
        return visited_ohub_ids.withColumnRenamed('AccountID', 'SAP_DEX_ID')

    def __load_rex(self):
        """Load the REX visits that ended within the visit window as ``(AccountID, visited_date)``."""
        rex_data = spark.table(self.fn_rex).select(['SAP DEX ID', 'Execution End'])
        rex_data = rex_data.withColumnRenamed('SAP DEX ID', 'AccountID')
        rex_data = rex_data.withColumn(
            'visited_date',
            from_unixtime(unix_timestamp('Execution End', 'dd-MM-yyyy HH:mm')),
        ).select(['AccountID', 'visited_date'])
        start_date = datetime.now() - timedelta(days=self.visit_expiration_threshold)
        return rex_data.filter(rex_data['visited_date'] > start_date)

    def __load_dex(self):
        """Load the DEX visits of ``self.country_code`` within the visit window.

        Returns:
            tuple: ``(dex_data, account_ohub_table)`` - the recent visits as
            ``(AccountID, visited_date, ohubId)`` and the AccountID-to-ohubId
            lookup, which is taken from the whole DEX table before the
            country and date filters are applied.
        """
        dex_data = spark.table(self.fn_dex)
        account_ohub_table = dex_data.select(['AccountID', 'ohubId'])
        dex_data = dex_data.filter(lower(col('country')) == self.country_code).select(
            ['AccountID', 'DateFromCET', 'ohubId'])
        dex_data = self.__clean_years(dex_data)
        dex_data = dex_data.withColumn(
            'visited_date',
            from_unixtime(unix_timestamp('DateFromCET', 'dd-MM-yyyy')),
        ).select(['AccountID', 'visited_date', 'ohubId'])
        start_date = datetime.now() - timedelta(days=self.visit_expiration_threshold)
        dex_data = dex_data.filter(dex_data['visited_date'] > start_date)
        return dex_data, account_ohub_table

    def __clean_years(self, dex_data):
        """Rewrite the year suffixes -17..-20 in ``DateFromCET`` as -2017..-2020.

        NOTE(review): only these four years are handled, and the patterns are
        not anchored to the end of the value - presumably the column holds
        'dd-MM-yy' dates from that period; confirm before using newer data.
        """
        # '-20' has to be replaced first: run later, it would also match the
        # '-20' inside an already expanded '-2017', '-2018' or '-2019'.
        for short_year in ('20', '17', '18', '19'):
            dex_data = dex_data.withColumn(
                'DateFromCET',
                regexp_replace(col('DateFromCET'), '-' + short_year, '-20' + short_year))
        return dex_data

    def __get_visited_ohub(self):
        """Return the distinct ``(ohubId, AccountID)`` pairs of recent 'SalesVisit' activities in ohub.

        The country filter uses ``self.country_code``, like the other loaders
        in this class.
        """
        ohub = spark.table('dev_sources_ohub.raw_activities').filter(
            lower(col('countryCode')) == self.country_code)
        ohub = ohub.withColumn('activity_date', from_unixtime(unix_timestamp('activityDate', 'yyyy-MM-dd')))
        start_date = datetime.now() - timedelta(days=self.visit_expiration_threshold)
        ohub = ohub.filter(ohub['activity_date'] > start_date)
        ohub_visits = ohub.filter(col('actionType') == 'SalesVisit')
        return ohub_visits.select('ohubId', 'sourceEntityId').withColumnRenamed(
            'sourceEntityId', 'AccountID').distinct()


class PhoneData(object):
    """Attach phone numbers from the reachable, OSM and restaurant-details tables to a frame."""

    def __init__(self, fn_reachable, fn_osm, fn_restaurant_details, country_code):
        """Store the three phone source table names and the country code."""
        self.fn_reachable = fn_reachable
        self.fn_osm = fn_osm
        self.fn_restaurant_details = fn_restaurant_details
        self.country_code = country_code

    def load_and_combine_phone_data(self, df):
        """Left-join all three phone sources onto ``df``.

        The join keys are ``ohubId`` (reachable), ``osmId`` (OSM) and
        ``PlaceIDGoogle`` (restaurant details).
        """
        reachable_df = self.get_reachable_phone_numbers()
        df = df.join(reachable_df, on='ohubId', how='left')
        osm_df = self.get_osm_phone_numbers()
        df = df.join(osm_df, on='osmId', how='left')
        rest_det_df = self.get_rest_det_phone_numbers()
        df = df.join(rest_det_df, on='PlaceIDGoogle', how='left')
        print('universe with phone data : ' + str(df.count()))
        return df

    def get_reachable_phone_numbers(self):
        """Return distinct ``(ohubId, cp_mobileNumber)`` pairs.

        Only rows of ``self.country_code`` with an active operator and a
        mobile opt-in are kept.
        """
        reachable_df = spark.table(self.fn_reachable)
        reachable_df = reachable_df.filter(lower(col('op_countryCode')) == self.country_code)
        reachable_df = reachable_df.filter(lower(col('op_isActive')) == 'true')
        reachable_df = reachable_df.filter(lower(col('cp_hasMobileOptIn')) == 'true')
        reachable_df = reachable_df.select(['op_ohubId', 'cp_mobileNumber'])
        # De-duplicate after the projection: rows that only differed in
        # dropped columns would otherwise multiply the join result.
        reachable_df = reachable_df.dropDuplicates()
        return reachable_df.withColumnRenamed('op_ohubId', 'ohubId')

    def get_osm_phone_numbers(self):
        """Return ``(osmId, phone)`` from the OSM table, without numbers starting with 06 or +316."""
        osm_df = spark.table(self.fn_osm)
        osm_df = self.filter_out_06_numbers(osm_df, colname='phone')
        osm_df = osm_df.select(['id', 'phone'])
        return osm_df.withColumnRenamed('id', 'osmId')

    def get_rest_det_phone_numbers(self):
        """Return distinct ``(countryGoogle, PlaceIDGoogle, InternationalPhoneNumberGoogle)`` rows.

        NOTE(review): the country filter is the literal 'netherlands' and does
        not follow ``self.country_code`` - confirm before reusing this class
        for another country.
        """
        rest_det_df = spark.table(self.fn_restaurant_details)
        rest_det_df = rest_det_df.filter(lower(col('countryGoogle')) == 'netherlands')
        rest_det_df = self.filter_out_06_numbers(rest_det_df, colname='InternationalPhoneNumberGoogle')
        return rest_det_df.select(
            ['countryGoogle', 'PlaceIDGoogle', 'InternationalPhoneNumberGoogle']).dropDuplicates()

    def filter_out_06_numbers(self, df, colname):
        """Strip spaces from ``colname`` and drop the rows whose value starts with '06' or '+316'.

        Rows where ``colname`` is null are dropped as well, because the
        negated ``rlike`` evaluates to null for them.
        """
        df = df.withColumn(colname, regexp_replace(col(colname), " ", ""))
        df = df.filter(~df[colname].rlike('^06'))
        # Raw string: '\+' is a regex escape, not a Python string escape.
        return df.filter(~df[colname].rlike(r'^\+316'))


In [3]:
# Source tables read by the enrichment pipeline.
fn_universe = 'dev_derived_ouniverse.output_total_universe_nl'
fn_channel_tier = 'lead_intelligence_emea.NL_TotalUniverseChannelvalueDeliveryTier'
fn_prod_map = 'lead_management.nl_business_product_mapping_wide_20200312'
fn_op_sales = 'data_model.ohub_operator_sales'
fn_reachable = 'data_fpo.reachable_fpo_golden_20200414'
fn_osm = 'data_osm.osm_processed_nl_20191021'
fn_restaurant_details = 'data_menu.restaurant_details'
fn_ohub_operators_df = 'data_datascience_prod.operators'
fn_rex = 'dev_sources_rexlite.account_with_activity'
fn_dex = 'dev_sources_dex.cleaned_salesvisits_telesales_2019_enriched'
fn_sap_table = 'data_cleaning.nl_sfa_operators'
fn_take_away_mapping = 'data_user_guus.delivery_match_nl_20200320'
fn_sales_rep = 'data_user_celeste.salesrep_file_nl'

# Run the pipeline for the Netherlands with a visit window of 730 days.
op_sales_not_in_universe, df = CreateEnrichedUniverse(
    fn_universe=fn_universe,
    fn_channel_tier=fn_channel_tier,
    fn_prod_map=fn_prod_map,
    fn_op_sales=fn_op_sales,
    fn_reachable=fn_reachable,
    fn_osm=fn_osm,
    fn_restaurant_details=fn_restaurant_details,
    fn_ohub_operators_df=fn_ohub_operators_df,
    fn_rex=fn_rex,
    fn_dex=fn_dex,
    fn_sap_table=fn_sap_table,
    fn_take_away_mapping=fn_take_away_mapping,
    fn_sales_rep=fn_sales_rep,
    country_code='nl',
    visit_expiration_threshold=730,
).create_universe()

In [4]:
# Render the enriched universe returned by create_universe() in the notebook output.
# NOTE(review): `display` is not defined in this file - presumably the Databricks notebook helper; confirm.
display(df)

postalCode_cleaned,operatorid,PlaceIDGoogle,osmId,ohubID,concatID,name,address,postalcode,city,cuisineType,MainBusinessType,AllBusinessTypes,Website,Latitude,Longitude,source,keyword,tagReverseGeocode,rationale,buying,valueTier,valueTierDescription,d_buyer,id,keyword.1,uniqueProductCount,SAP_DEX_ID,is_visited,SAPID,cp_mobileNumber,phone,countryGoogle,InternationalPhoneNumberGoogle,uniqueID,take_away,SALESREP
3582.0,/ChIJ--YNHAFmxkcRXis8PAxfOm8/7d9c190a-81d3-40a8-b2e3-22f7deb77db0,ChIJ--YNHAFmxkcRXis8PAxfOm8,,7d9c190a-81d3-40a8-b2e3-22f7deb77db0,NL~RFB~118954,Intratuin Utrecht,"Koningsweg 87, 3582 GC Utrecht, Netherlands",3582 GC,Utrecht,Unknown,Other Restaurant,"[Other, Other Restaurant, Cafe]",https://www.intratuin.nl/winkels/intratuin-utrecht,52.0752593,5.1374849,GPL,,,origin,,5.0,€256.0 - €282.0 (average value = 269.0),0.0,,,,,,,,,Netherlands,31302514591.0,,False,Mantel
9301.0,/33fe4e4b-b69a-4494-8b52-bbfaf92590d2,ChIJ-2LZYQo0yEcRKjCgoYfdwAM,,33fe4e4b-b69a-4494-8b52-bbfaf92590d2,NL~RFB~178313,V.D.H. Products BV,,9301ZS,Roden,Unknown,Other,[Other],,53.1489938,6.4455677,GPL,,,origin,,4.0,€282.0 - €304.0 (average value = 294.0),0.0,,,,,,,,,Netherlands,31503028900.0,,False,Renkema
1071.0,2816542936/ChIJ-2p1X_EJxkcRBAvOayKjGfs/,ChIJ-2p1X_EJxkcRBAvOayKjGfs,2816542936.0,,,Bahya,Roelof Hartstraat,1071VM,Amsterdam,moroccan,Fast food restaurant,"[Other, Fast food restaurant ]",https://www.bahya.nl/,52.3521441,4.885117,OSM,,,origin,,7.0,€156.0 - €211.0 (average value = 188.0),0.0,,,,,,,,,Netherlands,31203547757.0,,False,de Graaff
7667.0,/c7c92051-636d-4aea-abfc-a59731793a8f,ChIJ-3dt2TwbuEcRJeANbG3j3pQ,,c7c92051-636d-4aea-abfc-a59731793a8f,NL~RFB~635734,TDI Techniek,,7667PJ,Reutum,Unknown,Other,[Other],,52.3932351,6.8345571,GPL,,,origin,,3.0,€304.0 - €319.0 (average value = 312.0),0.0,,,,,,,,,Netherlands,31541670941.0,,False,Jager
4569.0,/ChIJ-89e0Wx3xEcRC89qQFTJZX0/14a1c08d-b4f3-4eda-af05-7b3dfce2fe95,ChIJ-89e0Wx3xEcRC89qQFTJZX0,,14a1c08d-b4f3-4eda-af05-7b3dfce2fe95,NL~DEX~1003746381,Herberg de Graauw,"Dorpsplein 7, 4569 AL Graauw, Netherlands",4569 AL,Graauw,Unknown,Hotel,"[Other, Hotel, Other Restaurant]",https://www.facebook.com/herberggraauw/,51.3307279,4.1042921,GPL,,,origin,True,4.0,€282.0 - €304.0 (average value = 294.0),0.0,,,,,,1.0037463815100211e+17,,,,,,False,Timmerman
1785.0,/35233395-c258-4547-957d-7065521b179a,ChIJ-_0jNT1Iz0cRtnSXMsIOfZw,,35233395-c258-4547-957d-7065521b179a,NL~RFB~296141,Sporthal de Brug,,1785BC,Den helder,Unknown,Other,[Other],,52.938922,4.7818827,GPL,,,origin,,5.0,€256.0 - €282.0 (average value = 269.0),0.0,,,,,,,,,Netherlands,31223634486.0,,False,Mierlo
4205.0,877756113/ChIJ0Q-9cw-GxkcRO4O6zY76opc/1551e65c-a1c1-4eb5-b286-6aaff4066a15,ChIJ0Q-9cw-GxkcRO4O6zY76opc,877756113.0,1551e65c-a1c1-4eb5-b286-6aaff4066a15,NL~RFB~122066,Kwalitaria Bon Appetit,Kleine Haarsekade,4205VB,Gorinchem,Unknown,Fast food restaurant,"[Other, Fast food restaurant ]",http://www.kwalitaria.nl/,51.8395153,4.970626,OSM,,,origin,,4.0,€282.0 - €304.0 (average value = 294.0),0.0,,,,,,,,,Netherlands,31183620880.0,,False,Wageningen
6511.0,/ChIJ0UrpmUUIx0cRHKFfvivbAVM/,ChIJ0UrpmUUIx0cRHKFfvivbAVM,,,,'t Hoogstraatje,"Hoogstraat 3, 6511 RT Nijmegen, Netherlands",6511 RT,Nijmegen,Unknown,Other Restaurant,"[Other, Other Restaurant]",http://www.hoogstraatje.nl/,51.8467349,5.8681826,GPL,,,origin,,6.0,€211.0 - €256.0 (average value = 234.0),0.0,,,,,,,,,Netherlands,31243604659.0,,False,Weiss
1312.0,/ChIJ0XRGp9YWxkcRLHiCjyRyCM4/,ChIJ0XRGp9YWxkcRLHiCjyRyCM4,,,,"Nautilus College, afdeling PGL","Lierstraat 17, 1312 JZ Almere, Netherlands",1312 JZ,Almere,Unknown,Other,[Other],http://nautilus.eduvier.nl/,52.3708305,5.1950841,GPL,,,origin,,5.0,€256.0 - €282.0 (average value = 269.0),0.0,,,,,,,,,Netherlands,31365368233.0,,False,Vink
4926.0,/ChIJ0eUOKBsnxEcRsQEVupT8_2Q/,ChIJ0eUOKBsnxEcRsQEVupT8_2Q,,,,V.O.F. Café Ligthart,"Loonsedijk 36, 4926 RL Lage Zwaluwe, Netherlands",4926 RL,Lage Zwaluwe,Unknown,Cafe,"[Other, Cafe]",,51.7045825,4.7109184,GPL,,,origin,,4.0,€282.0 - €304.0 (average value = 294.0),0.0,,,,,,,,,Netherlands,31168484475.0,,False,Timmerman


In [5]:
# List up to 20 distinct sales reps assigned in the enriched universe, without truncating the names.
distinct_sales_reps = df.select('SALESREP').distinct()
distinct_sales_reps.show(n=20, truncate=False)

In [6]:
# Location where to save the Delta Table in the DBFS
deltaTable = "/mnt/datamodel/dev/derived/ouniverse/output_total_universe_nl_enriched"

# Location where to put the table in the Databricks database menu
hiveTable = "dev_derived_ouniverse.output_total_universe_nl_enriched"

# Write the enriched universe to a Delta Table.
# `df` is the enriched frame returned by create_universe(); no `universe`
# variable exists at notebook scope in the cells shown here, and the target
# is the "_enriched" table.
# NOTE(review): the displayed output of `df` lists `keyword` twice (shown as
# `keyword` and `keyword.1`), presumably once from the universe and once from
# the product mapping - confirm that the write accepts this schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(deltaTable)
)

# Load the Delta Table into the Data interface within Databricks.
sqlQuery1 = "drop table if exists " + hiveTable
sqlQuery2 = "create table " + hiveTable + " using delta location " + "'" + deltaTable + "'"
spark.sql(sqlQuery1)
spark.sql(sqlQuery2)