In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, DecimalType, TimestampType, DataType, IntegerType
from pyspark.sql.functions import col, lit, trim, substring, concat, udf, upper, initcap
from datetime import datetime
%run ./lib.py
import os

: 

In [48]:
conf = SparkConf() \
    .setAppName("projeto_pbi") \
    .setSparkHome('./spark/home')

LAKE_HOME = os.getenv("LAKE_HOME", "/spark/home")

sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [49]:
TABLE_NAME = 'dim_customer'

def create_sk(spark, df, key_column_name, table_name):
    sk = {}
    sk = df.select(col(key_column_name).alias("key")).rdd.zipWithIndex()
    new_sk = sk.map(lambda row: list(row[0]) + [row[1] + 1])
    new_sk_map = new_sk.collectAsMap()

    sk_schema = \
        StructType(
            [StructField('key', StringType(), True),
            StructField('SK', LongType(), True)]
        )

    sk_frame = spark.createDataFrame(new_sk, sk_schema)
    sk_frame.write.mode('overwrite').csv('{}/dataset/e-commerce/02_surrogate_key/sk_{}.csv'.format(LAKE_HOME, table_name), header=True)

    return new_sk_map

def locate_sk(mapping: dict):
    return udf(lambda x: mapping.get(x), IntegerType())

In [50]:
df = spark.read.csv('{}/dataset/e-commerce/01_extract/olist_customers_dataset.csv'.format(LAKE_HOME), header=True)

df_customer = \
    df \
        .withColumnRenamed('customer_id', 'ID_CUSTOMER') \
        .withColumnRenamed('customer_zip_code_prefix', 'COD_ZIP') \
        .withColumnRenamed('customer_city', 'DES_CITY') \
        .withColumnRenamed('customer_state', 'COD_STATE') \
        .drop('customer_unique_id')

In [51]:
sk = create_sk(spark=spark, df=df_customer, key_column_name='ID_CUSTOMER', table_name=TABLE_NAME)
sk

                                                                                

{'06b8999e2fba1a1fbc88172c00ba8bc7': 1,
 '18955e83d337fd6b2def6b18a428ac77': 2,
 '4e7b3e00288586ebd08712fdd0374a03': 3,
 'b2b6027bc5c5109e529d4dc6358b12c3': 4,
 '4f2d8ab171c80ec8364f7c12e35b23ad': 5,
 '879864dab9bc3047522c92c82e1212b8': 6,
 'fd826e7cf63160e536e0908c76c3f441': 7,
 '5e274e7a0c3809e14aba7ad5aae0d407': 8,
 '5adf08e34b2e993982a47070956c5c65': 9,
 '4b7139f34592b3a31687243a302fa75b': 10,
 '9fb35e4ed6f0a14a4977cd9aea4042bb': 11,
 '5aa9e4fdd4dfd20959cad2d772509598': 12,
 'b2d1536598b73a9abd18e0d75d92f0a3': 13,
 'eabebad39a88bb6f5b52376faec28612': 14,
 '1f1c7bf1c9b041b292af6c1c4470b753': 15,
 '206f3129c0e4d7d0b9550426023f0a08': 16,
 'a7c125a0a07b75146167b7f04a7f8e98': 17,
 'c5c61596a3b6bd0cee5766992c48a9a1': 18,
 '9b8ce803689b3562defaad4613ef426f': 19,
 '49d0ea0986edde72da777f15456a0ee0': 20,
 '154c4ded6991bdfa3cd249d11abf4130': 21,
 '690172ab319622688d3b4df42f676898': 22,
 '2938121a40a20953c43caa8c98787fcb': 23,
 '237098a64674ae89babdc426746260fc': 24,
 'cb721d7b4f271fd87011c4c

In [52]:
dim_customer = \
    df_customer \
        .withColumn('SK_CUSTOMER', locate_sk(sk)(col('ID_CUSTOMER')))

In [53]:
dim_customer = \
    dim_customer.select(
        col('SK_CUSTOMER'),
        col('ID_CUSTOMER'),
        col('COD_ZIP'),
        initcap(col('DES_CITY')).alias('DES_CITY'),
        col('COD_STATE')
    )

dim_customer.write.mode('overwrite').csv('{}/dataset/e-commerce/03_dim/{}.csv'.format(LAKE_HOME, TABLE_NAME), header=True)