# Imports e Constantes

In [3]:
import json

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, dense_rank, rank
from pyspark.sql.types import IntegerType, StringType, StructField, StructType, TimestampType

# Locations read and written by this notebook.
TYPES_MAPPING = "config/types_mapping.json"  # JSON file consulted to look up column types
INPUT_PATH = "data/input/users/load.csv"  # source CSV with the user records
OUTPUT_PATH = "data/output/users/users.parquet"  # receives the rank == 1 rows
DEDUPLICATED_PATH = "data/output/users/deduplicated.parquet"  # receives the rank != 1 rows

# Spark Session

In [4]:
# Build the session (or reuse an existing one, via getOrCreate) against the
# master at spark://spark-master:7077 with 1024m of executor memory.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1024m")
    .getOrCreate()
)

21/07/18 02:31:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Conversão de Tipos

In [5]:
# Load the types-mapping JSON into a DataFrame; the multiLine option lets
# Spark parse a JSON document that spans several lines.
df_schema = (
    spark.read
    .option("multiLine", True)
    .json(TYPES_MAPPING)
)

def str_to_dtypes(type: str):
    """Translate a column's configured type name into a PySpark data type.

    Looks up the value stored under column ``type`` in the first row of
    ``df_schema`` and maps ``"integer"`` to ``IntegerType`` and
    ``"timestamp"`` to ``TimestampType``. Any other value falls back to
    ``StringType``.

    Note: the parameter name shadows the ``type`` builtin inside this
    function; it is kept so existing callers are unaffected.
    """
    first_row = df_schema.select(type).collect()[0]
    type_name = first_row[0]

    known_types = {"integer": IntegerType(), "timestamp": TimestampType()}
    # Unknown type names default to a string column.
    return known_types.get(type_name, StringType())

                                                                                

# Criação de Schema. (types_mapping.json)

In [8]:
# Explicit schema for the users CSV: `id` and the four text columns are fixed
# here, while the last three columns take their type from str_to_dtypes().
# Every field is nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        *[
            StructField(field_name, StringType(), True)
            for field_name in ("name", "email", "phone", "address")
        ],
        *[
            StructField(field_name, str_to_dtypes(field_name), True)
            for field_name in ("age", "create_date", "update_date")
        ],
    ]
)

# Displayed as the cell output to check the resolved types at a glance.
schema.simpleString()

                                                                                

'struct<id:int,name:string,email:string,phone:string,address:string,age:int,create_date:timestamp,update_date:timestamp>'

# Criação do DataFrame com Schema

In [9]:
# Read the input CSV (first line is a header) using the explicit schema
# rather than letting Spark infer column types.
# NOTE(review): the sample output further down shows e-mail addresses under
# `name` and person names under `email` -- confirm that the schema's column
# order matches the column order of the CSV file.
df_raw = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv(INPUT_PATH)
)

In [10]:
# Print the column names, types and nullability of the loaded DataFrame.
df_raw.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- create_date: timestamp (nullable = true)
 |-- update_date: timestamp (nullable = true)



# Deduplicação dos dados

In [11]:
# Within each id, order rows from the newest to the oldest update_date.
# NOTE(review): rank() gives tied rows the same rank, so two rows of one id
# sharing the latest update_date would both receive rank 1 -- row_number()
# would break such ties if exactly one row per id is required.
window = (
    Window
    .partitionBy("id")
    .orderBy(col("update_date").desc())
)

# Preview the ranking without truncating long column values.
(
    df_raw
    .withColumn("rank", rank().over(window))
    .show(truncate=False)
)

                                                                                

+---+----------------------------------+---------------------+---------------+----------------------------------------------+---+--------------------------+--------------------------+----+
|id |name                              |email                |phone          |address                                       |age|create_date               |update_date               |rank|
+---+----------------------------------+---------------------+---------------+----------------------------------------------+---+--------------------------+--------------------------+----+
|1  |david.lynch@cognitivo.ai          |David Lynch          |(11) 99999-9999|Mulholland Drive, Los Angeles, CA, US         |72 |2018-03-03 18:47:01.954752|2018-05-23 10:13:59.594752|1   |
|1  |david.lynch@cognitivo.ai          |David Lynch          |(11) 99999-9998|Mulholland Drive, Los Angeles, CA, US         |72 |2018-03-03 18:47:01.954752|2018-04-14 17:09:48.558151|2   |
|1  |david.lynch@cognitivo.ai          |David Lynch    

# DataFrame com os registros duplicados (versões antigas, rank != 1)

In [12]:
# Rows whose rank is not 1, i.e. every row that is NOT the most recently
# updated one for its id. Despite the variable name, these are the
# superseded versions that deduplication sets aside.
df_deduplicated = (
    df_raw
    .withColumn("rank", rank().over(window))
    .filter(col("rank") != 1)
)

# DataFrame deduplicado (registro mais recente de cada id, rank == 1)

In [13]:
# Rows with rank 1, i.e. the most recently updated row of each id. Despite
# the variable name, this is the deduplicated result.
df_no_deduplicated = (
    df_raw
    .withColumn("rank", rank().over(window))
    .filter(col("rank") == 1)
)

# Salvando arquivos em formato Parquet

In [14]:
# Persist both splits as Parquet without the helper `rank` column, replacing
# any previous output at the target paths.
# DataFrameWriter.parquet() returns None, so its result is deliberately not
# assigned: binding it back to df_deduplicated / df_no_deduplicated would
# replace the DataFrames with None and make this cell fail with
# AttributeError when it is run a second time.
df_deduplicated.drop("rank").write.mode("overwrite").parquet(DEDUPLICATED_PATH)
df_no_deduplicated.drop("rank").write.mode("overwrite").parquet(OUTPUT_PATH)

                                                                                

In [15]:
# Read both Parquet outputs back from disk so the written data can be inspected.
df_deduplicated_parquet = spark.read.parquet(DEDUPLICATED_PATH)
df_no_deduplicated_parquet = spark.read.parquet(OUTPUT_PATH)

In [16]:
# Show the rows stored at DEDUPLICATED_PATH without truncating column values.
df_deduplicated_parquet.show(truncate=False)

+---+----------------------------------+---------------------+---------------+----------------------------------------------+---+--------------------------+--------------------------+
|id |name                              |email                |phone          |address                                       |age|create_date               |update_date               |
+---+----------------------------------+---------------------+---------------+----------------------------------------------+---+--------------------------+--------------------------+
|3  |spongebob.squarepants@cognitivo.ai|Spongebob Squarepants|(11) 91234-5678|124 Conch Street, Bikini Bottom, Pacific Ocean|13 |2018-05-19 04:07:06.854752|2018-05-19 04:07:06.854752|
|1  |david.lynch@cognitivo.ai          |David Lynch          |(11) 99999-9998|Mulholland Drive, Los Angeles, CA, US         |72 |2018-03-03 18:47:01.954752|2018-04-14 17:09:48.558151|
|1  |david.lynch@cognitivo.ai          |David Lynch          |(11) 99999-9997|Mu

In [18]:
# Show the rows stored at OUTPUT_PATH without truncating column values.
df_no_deduplicated_parquet.show(truncate=False)

+---+----------------------------------+---------------------+---------------+----------------------------------------------+---+--------------------------+--------------------------+
|id |name                              |email                |phone          |address                                       |age|create_date               |update_date               |
+---+----------------------------------+---------------------+---------------+----------------------------------------------+---+--------------------------+--------------------------+
|3  |spongebob.squarepants@cognitivo.ai|Spongebob Squarepants|(11) 98765-4321|122 Conch Street, Bikini Bottom, Pacific Ocean|13 |2018-05-19 04:07:06.854752|2018-05-19 05:08:07.964752|
|1  |david.lynch@cognitivo.ai          |David Lynch          |(11) 99999-9999|Mulholland Drive, Los Angeles, CA, US         |72 |2018-03-03 18:47:01.954752|2018-05-23 10:13:59.594752|
|2  |sherlock.holmes@cognitivo.ai      |Sherlock Holmes      |(11) 94815-1623|22