In [1]:
# Standard-library modules and the AWS SDK (boto3); PySpark itself is imported in the next cell
import os
import sys
import boto3

# Point both PySpark interpreter variables at the Python executable running this kernel.
for _pyspark_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_pyspark_var] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from botocore.exceptions import ClientError

In [3]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("InicializarDynamo")
    .getOrCreate()
)

In [4]:
# Load the sample CSV: ';'-separated, first row is the header, column types inferred.
df = spark.read.csv(
    '../data/amostragem.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [5]:
# Derive the normalized "status" column from the "Status Descrição" text:
#   "Concluído" -> "done", "A Fazer" -> "todo", anything else -> null.
status_description = col("Status Descrição")
normalized_status = (
    when(status_description == "Concluído", "done")
    .when(status_description == "A Fazer", "todo")
    .otherwise(None)
)
df = df.withColumn("status", normalized_status)

In [6]:
# Keep only the rows whose status was mapped (done / todo); null status rows are discarded.
has_status = col("status").isNotNull()
df = df.where(has_status)

In [7]:
# Keep only the tasks that belong to the user 'Jeferson Klau'.
is_target_user = col("Usuário") == "Jeferson Klau"
df = df.where(is_target_user)

In [8]:
# Discard the "Status Descrição" and "Tipo da Tarefa ID" source columns.
discarded_columns = ("Status Descrição", "Tipo da Tarefa ID")
df = df.drop(*discarded_columns)

In [9]:
# Rename the source headers to the attribute names selected for the final frame.
# Renames are applied one by one, in the order listed.
# NOTE(review): a lowercase "status" column was already created in an earlier cell;
# under Spark's default case-insensitive column resolution the last entry is
# presumably a no-op — confirm before removing it.
column_renames = {
    "Nome da Tarefa": "name",
    "Tipo da Tarefa": "taskType",
    "Data de Criação": "date",
    "Data de Conclusão": "dateCompleted",
    "Usuário": "user",
    "ID do Usuário": "userID",
    "Status": "status",
}
for source_name, target_name in column_renames.items():
    df = df.withColumnRenamed(source_name, target_name)

In [10]:
# Build the PK column: the literal "LIST#" followed by the creation date as yyyyMMdd
# (e.g. LIST#20240718).
compact_date = date_format(col("date"), "yyyyMMdd")
df = df.withColumn("PK", concat(lit("LIST#"), compact_date))

In [11]:
# Replace "date" with its yyyy-MM-dd text form.
df = df.withColumn(
    "date",
    date_format(col("date"), "yyyy-MM-dd"),
)

In [12]:
# Replace "dateCompleted" with its yyyy-MM-dd text form.
df = df.withColumn(
    "dateCompleted",
    date_format(col("dateCompleted"), "yyyy-MM-dd"),
)

In [13]:
# Preview up to 100 rows with full, untruncated cell contents.
df.show(n=100, truncate=False)

+------------------------------------------------------+------------------+----------+-------------+------+-------------+--------------------------------+-------------+
|name                                                  |taskType          |date      |dateCompleted|status|user         |userID                          |PK           |
+------------------------------------------------------+------------------+----------+-------------+------+-------------+--------------------------------+-------------+
|Cum iure exercitationem laboriosam                    |Tarefa a Ser Feita|2024-07-18|2025-02-12   |done  |Jeferson Klau|b4853fc1f03a3a4cec530a98a94d89ad|LIST#20240718|
|Asperiores iusto                                      |Tarefa a Ser Feita|2025-05-29|NULL         |todo  |Jeferson Klau|b4853fc1f03a3a4cec530a98a94d89ad|LIST#20250529|
|Asperiores pariatur voluptatibus magnam               |Tarefa a Ser Feita|2023-09-12|NULL         |todo  |Jeferson Klau|b4853fc1f03a3a4cec530a98a94d89ad|L

In [14]:
# Attach a temporary "row_id" column; it is only the input for the itemId hash
# and is dropped again once SK has been built.
df = df.withColumn(
    "row_id",
    monotonically_increasing_id(),
)

In [15]:
# itemId is the SHA-256 digest (sha2 with 256 bits) of the row id rendered as a string.
row_id_text = col("row_id").cast("string")
df = df.withColumn("itemId", sha2(row_id_text, 256))

In [16]:
# Build the SK column: the literal "ITEM#" followed by the row's itemId.
sort_key = concat(lit("ITEM#"), col("itemId"))
df = df.withColumn("SK", sort_key)

In [17]:
# The helper "row_id" column has served its purpose (itemId / SK exist); remove it.
helper_columns = ("row_id",)
df = df.drop(*helper_columns)

In [18]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# The id written to every row comes from the USER_ID environment variable.
# Fail fast when it is missing or empty: lit(None) would otherwise silently
# overwrite "userID" with null on every row, and that null would be written
# to the table.
userId = os.getenv("USER_ID")
if not userId:
    raise RuntimeError(
        "Variável de ambiente USER_ID não definida; "
        "defina-a no ambiente ou no arquivo .env antes de continuar."
    )

# Overwrite the "userID" column with the configured id.
df = df.withColumn("userID", lit(userId))

In [19]:
# Final projection: only these columns, in this order, make up each item that is written.
final_columns = [
    "PK",
    "SK",
    "date",
    "itemId",
    "name",
    "status",
    "dateCompleted",
    "taskType",
    "user",
    "userID",
]
df_final = df.select(*final_columns)

In [20]:
# Preview up to 100 rows of the final frame with untruncated cell contents.
df_final.show(n=100, truncate=False)

+-------------+---------------------------------------------------------------------+----------+----------------------------------------------------------------+------------------------------------------------------+------+-------------+------------------+-------------+------------------------------------+
|PK           |SK                                                                   |date      |itemId                                                          |name                                                  |status|dateCompleted|taskType          |user         |userID                              |
+-------------+---------------------------------------------------------------------+----------+----------------------------------------------------------------+------------------------------------------------------+------+-------------+------------------+-------------+------------------------------------+
|LIST#20240718|ITEM#5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a

In [None]:
# Handle to the target table ("TableTeste" in region sa-east-1).
dynamodb = boto3.resource('dynamodb', region_name='sa-east-1')  
table = dynamodb.Table("TableTeste")

# Pull every row of the final frame to the driver as a plain dict (column -> value).
# Null columns (e.g. dateCompleted on "todo" rows) arrive as None and are passed
# to put_item unchanged.
dados = [row.asDict() for row in df_final.collect()]  

try:
    # batch_writer buffers the puts and flushes them in batches; leaving the
    # `with` block flushes whatever is still pending.
    with table.batch_writer() as batch:
        for item in dados:  
            batch.put_item(Item=item)
    print("Inserção em lote concluída com sucesso!")
except ClientError as e:
    # Show the service's message, then re-raise so the cell is marked as failed
    # instead of looking like a successful load.
    print(f"Erro na inserção em lote: {e.response['Error']['Message']}")
    raise
except Exception as e:
    # Same policy for anything unexpected: report, but never swallow the error.
    print(f"Ocorreu um erro inesperado: {e}")
    raise

Inserção em lote concluída com sucesso!
