In [41]:
import glob
import os
import pyodbc

from dotenv import load_dotenv

In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [43]:
# Obtain (or reuse) a Spark session running locally on every available core,
# with the Spark UI bound to port 4051.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Iniciando com Spark")
    .config('spark.ui.port', '4051')
    .getOrCreate()
)

In [44]:
# Input folders, given relative to the working directory. Each one is later
# scanned for *.csv files by transform_csv_to_df.
DATA_DIR = "Data"

clients = f"{DATA_DIR}/Clients"

transactions_in = f"{DATA_DIR}/Transactions-in"
transactions_out = f"{DATA_DIR}/Transactions-out"

In [45]:
# Explicit schema applied when reading the client CSV files (every column nullable).
clients_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("nome", StringType(), True)
    .add("email", StringType(), True)
    .add("data_cadastro", TimestampType(), True)
    .add("telefone", StringType(), True)
)

# Explicit schema shared by the incoming and outgoing transaction CSV files.
transactions_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("cliente_id", IntegerType(), True)
    .add("valor", DoubleType(), True)
    .add("data", TimestampType(), True)
)

In [46]:
def transform_csv_to_df(spark, path, schema):
    """Read every ``*.csv`` file directly inside ``path`` into one DataFrame.

    Args:
        spark: Session whose ``read.csv`` is used to load the files.
        path: Directory to scan (not recursive) for ``*.csv`` files.
        schema: Schema handed to the CSV reader; fields are ``;``-separated.

    Returns:
        The loaded DataFrame without the rows whose ``id`` contains the text
        ``'id'``.

    Raises:
        ValueError: If ``path`` is not a directory or holds no ``*.csv`` file.
    """
    if not os.path.isdir(path):
        raise ValueError(f"{path} não é um diretório válido.")

    csv_files = glob.glob(os.path.join(path, '*.csv'))
    if not csv_files:
        raise ValueError(f"Não foram encontrados arquivos csv em {path}.")

    reader_options = {'sep': ';', 'schema': schema, 'inferSchema': True}
    loaded = spark.read.csv(csv_files, **reader_options)

    # Presumably this removes the header line of each file (the reader is not
    # told about headers) - confirm against the data.
    id_has_header_text = col('id').contains('id')
    return loaded.filter(~id_has_header_text)

In [47]:
def verify_empty_data(df):
    """Print, per column, how many rows hold a missing value.

    What counts as missing depends on the column type:

    * string columns: null, empty string or NaN;
    * float/double columns: null or NaN;
    * every other type: null.

    All counters are computed in one aggregation, so the DataFrame is scanned
    once instead of once per column. Columns without missing values print
    nothing.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The findings are written to standard output.
    """
    checks = []
    for field in df.schema.fields:
        column = col(field.name)
        if isinstance(field.dataType, StringType):
            is_missing = column.isNull() | (column == '') | isnan(column)
            label = "empty/null/none/NaN"
        elif isinstance(field.dataType, (DoubleType, FloatType)):
            # Floating point columns (e.g. ``valor``) can hold NaN as well as null.
            is_missing = column.isNull() | isnan(column)
            label = "null/NaN"
        else:
            is_missing = column.isNull()
            label = "null"
        # count() ignores nulls, so it counts only the rows where is_missing holds.
        counter = count(when(is_missing, 1)).alias(field.name)
        checks.append((field.name, label, counter))

    if not checks:
        return

    totals = df.agg(*[counter for _, _, counter in checks]).first()
    for position, (col_name, label, _) in enumerate(checks):
        missing = totals[position]
        if missing != 0:
            print(f"Column '{col_name}' has {missing} {label} values.")

In [48]:
def correcting_data(df):
    """Round ``valor`` to two decimal places and then make it non-negative.

    Args:
        df: DataFrame with a ``valor`` column.

    Returns:
        A DataFrame whose ``valor`` column is replaced by ``abs(round(valor, 2))``.
    """
    # ``round`` is called with a Column, so it presumably resolves to the Spark
    # function from the wildcard import rather than the Python builtin.
    rounded = df.withColumn("valor", round(col("valor"), 2))
    return rounded.withColumn('valor', expr('abs(valor)'))

In [49]:
def add_state_column(df):
    """Add an ``estado`` column derived from the area code inside ``telefone``.

    The area code (DDD) is the text between the parentheses of ``telefone``.
    Codes missing from the lookup table below are labelled ``'Inválido'``.

    Args:
        df: DataFrame with a ``telefone`` column.

    Returns:
        The DataFrame with the extra ``estado`` column.
    """
    ddd_states = [
        ('20', 'Paraíba'),
        ('21', 'Rio de Janeiro'),
        ('22', 'Mato Grosso'),
        ('23', 'Pernambuco'),
        ('24', 'Rio de Janeiro'),
        ('25', 'Bahia'),
        ('26', 'Minas Gerais'),
        ('27', 'Espírito Santo'),
        ('28', 'Roraima'),
        ('29', 'São Paulo'),
        ('30', 'Maranhão'),
    ]

    # Temporary DDD column: second piece of the phone number once it is split
    # on runs of parentheses.
    df = df.withColumn('DDD', split(df['telefone'], r'[()]+').getItem(1))

    # Build the same when(...).when(...) chain, one link per table entry.
    first_code, first_state = ddd_states[0]
    estado = when(col('DDD') == first_code, first_state)
    for code, state in ddd_states[1:]:
        estado = estado.when(col('DDD') == code, state)
    df = df.withColumn('estado', estado.otherwise('Inválido'))

    # The helper column is not part of the result.
    return df.drop('DDD')

In [50]:
def format_names(df):
    """Split ``nome`` into a first name and a surname and tidy both up.

    ``nome`` keeps only the text before the first space. ``sobrenome``
    receives everything after that space, whatever the number of words (the
    previous implementation kept at most six surname words and silently
    dropped the rest). Both are capitalised with ``initcap``. A missing or
    empty surname becomes ``'Não informado'``.

    Args:
        df: DataFrame with the columns ``id``, ``nome``, ``email``,
            ``data_cadastro``, ``telefone`` and ``estado``.

    Returns:
        A DataFrame with the columns ``id``, ``nome``, ``sobrenome``,
        ``email``, ``data_cadastro``, ``telefone`` and ``estado``, in that order.
    """
    full_name = col("nome")

    # Strip the first word and the single space after it; what is left is the
    # surname. coalesce keeps a null name on the "not informed" path.
    surname = initcap(coalesce(regexp_replace(full_name, r"^[^ ]* ?", ""), lit("")))
    # split() always yields at least one element, so index 0 is always valid.
    first_name = initcap(split(full_name, " ").getItem(0))

    # sobrenome is added before nome is overwritten, so both expressions read
    # the original full name.
    df = df.withColumn("sobrenome", when(surname == "", "Não informado").otherwise(surname))
    df = df.withColumn("nome", first_name)

    # Fixed column order for the result.
    return df.select("id", "nome", "sobrenome", "email", "data_cadastro", "telefone", "estado")

In [51]:
def verify_client_id_existence(spark, df_transactions, df_clients):
    """Append a placeholder client for every unknown ``cliente_id``.

    Every non-null ``cliente_id`` of ``df_transactions`` that has no matching
    ``id`` in ``df_clients`` yields one extra row whose text columns read
    ``'Não localizado'`` and whose ``data_cadastro`` is ``1900-01-01 00:00:00``.
    Null ``cliente_id`` values are skipped, because they would otherwise
    produce a placeholder client with a null ``id``.

    Args:
        spark: Unused; kept so existing calls keep working.
        df_transactions: DataFrame with a ``cliente_id`` column.
        df_clients: DataFrame with the columns ``id``, ``nome``, ``sobrenome``,
            ``email``, ``data_cadastro``, ``telefone`` and ``estado``.

    Returns:
        ``df_clients`` followed by the placeholder rows.
    """
    known_ids = df_clients.select(col('id'))
    referenced_ids = (
        df_transactions
        .select(col('cliente_id').alias('id'))
        .where(col('id').isNotNull())
        .distinct()
    )
    # Left anti join: keep only the referenced ids that match no known client.
    missing_ids = referenced_ids.join(known_ids, on='id', how='leftanti')

    not_found = 'Não localizado'
    placeholders = missing_ids.select(
        col('id'),
        lit(not_found).alias('nome'),
        lit(not_found).alias('sobrenome'),
        lit(not_found).alias('email'),
        lit('1900-01-01 00:00:00').cast('timestamp').alias('data_cadastro'),
        lit(not_found).alias('telefone'),
        lit(not_found).alias('estado'),
    )
    # Columns are matched by name so the result does not depend on their order.
    return df_clients.unionByName(placeholders)

In [52]:
def union_df_in_out(df_transactions_in, df_transactions_out):
    """Stack both transaction DataFrames, tagging each row with its origin.

    Args:
        df_transactions_in: Rows that receive ``tipo_transacao = 'IN'``.
        df_transactions_out: Rows that receive ``tipo_transacao = 'OUT'``.

    Returns:
        The tagged incoming rows followed by the tagged outgoing rows
        (positional union).
    """
    tagged_in, tagged_out = (
        frame.withColumn('tipo_transacao', lit(label))
        for frame, label in ((df_transactions_in, 'IN'), (df_transactions_out, 'OUT'))
    )
    return tagged_in.unionAll(tagged_out)

In [53]:
def connection_database():
    """Open a pyodbc connection to SQL Server using settings from the environment.

    ``load_dotenv()`` is called first, then ``server_name``, ``database_name``,
    ``username`` and ``password`` are read from ``os.environ``.

    Returns:
        The connection object returned by ``pyodbc.connect``.

    Raises:
        KeyError: If any of the four variables is missing. All missing names
            are listed at once.
    """
    # NOTE(review): load_dotenv() is called without override, so a variable
    # already present in the process environment presumably wins over the
    # .env file. On Windows, `username` may collide with the OS-provided
    # USERNAME variable - confirm before relying on it there.
    load_dotenv()

    required = ("server_name", "database_name", "username", "password")
    missing = [name for name in required if name not in os.environ]
    if missing:
        raise KeyError(f"Variáveis de ambiente ausentes: {', '.join(missing)}")

    def quote(value):
        # Brace-quote an ODBC attribute value so characters such as ';' stay
        # part of the value; a literal '}' is escaped by doubling it.
        return "{" + value.replace("}", "}}") + "}"

    server_name = os.environ["server_name"]
    database_name = os.environ["database_name"]
    username = os.environ["username"]
    password = os.environ["password"]

    connection_string = (
        "Driver={ODBC Driver 18 for SQL Server};"
        f"Server=tcp:{server_name},1433;"
        f"Database={database_name};"
        f"Uid={quote(username)};"
        f"Pwd={quote(password)};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )
    return pyodbc.connect(connection_string)

In [54]:
def create_table_clients(conn, df):
    """Create the ``clientes`` table if it is missing and load ``df`` into it.

    When the table already exists nothing is created or inserted.

    Args:
        conn: Open database connection (``cursor()`` and ``commit()`` are used).
        df: Clients DataFrame. Its ``data_cadastro`` column is renamed to
            ``data_hora_cadastro`` before the load so the column names match
            the table definition.

    Returns:
        None.
    """
    create_table_query = '''CREATE TABLE clientes (
                                    id INTEGER PRIMARY KEY,
                                    nome VARCHAR(255),
                                    sobrenome VARCHAR(255),
                                    email VARCHAR(255),
                                    data_hora_cadastro DATETIME,
                                    telefone VARCHAR(255),
                                    estado VARCHAR(255)
                                    );'''

    cursor = conn.cursor()
    try:
        cursor.execute("SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 'clientes'")
        table_exists = cursor.fetchone()[0] != 0
        if not table_exists:
            cursor.execute(create_table_query)
            conn.commit()
    finally:
        # Release the cursor whether or not the statements succeeded.
        cursor.close()

    if table_exists:
        print("A tabela clientes já está no banco de dados!")
        return

    print("Tabela clientes criada com sucesso!")
    # insert_df_into_db builds its INSERT column list from df.columns, so the
    # DataFrame must use the table's column names. The rename is a no-op when
    # the DataFrame has no `data_cadastro` column.
    insert_df_into_db(conn, df.withColumnRenamed("data_cadastro", "data_hora_cadastro"), "clientes")

In [55]:
def insert_df_into_db(conn, df, table_name):
    """Insert every row of ``df`` into ``table_name`` as one transaction.

    The rows are collected to the driver and sent through a single cursor.
    The connection is committed once, after all rows were accepted; on any
    error the message is printed and the whole batch is rolled back, so the
    table is never left partially loaded. The cursor is always closed.

    Args:
        conn: Open database connection.
        df: DataFrame whose column names match the target table's columns.
        table_name: Target table. It is interpolated into the SQL text
            (identifiers cannot be bound as parameters), so it must come from
            trusted code.

    Returns:
        None. Errors are reported on standard output, not raised.
    """
    cursor = conn.cursor()
    print("Inserindo os dados na tabela...")
    try:
        columns = ",".join(df.columns)
        placeholders = ",".join("?" for _ in df.columns)
        statement = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"

        rows = [tuple(row) for row in df.collect()]
        # executemany is skipped for an empty batch; there is nothing to send.
        if rows:
            cursor.executemany(statement, rows)
        conn.commit()
        print("Os dados foram inseridos com sucesso na tabela.")
    except Exception as e:
        print(f"Ocorreu um erro ao inserir os dados na tabela: {e}")
        conn.rollback()
    finally:
        cursor.close()

In [56]:
def create_table_transactions(conn, df, name_table):
    """Create a transactions table if it is missing and load ``df`` into it.

    The table has the columns ``id``, ``cliente_id`` (referencing
    ``clientes (id)``), ``valor`` and ``data_hora``. When the table already
    exists nothing is created or inserted.

    Args:
        conn: Open database connection (``cursor()`` and ``commit()`` are used).
        df: Transactions DataFrame. Its ``data`` column is renamed to
            ``data_hora`` before the load so the column names match the table
            definition.
        name_table: Name of the table. It is interpolated into the CREATE
            statement (identifiers cannot be bound as parameters), so it must
            come from trusted code.

    Returns:
        None.
    """
    create_table_query = f"""CREATE TABLE {name_table} (
                                id INTEGER PRIMARY KEY,
                                cliente_id INTEGER REFERENCES clientes (id),
                                valor DECIMAL(10,2),
                                data_hora DATETIME
                            );"""

    cursor = conn.cursor()
    try:
        # The lookup value is bound as a parameter instead of being formatted
        # into the SQL text.
        cursor.execute(
            "SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = ?",
            (name_table,),
        )
        table_exists = cursor.fetchone()[0] != 0
        if not table_exists:
            cursor.execute(create_table_query)
            conn.commit()
    finally:
        # Release the cursor whether or not the statements succeeded.
        cursor.close()

    if table_exists:
        print(f"A tabela {name_table} já está no banco de dados!")
        return

    print(f"Tabela {name_table} criada com sucesso!")
    # insert_df_into_db builds its INSERT column list from df.columns, so the
    # DataFrame must use the table's column names. The rename is a no-op when
    # the DataFrame has no `data` column.
    insert_df_into_db(conn, df.withColumnRenamed("data", "data_hora"), name_table)

In [57]:
# Pipeline driver: load the CSV folders, report missing values, clean the
# transactions, enrich the clients and display the resulting DataFrames.
try:
    # Stage 1 - read each CSV folder into a DataFrame using the explicit schemas.
    print("Transformando os arquivos CSVs em data frames...")
    df_clients = transform_csv_to_df(spark, clients, clients_schema)
    df_transactions_in = transform_csv_to_df(spark, transactions_in, transactions_schema)
    df_transactions_out = transform_csv_to_df(spark, transactions_out , transactions_schema)
    print("OK")

    # Stage 2 - report (print only) the columns that hold missing values.
    print("Verificando se há dados não informados nas colunas dos DataFrames...")
    verify_empty_data(df_clients)
    verify_empty_data(df_transactions_in)
    verify_empty_data(df_transactions_out)
    print("OK")

    # Stage 3 - round `valor` to two decimals and make it non-negative.
    print("Corrigindo os dados da coluna valor dos DataFrames de transações...")
    df_transactions_in = correcting_data(df_transactions_in)
    df_transactions_out = correcting_data(df_transactions_out)
    print("OK")

    # Stage 4 - add `estado`, split the names, then append placeholder clients
    # for ids that only appear in the transactions. Order matters:
    # format_names selects the `estado` column created by add_state_column.
    print("Formatando o DataFrame de clientes...")
    df_clients = add_state_column(df_clients)
    df_clients = format_names(df_clients)
    df_clients = verify_client_id_existence(spark, df_transactions_in, df_clients)
    df_clients = verify_client_id_existence(spark, df_transactions_out, df_clients)
    print("OK")

    # Stage 5 - stack both transaction sets, tagged IN / OUT.
    # NOTE(review): the result is bound to `dt_transactions` (not `df_...`)
    # and is not used again in this cell - confirm the intended name.
    print("Unindo os dados das transações em um único DataFrame...")
    dt_transactions = union_df_in_out(df_transactions_in, df_transactions_out)
    print("OK")
      
    # Preview of the three DataFrames.
    print("-" * 30)
    print("Transações in")
    df_transactions_in.show()
    print("-" * 30)
    print("Transações out")
    df_transactions_out.show()
    print("-" * 30)
    print("Dados dos clientes")
    df_clients.show()

    # The triple-quoted string below is a bare string expression and is never
    # executed: it holds the disabled database-load step (connect, create
    # `clientes`, then create the two transaction tables).
    '''
    try:
        print("Conectando com o banco de dados...")
        conn = connection_database()
    except Exception:
        print("Não foi possivel se conectar com o banco de dados!")
    else:
        print("Criando tabela de clientes no banco de dados!")
        create_table_clients(conn, df_clients)
        
        df_transactions_in = df_transactions_in.join(df_clients, df_clients.id == df_transactions_in.cliente_id, "leftsemi")
        df_transactions_out = df_transactions_out.join(df_clients, df_clients.id == df_transactions_out.cliente_id, "leftsemi")

        print("Criando as tabelas de transações no banco de dados!")
        create_table_transactions(conn, df_transactions_in, "transactions_in")
        create_table_transactions(conn, df_transactions_out, "transactions_out")
    '''
    
# NOTE(review): any failure above is reduced to a one-line message; the
# traceback is not shown and execution continues.
except Exception as e:
    print(f"Ocorreu o seguinte erro: {e}!")

Transformando os arquivos CSVs em data frames...


                                                                                

OK
Verificando se há dados não informados nas colunas dos DataFrames...


                                                                                

OK
Corrigindo os dados da coluna valor dos DataFrames de transações...
OK
Formatando o DataFrame de clientes...
OK
Unindo os dados das transações em um único DataFrame...
OK
------------------------------
Transações in
+----+----------+-----+-------------------+
|  id|cliente_id|valor|               data|
+----+----------+-----+-------------------+
|3120|       533|  5.0|2021-01-28 23:46:47|
|3119|       533| 25.0|2021-01-28 23:46:47|
|3108|       533|  5.0|2021-01-28 13:47:37|
|3107|       533| 12.5|2021-01-28 13:47:36|
|3106|       533| 12.5|2021-01-28 13:47:36|
|3092|       533| 25.0|2021-01-28 13:02:57|
|3091|       533|  5.0|2021-01-28 13:02:57|
|3079|       533| 25.0|2021-01-23 12:44:31|
|3078|       533|  5.0|2021-01-23 12:44:30|
|3069|       574| 20.0|2021-01-23 00:29:52|
|3066|        74|  7.8|2021-01-22 00:11:14|
|3061|        74| 3.82|2021-01-20 23:05:16|
|3060|         4| 5.58|2021-01-20 23:05:16|
|3046|       370| 50.0|2021-01-20 19:07:00|
|3030|       570| 20.0|2021-01-15



+---+----------+--------------------+--------------------+-------------------+----------------+--------------+
| id|      nome|           sobrenome|               email|      data_cadastro|        telefone|        estado|
+---+----------+--------------------+--------------------+-------------------+----------------+--------------+
| 55|  Edmilson|            Da Silva|edmilson-da-silva...|2019-08-30 00:54:33|+55(22)2922-2626|   Mato Grosso|
| 78|    Maxson|    Barros Do Santos|maxson-barros-do-...|2019-09-10 02:03:42|+55(22)2126-2529|   Mato Grosso|
| 61|     Bruno|       Cesar E Silva|bruno-cesar-e-sil...|2019-08-30 01:05:21|+55(23)2528-2729|    Pernambuco|
|106|  Dernival| Passos De Amarantes|dernival-passos-d...|2019-09-27 02:40:14|+55(29)2927-2322|     São Paulo|
|107|      José|      Rubian De Goes|jose-rubian-de-go...|2019-09-28 13:09:43|+55(22)2023-2620|   Mato Grosso|
|108|  Angelica|Dos Santos De Morais|angelica-dos-sant...|2019-09-28 14:48:16|+55(20)2521-3030|       Paraíba|
|

                                                                                