# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%stop_session

In [None]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2
%%configure
{
    "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog.warehouse= --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO",
    "--datalake-formats": "iceberg",
    "--additional-python-modules": "requests_ntlm,openpyxl==3.0.10",
    "--JOB_NAME": "sharepoint-sap",
    "--path":"",
    "--config":"",
    "--enable-metrics": "true",
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-spark-ui": "true",
    "--spark-event-logs-path":""
}

####  Run this cell to set up and start your interactive session.


In [72]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import requests
from requests_ntlm import HttpNtlmAuth
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError, ClientError
from urllib.parse import urlparse
import os
from datetime import date,datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import s3fs
import json
import ast

# Module-level S3 filesystem handle (non-anonymous access); used further down
# to open the Excel files with `fs.open(...)`.
fs = s3fs.S3FileSystem(anon=False)
# Do not truncate the column list when pandas displays a DataFrame.
pd.set_option('display.max_columns', None)  

# Obtain (or reuse) the Spark context and wrap it in a Glue context; `spark`
# is the session the functions below use to run SQL.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# Both `JOB_NAME` and `config` must be present in the job arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME','config'])
job.init(args['JOB_NAME'], args)
# Prefix for the configuration files: "config.json" and the per-file schema
# mapping names are appended to it by plain string concatenation later on, so
# it presumably needs to end with "/" — TODO confirm against the job arguments.
par_config = args['config']
    
def gerar_lista_meses(data, intervalo):
    """Return the month of *data* and the *intervalo* preceding months.

    :param data: Reference date as a 'YYYY-MM-DD' string.
    :param intervalo: How many months to go back from the reference month.
    :return: List of 'YYYYMM' strings, newest first, with ``intervalo + 1``
        entries (empty when ``intervalo`` is negative).
    :raises ValueError: If *data* does not match the '%Y-%m-%d' format.
    """
    referencia = datetime.strptime(data, "%Y-%m-%d")

    # Step back one month at a time, starting at offset 0 (the reference month).
    return [
        (referencia - relativedelta(months=recuo)).strftime("%Y%m")
        for recuo in range(intervalo + 1)
    ]

def read_json_from_s3(s3_uri):
    """Download an object from S3 and parse it as JSON.

    :param s3_uri: Object location, e.g. 's3://bucket-name/key/to/object.json'.
    :return: The parsed JSON document.
    :raises ValueError: If *s3_uri* has no bucket or no key (for example when
        the configuration prefix is empty and only a file name is passed).
    :raises Exception: Any error raised while fetching, decoding or parsing the
        object is printed and then re-raised unchanged.
    """
    # Split the URI into bucket (network location) and key (path without the
    # leading slash).
    parsed_uri = urlparse(s3_uri)
    bucket = parsed_uri.netloc
    key = parsed_uri.path.lstrip('/')

    # Fail early with a readable message instead of letting the S3 client
    # reject an empty Bucket/Key parameter.
    if not bucket or not key:
        raise ValueError(
            f"URI do S3 inválido (bucket ou chave ausente): {s3_uri!r}"
        )

    s3 = boto3.client('s3')

    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        content = response['Body'].read().decode('utf-8')
        return json.loads(content)
    except Exception as e:
        # Log for the job output, then let the caller decide what to do.
        print(f"Erro ao ler o arquivo JSON do S3: {str(e)}")
        raise

def arquivo_existe_no_s3(s3_uri):
    """
    Check whether an object exists in Amazon S3, given its S3 URI.

    :param s3_uri: S3 URI in the form 's3://bucket-name/key/to/object'
    :return: True if the object exists, False when S3 answers with a 404
    :raises ValueError: If the URI scheme is not 's3'
    :raises ClientError: For any S3 error other than a 404
    """
    partes = urlparse(s3_uri)
    if partes.scheme != 's3':
        raise ValueError("URI inválido. Deve começar com 's3://'")

    nome_bucket, chave = partes.netloc, partes.path.lstrip('/')
    cliente = boto3.client('s3')

    try:
        # HEAD request: fetches only metadata, enough to prove existence.
        cliente.head_object(Bucket=nome_bucket, Key=chave)
    except ClientError as erro:
        if erro.response['Error']['Code'] != '404':
            # Anything other than "not found" is propagated to the caller.
            raise
        return False
    return True
        

# Converters keyed by the "type" value used in the schema mapping. Each one
# takes a Series and returns the converted Series; values that cannot be
# parsed are replaced with a neutral default (0, 0.0, '' or the Unix epoch).
_CONVERSORES_DE_TIPO = {
    "int": lambda s: pd.to_numeric(s, errors='coerce').fillna(0).astype(int),
    "float": lambda s: pd.to_numeric(s, errors='coerce').fillna(0.0).astype(float),
    "str": lambda s: s.astype(str).replace('nan', '').fillna(''),
    "date": lambda s: pd.to_datetime(s, errors='coerce').dt.date.fillna(
        pd.Timestamp(0).date()
    ),
    "timestamp": lambda s: pd.to_datetime(s, errors='coerce').fillna(pd.Timestamp(0)),
    # Day counts relative to 1899-12-30 (used here for Excel serial dates).
    "timestamp_excel": lambda s: (
        pd.to_datetime('1899-12-30') + pd.to_timedelta(s, unit='D')
    ).fillna(pd.Timestamp(0)),
}


def process_dataframe(df, schema_map):
    """Apply the drop/rename/cast rules of *schema_map* to *df* in place.

    :param df: pandas DataFrame to adjust. It is modified in place and the
        same object is returned.
    :param schema_map: ``None`` (no-op) or a dict mapping a source column name
        to a rule dict with the keys ``remove`` ("True" drops the column),
        ``rename`` (new name, or "-" to keep the current one) and ``type``
        ("int", "float", "str", "date", "timestamp", "timestamp_excel", or
        "-" for no conversion). Unrecognised types are ignored.
    :return: *df* itself.
    :raises KeyError: If a rule is missing one of its keys, or a column that
        must be converted is not present in *df*.
    """
    if schema_map is None:
        return df

    for column, rules in schema_map.items():
        if rules['remove'] == "True":
            # Columns absent from the frame are tolerated when dropping.
            df.drop(columns=[column], inplace=True, errors='ignore')
            continue

        # Work on the renamed column when a rename is requested, otherwise on
        # the original one. Previously the literal "-" was used as the column
        # name in the second case, which raised KeyError.
        alvo = column
        if rules['rename'] != "-":
            df.rename(columns={column: rules['rename']}, inplace=True)
            alvo = rules['rename']

        conversor = _CONVERSORES_DE_TIPO.get(rules['type'])
        if conversor is not None:
            # Assign the converted Series back instead of calling
            # fillna(inplace=True) on a column selection, which does not
            # reliably update the frame in recent pandas versions.
            df[alvo] = conversor(df[alvo])

    return df

def processa_lake(metodo,snapshot,catalog_name,database_name,table_name,match_id,mes,df):
    """Load *df* into the table ``catalog_name.database_name.table_name``.

    The frame is registered as the temporary view ``tmp_<table_name>``; the
    database and the table (Iceberg, format-version 2, created from the view's
    contents) are created when they do not exist yet, and then the load
    strategy selected by *metodo* is executed through ``spark.sql``.

    :param metodo: Load strategy:
        'merge-full' merges the view into the table (insert/update/delete),
        'drop-insert' truncates the table and inserts the view,
        'delete-insert-mes' deletes the rows where ``match_id = mes`` and
        inserts the view.
    :param snapshot: When equal to ``False``, the table property
        'history.expire.max-snapshots' is set to '10'.
    :param catalog_name: Catalog part of the table identifier.
    :param database_name: Database part of the table identifier.
    :param table_name: Table part of the identifier; also used to name the
        temporary view.
    :param match_id: Column used as join key ('merge-full') or as the filter
        column ('delete-insert-mes').
    :param mes: Value compared with *match_id* in 'delete-insert-mes'; it is
        interpolated into the SQL text without quotes.
    :param df: DataFrame accepted by ``spark.createDataFrame`` (the caller in
        this file passes a pandas DataFrame).
    :return: True for the three known strategies, False for any other value of
        *metodo* (the database/table creation has already run by then).
    """
    temp_table_name = f"tmp_{table_name}"
    spark_df = spark.createDataFrame(df)
    spark_df.createOrReplaceTempView(temp_table_name)
    # Create the database if it does not exist
    query_database = f"""CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}"""
    spark.sql(query_database)
    print(f"Criando DataBase caso nao exista = {query_database}")

    # Create the table if it does not exist. This is a CREATE TABLE ... AS
    # SELECT, so on the very first run the table is created already populated
    # with the contents of the temporary view.
    query_table = f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{database_name}.{table_name}
    USING iceberg 
    TBLPROPERTIES ("format-version"="2")
    AS SELECT * FROM {temp_table_name}"""
    spark.sql(query_table)  
    print(f"Criando Tabela caso nao exista = {catalog_name}.{database_name}.{table_name}")

    # NOTE(review): this only sets a table property; no snapshot-expiration
    # procedure is called here, despite the wording of the log message.
    if snapshot == False:
        query_snapshot = f"""ALTER TABLE {catalog_name}.{database_name}.{table_name} SET TBLPROPERTIES ('history.expire.max-snapshots' = '10')"""
        spark.sql(query_snapshot)
        print(f"Removendo Snapshot tabela {catalog_name}.{database_name}.{table_name}")

    if metodo == 'merge-full':
        # Every column except the key takes part in the change detection.
        colunas = [coluna for coluna in df.columns if coluna != match_id]
        condicao_select = " ".join([f"b.{c} as {c}," for c in colunas])
        condicao_where = " AND ".join([f"a.{c} = b.{c}" for c in colunas])
        
        # The `changes` CTE full-outer-joins the table (a) with the view (b)
        # and tags each differing row: 'D' = only in the table, 'I' = only in
        # the view, 'U' = in both. The MERGE then applies those tags.
        # NOTE(review): with plain `=`, a NULL on either side makes the
        # comparison NULL, which coalesce(..., false) turns into "different";
        # rows holding NULLs therefore look changed on every run — confirm
        # whether that is intended.
        # NOTE(review): if the frame has no column besides `match_id`,
        # `condicao_where` is empty and the generated SQL is `coalesce((), false)`.
        query_merge = f"""
        WITH changes AS
        (SELECT
        COALESCE(b.{match_id}, a.{match_id}) AS {match_id}, {condicao_select}
        CASE WHEN b.{match_id} IS NULL THEN 'D' WHEN a.{match_id} IS NULL THEN 'I' ELSE 'U' END as cdc
        FROM {catalog_name}.{database_name}.{table_name} a
        FULL OUTER JOIN {temp_table_name} b ON a.{match_id} = b.{match_id}
        WHERE NOT coalesce(({condicao_where}), false))
        MERGE INTO {catalog_name}.{database_name}.{table_name}
        USING changes
        ON {catalog_name}.{database_name}.{table_name}.{match_id} = changes.{match_id}
        WHEN MATCHED AND changes.cdc = 'D' THEN DELETE
        WHEN MATCHED AND changes.cdc = 'U' THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
        print(f"Fazendo Merge da tabela tabela {catalog_name}.{database_name}.{table_name}")
        spark.sql(query_merge)
        return True
    
    elif metodo == 'drop-insert':
        # Full reload: empty the table, then insert everything from the view.
        query_truncate= f"""TRUNCATE TABLE {catalog_name}.{database_name}.{table_name}"""
        spark.sql(query_truncate)
        print(f"Truncando tabela {catalog_name}.{database_name}.{table_name}")
        query_insert = f"""INSERT INTO {catalog_name}.{database_name}.{table_name} SELECT * FROM {temp_table_name}"""
        spark.sql(query_insert)
        print(f"Insert tabela {catalog_name}.{database_name}.{table_name}")
        return True
    
    elif metodo == 'delete-insert-mes':
        # Partial reload: remove the rows of the given month, then insert the
        # view. `mes` is placed in the SQL unquoted, so the comparison relies
        # on it being a valid SQL literal (e.g. a numeric 'YYYYMM' value).
        query_delete= f"""DELETE FROM {catalog_name}.{database_name}.{table_name} WHERE {match_id} = {mes}"""
        spark.sql(query_delete)
        print(f"DELETANDO tabela {catalog_name}.{database_name}.{table_name} WHERE {match_id} = {mes}")
        query_insert = f"""INSERT INTO {catalog_name}.{database_name}.{table_name} SELECT * FROM {temp_table_name}"""
        spark.sql(query_insert)
        print(f"Insert tabela {catalog_name}.{database_name}.{table_name}")
        return True

    else:
        # Unknown strategy: nothing is loaded.
        return False
    
    # Unreachable: every branch above returns.
    return True

def get_tipo_by_valor(valor, data):
    """Return the 'tipo' of the first record in *data* whose 'valor' equals *valor*.

    :param valor: Value to look for in each record's 'valor' key.
    :param data: Iterable of dicts holding the keys 'valor' and 'tipo'.
    :return: The matching record's 'tipo', or None when nothing matches.
    """
    correspondencias = (
        registro['tipo'] for registro in data if registro['valor'] == valor
    )
    return next(correspondencias, None)

def extrair_tipo(bigint, data):
    """Look up the 'tipo' associated with the leading digit of *bigint*.

    :param bigint: Value whose text form starts with the digit to look up
        (for example an account number).
    :param data: Records with 'valor'/'tipo' keys, as expected by
        ``get_tipo_by_valor``.
    :return: The matching 'tipo', or None when there is no match or when the
        text form of *bigint* does not start with a digit (NaN, None, empty
        string, negative numbers).
    """
    primeiro_caractere = str(bigint)[:1]

    # Values such as NaN, None, '' or '-5' have no leading digit; treat them
    # like any other "no match" instead of raising from int()/indexing.
    if not primeiro_caractere.isdecimal():
        return None

    return get_tipo_by_valor(int(primeiro_caractere), data)

def transformacao_dataframe(df_trans,function):
    """Apply the named transformation to a raw sheet.

    :param df_trans: pandas DataFrame as read from the Excel sheet. It is not
        modified; the 'transformacao_sap_balancete' path returns a new frame.
    :param function: 'default' returns *df_trans* untouched.
        'transformacao_sap_balancete' keeps the rows whose 'Unnamed: 2' equals
        2900 and returns the columns conta_contabil_10 ('Unnamed: 4'),
        desc_conta_contabil ('Unnamed: 7'), valor ('Unnamed: 10'),
        valor_1 ('Unnamed: 12') and tipo. 'tipo' is resolved from the leading
        digit of conta_contabil_10 through a lookup built from the remaining
        rows whose 'Unnamed: 4' text starts with a single-character token
        (e.g. "1 Ativo").
    :return: The transformed DataFrame.
    :raises ValueError: If *function* is not one of the names above, or a
        single-character lookup token is not a number.
    :raises KeyError: If an expected 'Unnamed: N' column is missing.
    """
    if function == 'default':
        return df_trans

    if function != 'transformacao_sap_balancete':
        # Previously this fell through to the final return and failed with an
        # UnboundLocalError.
        raise ValueError(f"Funcao de transformacao desconhecida: {function!r}")

    # --- Lookup "leading digit -> tipo", taken from the non-2900 rows -------
    filtro_tipo = (df_trans['Unnamed: 2'] != 2900) & (df_trans['Unnamed: 4'] != '')
    data = []
    for texto in df_trans.loc[filtro_tipo, 'Unnamed: 4']:
        if not isinstance(texto, str):
            continue
        # "1 Ativo" -> valor "1", tipo "Ativo". Only single-character first
        # tokens are lookup entries; tipo is None when there is no space.
        valor, separador, tipo = texto.partition(' ')
        if len(valor) == 1:
            data.append({'valor': int(valor), 'tipo': tipo if separador else None})

    # --- Account rows ------------------------------------------------------
    coluna_remove = [
        "Unnamed: 0", "Unnamed: 1", "Unnamed: 2", "Unnamed: 3", "Unnamed: 5",
        "Unnamed: 6", "Unnamed: 8", "Unnamed: 9", "Unnamed: 11", "Unnamed: 13",
        "Unnamed: 14", "Unnamed: 15", "Unnamed: 16", "Unnamed: 17",
    ]
    # drop() returns a new frame, so the assignments below never touch a
    # slice of the input.
    df_contas = df_trans.loc[df_trans['Unnamed: 2'] == 2900].drop(columns=coluna_remove)
    df_contas['tipo'] = df_contas['Unnamed: 4'].apply(lambda x: extrair_tipo(x, data))
    df_contas = df_contas.rename(columns={
        "Unnamed: 4": "conta_contabil_10",
        "Unnamed: 7": "desc_conta_contabil",
        "Unnamed: 10": "valor",
        "Unnamed: 12": "valor_1",
    })

    # NOTE(review): the previous version ended with `astype({...})` whose
    # result was discarded, so no cast was ever applied. It is left out here
    # to keep the output dtypes unchanged; assign the result of astype if a
    # cast at this point is actually wanted.
    return df_contas






In [None]:
def _ler_planilha_s3(s3_path_file, file, sheet_name, skiprows):
    """Read one Excel sheet from S3; return None (after logging) when the
    object does not exist or cannot be parsed."""
    if not arquivo_existe_no_s3(s3_path_file):
        print(f" O arquivo {file} nao existe no S3")
        return None

    with fs.open(s3_path_file, 'rb') as f:
        try:
            return pd.read_excel(f, sheet_name=sheet_name, skiprows=skiprows, engine='openpyxl')
        except Exception as e:
            # Best effort: an unreadable workbook is reported (with the
            # cause) and skipped rather than aborting the whole run.
            print(f" O arquivo {file} fora de formato: {e}")
            return None


# The job configuration is a JSON list with one entry per file to load.
list_params = read_json_from_s3(par_config+"config.json")

for params in list_params:
    # Only entries whose "active" flag equals True are processed (same
    # equality test as before, so e.g. the string "True" is still skipped).
    if params['active'] != True:
        continue

    file = params['file']
    s3_path = params['s3_path']
    match_id = params['match_id']
    data = params['data']
    if data == 'CURRENT_DATE':
        data = datetime.now().strftime('%Y-%m-%d')

    intervalo = int(params['intervalo'])
    catalog_name = params['catalog_name']
    database_name = params['database_name']
    table_name = params['table_name']
    metodo = params['metodo']
    sheet_name = params['sheet_name']
    skiprows = int(params['skiprows'])
    function = params['function']
    snapshot = params['snapshot']

    # Optional per-file schema mapping, stored next to config.json under the
    # file's base name. Any failure to load it means "no mapping".
    try:
        file_schema_config = par_config+file.split('.')[0]+".json"
        schema_map = read_json_from_s3(file_schema_config)
    except Exception:
        schema_map = None
        print(f"Nao possui arquivo de mapeamento")

    if metodo in ["merge-full","drop-insert"]:
        # A single workbook replaces / is merged into the whole table.
        df_excel = _ler_planilha_s3(s3_path+file, file, sheet_name, skiprows)
        if df_excel is None:
            continue

        df = process_dataframe(df_excel, schema_map)
        processa_lake(metodo, snapshot, catalog_name, database_name, table_name, match_id, '', df)

    elif metodo in ["delete-insert-mes"]:
        # One workbook per month, named "<YYYYMM>_<file>"; each month found
        # replaces that month's rows in the table.
        for mes in gerar_lista_meses(data, intervalo):
            s3_path_file = s3_path+mes+'_'+file
            print(s3_path_file)

            df_excel = _ler_planilha_s3(s3_path_file, file, sheet_name, skiprows)
            if df_excel is None:
                continue

            df_excel = transformacao_dataframe(df_excel, function)
            df = process_dataframe(df_excel, schema_map)
            # Tag the rows on the frame that is actually loaded, instead of
            # relying on `df` and `df_excel` being the same object.
            df['mes'] = mes
            processa_lake(metodo, snapshot, catalog_name, database_name, table_name, match_id, mes, df)

    else:
        print(f"Metodo Inexistente")

