In [None]:
"""Cria schemas"""

In [1]:
import csv
import re
from collections import defaultdict, OrderedDict
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, TimestampType, DoubleType

def clean_structure(structure):
    """Collapse repeated comma separators in a field-definition string.

    Any run of two or more commas (each optionally surrounded by whitespace)
    that directly follows a word character or a quote is replaced by a single
    comma. Afterwards every remaining ``',,`` sequence is reduced to ``',``.

    Args:
        structure: Text to normalise.

    Returns:
        The normalised text.
    """
    # The lookbehind keeps the character before the run out of the match, so
    # only the commas and their surrounding whitespace are substituted.
    repeated_commas = re.compile(r'(?<=[\w\d\'"])((?:\s*,\s*){2,})')
    single_comma = repeated_commas.sub(',', structure)
    return single_comma.replace("',,", "',")

def generate_schemas(file_name):
    """Print a ``StructType([...])`` listing for every table described in a CSV file.

    The file is read as a ``;``-delimited CSV with a header row. For each data
    row the ``Tabela`` cell (lower-cased) names the table and the ``estrutura``
    cell holds the text of one field definition. Field texts are normalised,
    de-duplicated per table (first occurrence wins, order preserved) and then
    printed grouped by table together with the number of distinct fields.

    Args:
        file_name: Path of the CSV file; it is opened as UTF-8.

    Returns:
        None. The listing is written to standard output.
    """
    schemas = defaultdict(OrderedDict)

    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(file_name, mode='r', encoding="utf-8", newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=';', quotechar='"')

        for row in csv_reader:
            # DictReader fills cells missing from a short row with None, so
            # fall back to '' instead of calling a str method on None.
            table_name = (row.get('Tabela') or '').lower()
            column_structure = (row.get('estrutura') or '').strip()

            # Separator normalisation: drop every comma, then put one back
            # after the quoted name ("' " -> "',") and one before the trailing
            # "True)" flag.
            # NOTE(review): because all commas are removed first, a type that
            # takes comma-separated arguments would lose them -- confirm the
            # input never contains such types.
            column_structure = (
                column_structure
                .replace(",", "")
                .replace("' ", "',")
                .replace("() True)", "(), True)")
            )

            if column_structure:
                # Keyed by the field text itself so repeated rows collapse;
                # defaultdict creates the per-table OrderedDict on first use.
                schemas[table_name][column_structure] = column_structure

    for table_name, fields in schemas.items():
        column_count = len(fields)
        print(f"Nome da Tabela: {table_name}\n")
        # Fields are joined with ", \" + newline, i.e. with line continuations.
        formatted_fields = ', \\\n'.join(fields.values())
        formatted_schema = f"StructType([\n{formatted_fields}])"
        print(f"Critério de Agrupamento: {table_name}, Contagem de Colunas: {column_count}")
        print(f"Colunas:\n{formatted_schema}\n")

# CSV that lists the tables and their field definitions.
# NOTE(review): absolute, machine-specific Windows path -- the cell only runs
# where this file exists; consider a configurable, relative location.
file_name = r"C:\mlflowjobs\tabelas_teste.csv"
# Prints the schema listing for every table found in the file.
generate_schemas(file_name)


Nome da Tabela: ft_alerta_operacao_horario_celular_diferente

Critério de Agrupamento: ft_alerta_operacao_horario_celular_diferente, Contagem de Colunas: 9
Colunas:
StructType([
StructField('id_ft_alerta_operacao',LongType(), True), \
StructField('id_url',StringType(), True), \
StructField('id_dim_ambiente',LongType(), True), \
StructField('id_dim_colaborador',LongType(), True), \
StructField('status',IntegerType(), True), \
StructField('detalhe_alerta_operacao',StringType(), True), \
StructField('modelo_celular',StringType(), True), \
StructField('horario_celular',TimestampType(), True), \
StructField('horario_atomico',TimestampType(), True)])

Nome da Tabela: ft_alerta_operacao_nivel_bateria

Critério de Agrupamento: ft_alerta_operacao_nivel_bateria, Contagem de Colunas: 7
Colunas:
StructType([
StructField('id_ft_alerta_operacao',LongType(), True), \
StructField('id_url',StringType(), True), \
StructField('id_dim_ambiente',LongType(), True), \
StructField('id_dim_colaborador',LongTyp

In [None]:
"""Cria Tabelas"""

In [None]:
import csv
import re
from collections import defaultdict, OrderedDict
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

def clean_structure(structure):
    """Normalise comma separators and strip the ``StructField(`` prefix.

    Any run of two or more commas (each optionally surrounded by whitespace)
    that directly follows a word character or a quote is replaced by a single
    comma. Then every ``',,`` is reduced to ``',`` and every occurrence of the
    literal text ``StructField(`` is removed.

    Args:
        structure: Text to normalise.

    Returns:
        The normalised text. The closing parenthesis that belonged to a removed
        ``StructField(`` is left in place.
    """
    repeated_commas = re.compile(r'(?<=[\w\d\'"])((?:\s*,\s*){2,})')
    text = repeated_commas.sub(',', structure)

    # Literal substitutions, applied in this order.
    for old, new in (("',,", "',"), ("StructField(", "")):
        text = text.replace(old, new)

    return text

def create_table(spark, table_name, schema):
    """Issue a ``CREATE OR REPLACE TEMPORARY VIEW`` statement for ``table_name``.

    Args:
        spark: Object whose ``sql`` method receives the statement text.
        table_name: Name interpolated into the view name and into the path option.
        schema: Accepted but not used by this function.

    Returns:
        None.

    NOTE(review): ``schema`` is ignored and the path is a placeholder
    (``/path_to_delta_folder/...``); the combination of ``USING``/``OPTIONS``
    with ``AS SELECT`` should be confirmed against the Spark SQL grammar.
    """
    statement = (
        f"CREATE OR REPLACE TEMPORARY VIEW {table_name} USING delta\n"
        f"OPTIONS ('path', '/path_to_delta_folder/{table_name}')\n"
        "AS SELECT * FROM VALUES (1)"
    )
    spark.sql(statement)

def generate_tables(file_name):
    """Read table definitions from a CSV file and call ``create_table`` for each table.

    A Spark session named ``SchemaGenerator`` is obtained first. The file is
    read as a ``;``-delimited CSV with a header row: the ``Tabela`` cell
    (lower-cased) names the table and the ``estrutura`` cell, after
    ``clean_structure``, is one field text. Field texts are de-duplicated per
    table in first-seen order. For every table the listing is printed and
    ``create_table`` is called with the formatted schema text. The session is
    stopped when the function exits, whether or not an error occurred.

    Args:
        file_name: Path of the CSV file; it is opened as UTF-8.

    Returns:
        None.
    """
    spark = SparkSession.builder.appName("SchemaGenerator").getOrCreate()

    try:
        schemas = defaultdict(OrderedDict)

        # newline='' lets the csv module do its own line-ending handling, as
        # its documentation requires for file objects.
        with open(file_name, mode='r', encoding="utf-8", newline='') as csv_file:
            csv_reader = csv.DictReader(csv_file, delimiter=';', quotechar='"')

            for row in csv_reader:
                # DictReader fills cells missing from a short row with None,
                # so fall back to '' instead of calling a str method on None.
                table_name = (row.get('Tabela') or '').lower()
                column_structure = (row.get('estrutura') or '').strip()

                column_structure = clean_structure(column_structure)

                if column_structure:
                    # Keyed by the field text itself so repeated rows collapse;
                    # defaultdict creates the per-table OrderedDict on first use.
                    schemas[table_name][column_structure] = column_structure

        for table_name, fields in schemas.items():
            column_count = len(fields)
            # Fields are joined with ", \" + newline, i.e. with line continuations.
            formatted_fields = ', \\\n'.join(fields.values())
            formatted_schema = f"StructType([\n{formatted_fields}])"
            print(f"Nome da Tabela: {table_name}\n")
            print(f"Critério de Agrupamento: {table_name}, Contagem de Colunas: {column_count}")
            print(f"Colunas:\n{formatted_schema}\n")

            create_table(spark, table_name, formatted_schema)
    finally:
        # Stop the session even when reading the file or creating a table
        # fails, so it is not left running after an error.
        spark.stop()

# CSV that lists the tables and their field definitions.
# NOTE(review): absolute, machine-specific Windows path -- the cell only runs
# where this file exists; consider a configurable, relative location.
file_name = r"C:\mlflowjobs\tabelas_teste.csv"
# Prints each table's listing and calls create_table for it.
generate_tables(file_name)
