### Configurações PySpark

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

import pandas as pd

def write_to_postgresl(df, tb_name=None, write_mode='None'):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC and report timing.

    The connection is taken from the module-level ``url`` and ``properties``
    names, which must be defined before this function is called.

    Args:
        df: DataFrame to persist. It must expose ``write.jdbc``, ``count()``
            and ``columns``.
        tb_name: Target table name (may be schema-qualified). Required.
        write_mode: Save mode forwarded to ``df.write.jdbc`` (for example
            ``'append'`` or ``'overwrite'``). Required. The historical default
            is the *string* ``'None'``; it is kept for signature compatibility
            and treated the same as ``None``, i.e. "not provided".

    Returns:
        A summary string with the elapsed write time, the row count and the
        column count of ``df``.

    Raises:
        ValueError: If ``tb_name`` or ``write_mode`` was not provided.
        Exception: Any error raised by the JDBC write is printed and then
            re-raised, so a failed load is never reported as a success.
    """
    from datetime import datetime

    # Validate before touching the DataFrame so bad calls fail immediately
    # instead of after an expensive Spark job.
    if tb_name is None:
        raise ValueError('Informe o nome da tabela')
    if write_mode is None or write_mode == 'None':
        raise ValueError('Informe o mode de escrita: append ou overwrite')

    start_time = datetime.now()
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
        # Propagate the failure: returning the summary below would make the
        # failed write look like a successful one.
        raise
    total_time = str(datetime.now() - start_time)

    # The row count is computed after the timer stops, so it is not part of
    # the reported write time.
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



    # Criando a sessão do Spark
# Get (or create) the Spark session used by every cell below. The spark.jars
# entry points at the PostgreSQL JDBC driver jar used for the JDBC writes.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    # Executor / driver sizing.
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    # Shuffle partitions and default parallelism are kept at the same value.
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)


import os

# JDBC connection settings read by write_to_postgresl.
# Each value can be overridden through an environment variable so credentials
# do not have to live in the notebook; the literals are only the fallbacks that
# preserve the previous behaviour when nothing is set.
# NOTE(review): the password fallback is still a secret committed to source -
# remove it once the environment variables are configured everywhere.
url = os.environ.get("OMOP_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("OMOP_JDBC_USER", "postgres"),
    "password": os.environ.get("OMOP_JDBC_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

### Configurações de exibição (Spark e Pandas)

In [5]:
# Spark runtime settings for DataFrame display in the notebook: the three
# eagerEval keys plus the maxToStringFields debug limit.
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(_conf_key, _conf_value)

# pandas: None removes the display limit for both columns and rows.
for _pd_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_pd_option, None)

### Lendo dados enriquecidos

In [6]:
# Load the enriched base dataset (Parquet) that the following cells transform.
df_input = spark.read.parquet(
    '/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/scripts_omop_tb/basefinal_tb_limp_enriched'
)

### Visit Occurrence

In [8]:
# Build the visit_occurrence table from the input records.
_prenatal_month = F.col('mesprenat_sinasc')
_birth_date = F.col('dtnasc_sinasc')

# Start date rules, evaluated in this order:
#   month <= 9           -> birth date moved back by (9 - month) months
#   month == 99 or null  -> placeholder date '2099-12-31'
#   month > 9            -> first day of the birth month
_visit_start = (
    F.when(_prenatal_month <= 9,
           F.add_months(_birth_date, -(9 - _prenatal_month.cast('int'))))
    .when((_prenatal_month == 99) | (_prenatal_month.isNull()), F.lit('2099-12-31'))
    .when(_prenatal_month > 9, F.trunc(_birth_date, 'month'))
)

# End date: the placeholder is carried over unchanged; any other start date
# gets two days added.
_visit_end = (
    F.when(_visit_start == '2099-12-31', _visit_start)
    .otherwise(F.date_add(_visit_start, 2))
)

# Final projection, in the target column order. Columns with no source data
# are emitted as typed NULLs.
# NOTE(review): 9202 and 32879 are presumably OMOP concept ids - confirm
# against the vocabulary in use.
df_visit_occurrence = df_input.select(
    F.col('visit_occurrence_id').cast('integer'),
    F.col('person_id').cast('integer'),
    F.lit(9202).cast('integer').alias('visit_concept_id'),
    _visit_start.cast('date').alias('visit_start_date'),
    F.lit(None).cast('timestamp').alias('visit_start_datetime'),
    _visit_end.cast('date').alias('visit_end_date'),
    F.lit(None).cast('timestamp').alias('visit_end_datetime'),
    F.lit(32879).cast('integer').alias('visit_type_concept_id'),
    F.lit(None).cast('integer').alias('provider_id'),
    # care_site_id is filled from the codmunres_sinasc column.
    F.col('codmunres_sinasc').cast('integer').alias('care_site_id'),
    F.lit(None).cast('string').alias('visit_source_value'),
    F.lit(None).cast('integer').alias('visit_source_concept_id'),
    F.lit(None).cast('integer').alias('admitted_from_concept_id'),
    F.lit(None).cast('string').alias('admitted_from_source_value'),
    F.lit(None).cast('integer').alias('discharged_to_concept_id'),
    F.lit(None).cast('string').alias('discharged_to_source_value'),
    F.lit(None).cast('integer').alias('preceding_visit_occurrence_id'),
)


# Row count of the result, displayed as the cell output.
df_visit_occurrence.count()

16590554

## Salvando no PostgreSQL

In [9]:
# Append the visit_occurrence rows to the PostgreSQL table; the returned
# summary string is shown as the cell output.
write_to_postgresl(
    df_visit_occurrence,
    tb_name='tuberculose.visit_occurrence',
    write_mode='append',
)

                                                                                

'Total time: 0:06:01.945198 and - Total rows: 16590554 - Total columns: 17'