# Configurações Pyspark

In [1]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

def write_to_postgresl(df, tb_name=None, write_mode=None, jdbc_url=None, jdbc_properties=None):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC and report the result.

    Parameters
    ----------
    df : DataFrame
        Data to write. Must expose ``write.jdbc(...)``, ``count()`` and ``columns``.
    tb_name : str
        Target table name (for example ``'schema.table'``). Required.
    write_mode : str
        Spark save mode, e.g. ``'append'`` or ``'overwrite'``. Required. The
        legacy placeholder string ``'None'`` is treated the same as ``None``.
    jdbc_url : str, optional
        JDBC URL. When omitted, the notebook-level ``url`` variable is used.
    jdbc_properties : dict, optional
        JDBC connection properties. When omitted, the notebook-level
        ``properties`` variable is used.

    Returns
    -------
    str
        Summary with the elapsed write time, the row count and the column count.

    Raises
    ------
    ValueError
        If ``tb_name`` or ``write_mode`` is missing.
    Exception
        Any error raised by the JDBC write is printed and then re-raised, so a
        failed write is never reported as a success.
    """
    # Validate first: a bad call must fail before any Spark job is launched.
    if tb_name is None:
        raise ValueError('Informe o nome da tabela')
    if write_mode is None or write_mode == 'None':
        raise ValueError('Informe o mode de escrita: append ou overwrite')

    from datetime import datetime

    # Fall back to the notebook globals only when no explicit override is given.
    target_url = url if jdbc_url is None else jdbc_url
    target_properties = properties if jdbc_properties is None else jdbc_properties

    start_time = datetime.now()
    try:
        df.write.jdbc(target_url, table=tb_name, mode=write_mode, properties=target_properties)
    except Exception as e:
        print(f'Erro: {e}')
        raise
    total_time = str(datetime.now() - start_time)

    # The row count is computed once, only after a successful write.
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



# Create the Spark session
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    # JDBC driver jar handed to Spark through spark.jars
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    # Memory and executor sizing
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    # Parallelism for shuffles and RDD operations
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)


# JDBC endpoint of the target PostgreSQL database.
# Override with the POSTGRES_JDBC_URL environment variable.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

# JDBC connection properties. Credentials are taken from the environment
# (POSTGRES_USER / POSTGRES_PASSWORD) when those variables are set.
properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    # NOTE(review): the fallback below is the legacy hard-coded password, kept
    # only so existing runs keep working. Set POSTGRES_PASSWORD, then remove
    # this default and rotate the password — secrets should not live in the notebook.
    "password": os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

25/05/26 12:21:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/26 12:21:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/26 12:21:15 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/26 12:21:15 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/26 12:21:15 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


# Configurações de exibição (Spark e Pandas)

In [2]:
# Spark SQL settings applied to the running session: eager evaluation in the
# notebook REPL (with its truncate / max-rows limits) and the debug
# maxToStringFields limit.
for conf_key, conf_value in {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.truncate": 100,
    "spark.sql.repl.eagerEval.maxNumRows": 300,
    "spark.sql.debug.maxToStringFields": 100,
}.items():
    spark.conf.set(conf_key, conf_value)

# Pandas: remove the display limits on columns and rows (None = unlimited).
for pandas_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(pandas_option, None)

### Lendo dados enriquecidos

In [3]:
# Load the enriched base from Parquet. Parquet files carry their own schema,
# so the CSV-only `header` option (which the Parquet reader ignores) is not passed.
df_input = spark.read.parquet(
    '/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/basefinal_gest_limp_enriched'
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

### Observation Period

In [4]:
# Global (unpartitioned) window ordered by person and birth date, used to number
# rows sequentially. With no partition key Spark moves all rows to a single
# partition to evaluate it (see the WindowExec warnings in the outputs below).
window_observation_period = Window.orderBy('person_id', 'dtnasc_sinasc')

In [5]:
# Source columns used to derive the observation period.
# NOTE(review): mesprenat_sinasc is presumably the month of pregnancy in which
# prenatal care started — confirm against the SINASC data dictionary.
mesprenat = F.col('mesprenat_sinasc')
dtnasc = F.col('dtnasc_sinasc')

# Start date rules, evaluated in order:
#   value <= 9        -> birth date minus (9 - value) months
#   value == 99 / null -> placeholder date 2099-12-31
#   value > 9         -> first day of the birth month
observation_start = (
    F.when(mesprenat <= 9, F.add_months(dtnasc, -(9 - mesprenat.cast('int'))))
    .when((mesprenat == 99) | (mesprenat.isNull()), F.lit('2099-12-31'))
    .when(mesprenat > 9, F.trunc(dtnasc, 'month'))
)

# One output row per input row, with the column names and types of the
# observation_period table.
df_observation_period = df_input.select(
    # Sequential id assigned over the global ordered window
    F.row_number().over(window_observation_period).cast('integer').alias('observation_period_id'),
    F.col('person_id').cast('integer').alias('person_id'),
    observation_start.cast('date').alias('observation_period_start_date'),
    # The period ends on the birth date
    dtnasc.cast('date').alias('observation_period_end_date'),
    # Same period_type_concept_id constant for every row
    F.lit(32879).cast('integer').alias('period_type_concept_id'),
)

df_observation_period.count()

                                                                                

16609485

## Salvando no Postgresql

In [10]:
# Load the observation periods into PostgreSQL.
# Mode is 'append': re-running this cell inserts the same rows again.
write_to_postgresl(
    df_observation_period,
    tb_name='omop.observation_period',
    write_mode='append',
)

25/05/06 21:13:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/06 21:13:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/06 21:13:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/06 21:13:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/06 21:13:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

'Total time: 0:07:59.601125 and - Total rows: 16609485 - Total columns: 5'

## Deprecated

### Salvando em Parquet

In [6]:
# Export the table as a single Parquet part file, replacing any previous export.
# NOTE(review): the target folder is named "csv" but the format written is Parquet.
(
    df_observation_period
    .repartition(1)
    .write
    .mode('overwrite')
    .parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/omop_scripts_base_16mi_karine/csv/observation_period')
)

25/05/26 12:21:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/26 12:21:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/26 12:21:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/26 12:21:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

## SQL de inserção

In [None]:
# -- CREATE TABLE public.observation_period (
# -- 			observation_period_id integer NOT NULL,
# -- 			person_id integer NOT NULL,
# -- 			observation_period_start_date date NOT NULL,
# -- 			observation_period_end_date date NOT NULL,
# -- 			period_type_concept_id integer NOT NULL );

# -- CREATE TABLE public.observation_period_pyspark (
# -- 			observation_period_id varchar,
# -- 			person_id varchar,
# -- 			observation_period_start_date varchar,
# -- 			observation_period_end_date varchar,
# -- 			period_type_concept_id varchar);

# insert into public.observation_period (
# observation_period_id, 
# person_id,
# observation_period_start_date,
# observation_period_end_date,
# period_type_concept_id
# )
# SELECT 
# cast(observation_period_id as integer), 
# cast(person_id as integer),
# cast(observation_period_start_date as date),
# cast(observation_period_end_date as date),
# cast(period_type_concept_id as integer)
# 	FROM public.observation_period_pyspark;