### Configurações Pyspark

In [1]:
import os
from datetime import datetime
from getpass import getpass

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F

def write_to_postgresl(df, tb_name=None, write_mode=None, jdbc_url=None, jdbc_properties=None):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC and report the result.

    Args:
        df: Spark DataFrame to persist.
        tb_name: Name of the target table. Required.
        write_mode: Save mode forwarded to ``df.write.jdbc`` (for example
            ``'append'`` or ``'overwrite'``). Required.
        jdbc_url: JDBC connection URL. When omitted, the notebook-level ``url``
            variable is used.
        jdbc_properties: JDBC connection properties. When omitted, the
            notebook-level ``properties`` variable is used.

    Returns:
        str: Summary containing the elapsed write time, the row count and the
        column count of ``df``.

    Raises:
        Exception: If ``tb_name`` or ``write_mode`` is missing. Any error raised
            by the JDBC write is printed and then re-raised, so a failed write
            is never reported as a success.
    """
    # Validate the arguments first so no Spark job is triggered for a bad call.
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    # The literal string 'None' was the previous (accidental) default value, so
    # it is treated as "not informed" as well.
    if write_mode is None or write_mode == 'None':
        raise Exception('Informe o mode de escrita: append ou overwrite')

    # Fall back to the connection settings defined at notebook level.
    target_url = url if jdbc_url is None else jdbc_url
    target_properties = properties if jdbc_properties is None else jdbc_properties

    # Count once, outside the timed section, and reuse the value in the summary.
    total_rows = df.count()

    start_time = datetime.now()
    try:
        df.write.jdbc(target_url, table=tb_name, mode=write_mode, properties=target_properties)
    except Exception as e:
        print(f'Erro: {e}')
        raise
    total_time = str(datetime.now() - start_time)

    return f'Total time: {total_time} and - Total rows: {total_rows} - Total columns: {len(df.columns)}'



    # Criando a sessão do Spark
# Spark session shared by the notebook. The PostgreSQL JDBC driver jar is
# registered through 'spark.jars' so DataFrames can be written over JDBC.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)

# JDBC connection settings. The URL and user keep their previous values as
# defaults and can be overridden through environment variables.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    # The password is no longer stored in the notebook: it is read from the
    # POSTGRES_PASSWORD environment variable, or prompted for when it is unset.
    "password": os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver",
}

25/03/13 15:34:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/13 15:34:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/13 15:34:46 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/03/13 15:34:46 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## Configurações de exibição (Spark e Pandas)

In [2]:
# Spark SQL REPL eager-evaluation and debug-string settings, applied in order.
spark_display_conf = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.truncate": 100,
    "spark.sql.repl.eagerEval.maxNumRows": 300,
    "spark.sql.debug.maxToStringFields": 100,
}
for conf_key, conf_value in spark_display_conf.items():
    spark.conf.set(conf_key, conf_value)

# Remove the pandas limits on displayed columns and rows.
for pandas_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(pandas_option, None)

### Lendo dados enriquecidos

In [3]:
# Load the enriched source dataset from parquet.
enriched_reader = spark.read
df_input = enriched_reader.parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/banco_original_enriched')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

### Care Site

In [4]:
# Build the care_site table: one row per distinct municipality-of-residence code.
# NOTE(review): per the original author, values exist with 5 and 6 digits and may
# represent the same municipality; they are NOT merged here - confirm whether
# they should be normalised before the distinct.
municipality_code = F.col('codmunres_sinasc')

df_care_site = (
    df_input
    .select(municipality_code)
    .distinct()
    .select(
        municipality_code.cast('integer').alias('care_site_id'),
        municipality_code.cast('string').alias('care_site_source_value'),
        F.lit(None).cast('string').alias('care_site_name'),
        F.lit(None).cast('integer').alias('place_of_service_concept_id'),
        # location_id is rebuilt here from the municipality code: bringing the
        # location_id over from the original dataset would produce duplicates
        # when applying the distinct.
        municipality_code.cast('integer').alias('location_id'),
        F.lit(None).cast('string').alias('place_of_service_source_value'),
    )
)

df_care_site.count()

                                                                                

11158

### Salvando dados no Postgresql

In [5]:
# Write the care_site rows to the 'care_site' PostgreSQL table.
# Mode 'append' adds rows to the existing table, so re-running this cell
# inserts the same rows again.
# NOTE(review): confirm whether the table should be emptied before a re-run.
write_to_postgresl(df_care_site, tb_name='care_site', write_mode='append')

                                                                                

'Total time: 0:00:02.938407 and - Total rows: 11158 - Total columns: 6'

## Deprecated

### Salvando CSV 

In [None]:
# df_care_site.repartition(1).write.option("quoteAll",True).csv('/data/IDAF/SHARED/omop_tables_revisadas/care_site', header=True, mode='overwrite')

## SQL de inserção

In [None]:
# -- CREATE TABLE public.care_site (
# -- 			care_site_id integer NOT NULL,
# -- 			care_site_name varchar(255) NULL,
# -- 			place_of_service_concept_id integer NULL,
# -- 			location_id integer NULL,
# -- 			care_site_source_value varchar(50) NULL,
# -- 			place_of_service_source_value varchar(50) NULL );


# -- CREATE TABLE public.care_site_pyspark (
# -- 			care_site_id varchar,
# -- 			care_site_name varchar,
# -- 			place_of_service_concept_id varchar,
# -- 			location_id varchar,
# -- 			care_site_source_value varchar,
# -- 			place_of_service_source_value varchar);


# insert into public.care_site (
# care_site_id ,
# care_site_name ,
# place_of_service_concept_id ,
# location_id ,
# care_site_source_value ,
# place_of_service_source_value 
# )
# SELECT 
# cast(case when care_site_id = '' then null else care_site_id end as integer) ,
# care_site_name ,
# cast(case when place_of_service_concept_id = '' then null else place_of_service_concept_id end as integer ) ,
# cast(case when location_id = '' then null else location_id end  as integer) ,
# care_site_source_value ,
# place_of_service_source_value 
# 	FROM public.care_site_pyspark;


