### Configurações Pyspark

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

import pandas as pd

def write_to_postgresl(df, tb_name=None, write_mode='None'):
    df.count()
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    if write_mode is None:
        raise Exception('Informe o mode de escrita: append ou overwrite')
        
    from datetime import datetime
    start_time = datetime.now()
    
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
    end_time = datetime.now()
    
    total_time = str(end_time - start_time)
    
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



    # Criando a sessão do Spark
spark  = SparkSession.builder \
    .appName("Data Analysis") \
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')\
    .config("spark.executor.memory", "8g")\
    .config("spark.driver.memory", "8g")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.instances","8")\
    .config("spark.sql.shuffle.partitions","96")\
    .config("spark.default.parallelism","96")\
    .getOrCreate()


url = "jdbc:postgresql://localhost:5432/postgres"

properties = {
    "user" : "postgres",
    "password" : "cidacs",
    "driver" : "org.postgresql.Driver"
}

### Configurações Pandas

In [30]:
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 300)

spark.conf.set("spark.sql.debug.maxToStringFields", 100)

pd.set_option("display.max_columns", None) 
pd.set_option("display.max_rows", None)

### Lendo dados enriquecidos

In [31]:
df_input = (spark
            .read
            .parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/banco_original_enriched'))

### Procedure Occurrence

In [32]:
window_procedure_occurrence = Window.partitionBy().orderBy('person_id', 'dtnasc_sinasc')

In [35]:
df_procedure_occurrence = (df_input
#                           .filter(F.col('id_cidacs_mae_sinasc')==10010232335)
                         .withColumn('person_id', F.col('person_id'))
                         .withColumn('tpconfirma_mae_calc', F.lit(4193412))
                         .withColumn('parto_sinasc_calc', F.when(F.col('parto_sinasc').isin(0,88,99), 0)
                                                            .when(F.col('parto_sinasc')==1, 44784097)
                                                            .when(F.col('parto_sinasc')==1, 4015701)
                                                            .when(F.col('parto_sinasc')==1, 44784097)
                                                            .otherwise(0))
                         .withColumn('procedure', F.concat_ws(",", F.col('tpconfirma_mae_calc'), F.col('parto_sinasc_calc'))) 
                         .withColumn('procedure_concept_id', F.explode(F.split(F.col('procedure'), ',')))
                         .withColumn('procedure_source_concept_id', F.lit(None))
                         .withColumn('procedure_type_concept_id', F.lit(32879))
                         .withColumn('procedure_date', F.when(F.col('procedure_concept_id')==4193412, F.coalesce(F.col('dt_notific_mae'), F.col('dtnasc_sinasc')))
                                     .when(F.col('procedure_concept_id')==44784097, F.col('dtnasc_sinasc'))
                                     .when(F.col('procedure_concept_id')==4015701, F.col('dtnasc_sinasc'))
                                     .otherwise(F.coalesce(F.col('dt_notific_mae'), F.col('dtnasc_sinasc'))) 
                                    )
                         .withColumn('procedure_source_value', F.when(F.col('procedure_concept_id')==4193412, F.col('tpconfirma_mae')).otherwise(None))
                         .withColumn('quantity', F.lit(None))
                           .withColumn('procedure_occurrence_id', F.row_number().over(window_procedure_occurrence))
                           .withColumn('procedure_datetime', F.lit(None))
                           .withColumn('procedure_end_date', F.lit(None))
                           .withColumn('procedure_end_datetime', F.lit(None))
                           .withColumn('modifier_concept_id', F.lit(None))
                           .withColumn('provider_id', F.lit(None))
                           .withColumn('visit_occurrence_id', F.col('visit_occurrence_id'))
                           .withColumn('visit_detail_id', F.lit(None))
                           .withColumn('modifier_source_value', F.lit(None))
                          ).select( F.col('procedure_occurrence_id').cast('integer'),
                                    F.col('person_id').cast('integer'),
                                    F.col('procedure_concept_id').cast('integer'),
                                    F.col('procedure_date').cast('date'),
                                    F.col('procedure_datetime').cast('timestamp'),
                                    F.col('procedure_end_date').cast('date'),
                                    F.col('procedure_end_datetime').cast('timestamp'),
                                    F.col('procedure_type_concept_id').cast('integer'),
                                    F.col('modifier_concept_id').cast('integer'),
                                    F.col('quantity').cast('float'),
                                    F.col('provider_id').cast('integer'),
                                    F.col('visit_occurrence_id').cast('integer'),
                                    F.col('visit_detail_id').cast('integer'),
                                    F.col('procedure_source_value').cast('string'),
                                    F.col('procedure_source_concept_id').cast('integer'),
                                    F.col('modifier_source_value').cast('string')
                                )
df_procedure_occurrence.count()

                                                                                

49392036

## Salvando dados no Postgresql

In [38]:
write_to_postgresl(df_procedure_occurrence, tb_name='procedure_occurrence', write_mode='append')

25/03/19 15:55:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 15:55:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 15:55:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 15:55:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 15:55:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 16:49:24 ERROR Executor: Exception in task 0.0 in stage 77.0 (TID 2641)
java.sql.BatchUpdateException: Batch entry 0 INSERT INTO procedure_occurrence ("procedure_occu

Erro: An error occurred while calling o1204.jdbc.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 77.0 failed 1 times, most recent failure: Lost task 0.0 in stage 77.0 (TID 2641) (tre2-172-16-1-22.tre2.cidacs executor driver): java.sql.BatchUpdateException: Batch entry 0 INSERT INTO procedure_occurrence ("procedure_occurrence_id","person_id","procedure_concept_id","procedure_date","procedure_datetime","procedure_end_date","procedure_end_datetime","procedure_type_concept_id","modifier_concept_id","quantity","provider_id","visit_occurrence_id","visit_detail_id","procedure_source_value","procedure_source_concept_id","modifier_source_value") VALUES (1,1,4193412,'2002-07-05 -03'::date,NULL,NULL,NULL,32879,NULL,NULL,NULL,1,NULL,NULL,NULL,NULL) was aborted: ERROR: duplicate key value violates unique constraint "xpk_procedure_occurrence"
  Detail: Key (procedure_occurrence_id)=(1) already exists.  Call getNextException to see other errors in the batch.
	at 

                                                                                

'Total time: 0:53:44.114409 and - Total rows: 49392036 - Total columns: 16'

### Deprecated

### Salvando CSV 

In [None]:
# df_procedure_occurrence.repartition(1).write.option("quoteAll",True).csv('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/omop_tables_revisadas/procedure_occurrence', header=True, mode='overwrite')

## SQL de insertion

In [None]:
# -- CREATE TABLE public.procedure_occurrence (
# -- 			procedure_occurrence_id integer NOT NULL,
# -- 			person_id integer NOT NULL,
# -- 			procedure_concept_id integer NOT NULL,
# -- 			procedure_date date NOT NULL,
# -- 			procedure_datetime TIMESTAMP NULL,
# -- 			procedure_end_date date NULL,
# -- 			procedure_end_datetime TIMESTAMP NULL,
# -- 			procedure_type_concept_id integer NOT NULL,
# -- 			modifier_concept_id integer NULL,
# -- 			quantity integer NULL,
# -- 			provider_id integer NULL,
# -- 			visit_occurrence_id integer NULL,
# -- 			visit_detail_id integer NULL,
# -- 			procedure_source_value varchar(50) NULL,
# -- 			procedure_source_concept_id integer NULL,
# -- 			modifier_source_value varchar(50) NULL );


# -- CREATE TABLE public.procedure_occurrence_pyspark (
# -- 			procedure_occurrence_id varchar,
# -- 			person_id varchar,
# -- 			procedure_concept_id varchar,
# -- 			procedure_date varchar,
# -- 			procedure_datetime varchar,
# -- 			procedure_end_date varchar,
# -- 			procedure_end_datetime varchar,
# -- 			procedure_type_concept_id varchar,
# -- 			modifier_concept_id varchar,
# -- 			quantity varchar,
# -- 			provider_id varchar,
# -- 			visit_occurrence_id varchar,
# -- 			visit_detail_id varchar,
# -- 			procedure_source_value varchar,
# -- 			procedure_source_concept_id varchar,
# -- 			modifier_source_value  varchar);

								   
# insert into public.procedure_occurrence (
# 			procedure_occurrence_id,
# 			person_id,
# 			procedure_concept_id,
# 			procedure_date,
# 			procedure_datetime,
# 			procedure_end_date,
# 			procedure_end_datetime,
# 			procedure_type_concept_id,
# 			modifier_concept_id,
# 			quantity,
# 			provider_id,
# 			visit_occurrence_id,
# 			visit_detail_id,
# 			procedure_source_value,
# 			procedure_source_concept_id,
# 			modifier_source_value
# )
# SELECT 
# 			cast(procedure_occurrence_id as integer),
# 			cast(person_id as integer),
# 			cast(case when procedure_concept_id = '' then null else procedure_concept_id end as integer),
# 			cast(procedure_date as date),
# 			cast(case when procedure_datetime = '' then null else procedure_datetime end as timestamp),
# 			cast(case when procedure_end_date = '' then null else procedure_end_date end as date),
# 			cast(case when procedure_end_datetime = '' then null else procedure_end_datetime end as timestamp),
# 			cast(case when procedure_type_concept_id = '' then null else procedure_type_concept_id end as integer),
# 			cast(case when modifier_concept_id ='' then null else modifier_concept_id end as integer),
# 			cast(case when quantity = '' then null else quantity end as integer),
# 			cast(case when provider_id ='' then null else provider_id end as integer),
# 			cast(case when visit_occurrence_id = '' then null else visit_occurrence_id end as integer),
# 			cast(case when visit_detail_id = '' then null else visit_detail_id end as integer),
# 			cast(procedure_source_value as varchar),
# 			cast(case when procedure_source_concept_id = '' then null else procedure_source_concept_id end as integer),
# 			cast(modifier_source_value as varchar)
# 	FROM public.procedure_occurrence_pyspark;


