### Configurações Pyspark

In [3]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

def write_to_postgresl(df, tb_name=None, write_mode=None):
    """Write a Spark DataFrame to PostgreSQL over JDBC and report basic stats.

    The JDBC endpoint and connection properties are read from the
    module-level names ``url`` and ``properties``.

    Args:
        df: DataFrame to persist (anything exposing ``write.jdbc``, ``count()``
            and ``columns``).
        tb_name: Target table name, e.g. ``'schema.table'``. Required.
        write_mode: Save mode passed to ``df.write.jdbc`` (the error message
            suggests ``'append'`` or ``'overwrite'``). Required.

    Returns:
        A summary string with the elapsed write time, the row count and the
        column count of ``df``.

    Raises:
        Exception: If ``tb_name`` or ``write_mode`` is missing, or — after the
            error has been printed — if the JDBC write itself fails.
    """
    from datetime import datetime

    # Validate before touching Spark so a bad call fails immediately instead
    # of after an expensive job.
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    # The historical default was the *string* 'None', which slipped past the
    # `is None` check; treat it as "missing" too so old call sites still fail
    # with the intended message.
    if write_mode is None or write_mode == 'None':
        raise Exception('Informe o mode de escrita: append ou overwrite')

    start_time = datetime.now()
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
        # Re-raise: returning the usual summary after a failed write would
        # make the failure look like a success.
        raise
    total_time = str(datetime.now() - start_time)

    # df.count() triggers a Spark job, so it is evaluated exactly once here.
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



    # Criando a sessão do Spark
# Build (or reuse) the Spark session used by the whole notebook.
# 'spark.jars' adds the PostgreSQL JDBC driver jar required by the JDBC writer.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    # Resource sizing for executors and driver.
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    # Partition counts for shuffles and default parallelism.
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)


# JDBC connection settings read by write_to_postgresl().
# Each value can be overridden through an environment variable so real
# credentials do not have to live in the notebook; when the variable is unset
# the previous hardcoded value is used, so existing runs behave the same.
url = os.environ.get("TB_OMOP_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("TB_OMOP_DB_USER", "postgres"),
    # NOTE(review): the fallback password is kept only for backward
    # compatibility — set TB_OMOP_DB_PASSWORD and drop this default before
    # sharing or committing the notebook.
    "password": os.environ.get("TB_OMOP_DB_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

In [4]:
from pyspark.sql.types import ArrayType, StringType

def dividir_em_blocos(s):
    """Split a string into consecutive chunks of at most four characters.

    Args:
        s: Text to split, or ``None``.

    Returns:
        A list of chunks (the last one may be shorter than four characters),
        an empty list for an empty string, or ``None`` when ``s`` is ``None``.

    Note:
        ``.`` does not match a newline, so newline characters are dropped and
        also end the chunk in progress.
    """
    import re

    # A Spark UDF receives None for NULL column values; re.findall would raise
    # TypeError on it, so propagate the null instead.
    if s is None:
        return None
    return re.findall('.{1,4}', s)

# Spark UDF wrapper around dividir_em_blocos; returns an array of strings.
dividir_udf = F.udf(
    f=dividir_em_blocos,
    returnType=ArrayType(StringType()),
)

### Configurações de exibição (Spark e pandas)

In [5]:
# Spark session settings that control how DataFrames are rendered in the
# notebook (eager evaluation, cell truncation, row cap) and how many fields
# are shown when a plan/schema is turned into a string.
for conf_key, conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(conf_key, conf_value)

# pandas: no limit on the number of displayed columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Lendo dados enriquecidos

In [6]:
# Load the enriched input dataset (Parquet) that every later cell derives from.
df_input = spark.read.parquet(
    '/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/scripts_omop_tb/basefinal_tb_limp_enriched'
)

                                                                                

### Observation

In [27]:
# Build the `observation` table from the enriched input.
# Every input row is expanded into one row per concept id collected in
# `observation_concept_id_list`; columns this source cannot populate are
# emitted as typed NULLs so the output has the full 21-column layout.
df_observation = (df_input
                  .withColumn('dtobito_sim', F.to_date(F.col('`dtobito_sim.y`').cast('date')))
                  .withColumn('person_id', F.col('person_id_infant'))
                  # Difference between death date and birth date, cast to int
                  # (presumably whole days — TODO confirm on the Spark version in use).
                  .withColumn('dtobito_minus_dtnasc', (F.col('dtobito_sim') - F.col('dtnasc_sinasc').cast('date')).cast('int'))
                  # Map death type (+ age at death for type 2) to a concept id.
                  # NOTE(review): the concept ids are taken as-is; their meaning is
                  # not visible in this notebook — verify against the vocabulary.
                  .withColumn('tipobito_sim_code', F.when(F.col('tipobito_sim')==1, F.lit(4079844))
                                                    .when((F.col('tipobito_sim')==2) & (F.col('dtobito_minus_dtnasc').between(0,7)), F.lit(4307303))
                                                    .when((F.col('tipobito_sim')==2) & (F.col('dtobito_minus_dtnasc').between(8,28)), F.lit(4173168))
                                                    .when((F.col('tipobito_sim')==2) & (F.col('dtobito_minus_dtnasc')>28), F.lit(4344630))
                                                    .otherwise(None))
                  # Fixed concept ids attached to every row (used below for the
                  # `mesprenat_sinasc`-based date and the `consprenat_sinasc` value).
                  .withColumn('mesprenat_sinasc_code', F.lit(4311447))
                  .withColumn('consprenat_sinasc_code', F.lit(4313474))
                  # concat_ws skips NULLs, so rows without a death-type code
                  # contribute two concept ids instead of three.
                  .withColumn('observation_concept_id_list', F.concat_ws(',',
                                                                        F.col('tipobito_sim_code'),
                                                                        F.col('mesprenat_sinasc_code'),
                                                                        F.col('consprenat_sinasc_code')))
                  # One output row per concept id.
                  .withColumn('observation_concept_id', F.explode(F.split('observation_concept_id_list', ',')))
                  # For concept 4311447 the date is the birth date shifted back by
                  # `mesprenat_sinasc` months; every other concept uses the birth date.
                  .withColumn('observation_date', F.when(F.col('observation_concept_id')==4311447, 
                                                         F.add_months(F.col('dtnasc_sinasc').cast('date'), -F.col('mesprenat_sinasc').cast('int')))
                                                         .otherwise(F.col('dtnasc_sinasc')))
                  .withColumn('observation_type_concept_id', F.lit(32879))
                  # Only concept 4313474 carries a numeric value.
                  .withColumn('value_as_number', F.when(F.col('observation_concept_id')==4313474, F.col('consprenat_sinasc')).otherwise(None))
                  # Surrogate key, offset by 1e9. `observation_concept_id` is part of
                  # the ordering as a tie-breaker: rows exploded from the same input
                  # row share person_id and dtnasc_sinasc, so without it the id
                  # assignment among them is arbitrary and may change between
                  # evaluations (e.g. the count below vs. a later write).
                  # The window has no partitionBy, so Spark moves all rows to a
                  # single partition for this step. Ids are assigned before the
                  # final filter, so gaps are expected.
                  .withColumn('observation_id', F.row_number().over(Window.orderBy('person_id', 'dtnasc_sinasc', 'observation_concept_id'))+F.lit(1000000000))
                  # Columns with no source data: emitted as NULL, typed in the select below.
                  .withColumn('observation_datetime', F.lit(None))
                  .withColumn('value_as_string', F.lit(None))
                  .withColumn('value_as_concept_id', F.lit(None))
                  .withColumn('qualifier_concept_id', F.lit(None))
                  .withColumn('unit_concept_id', F.lit(None))
                  .withColumn('provider_id', F.lit(None))
                  .withColumn('visit_occurrence_id', F.lit(None))
                  .withColumn('visit_detail_id', F.lit(None))
                  .withColumn('observation_source_value', F.lit(None))
                  .withColumn('observation_source_concept_id', F.lit(None))
                  .withColumn('unit_source_value', F.lit(None))
                  .withColumn('qualifier_source_value', F.lit(None))
                  .withColumn('value_source_value', F.lit(None))
                  .withColumn('observation_event_id', F.lit(None))
                  .withColumn('obs_event_field_concept_id', F.lit(None))
                  # Drop rows for which no observation date could be derived.
                  .filter(F.col('observation_date').isNotNull())
                 ).select(
                    F.col('observation_id').cast('integer'),
                    F.col('person_id').cast('integer'),
                    F.col('observation_concept_id').cast('integer'),
                    F.col('observation_date').cast('date'),
                    F.col('observation_datetime').cast('timestamp'),
                    F.col('observation_type_concept_id').cast('integer'),
                    F.col('value_as_number').cast('float'),
                    F.col('value_as_string').cast('string'),
                    F.col('value_as_concept_id').cast('integer'),
                    F.col('qualifier_concept_id').cast('integer'),
                    F.col('unit_concept_id').cast('integer'),
                    F.col('provider_id').cast('integer'),
                    F.col('visit_occurrence_id').cast('integer'),
                    F.col('visit_detail_id').cast('integer'),
                    F.col('observation_source_value').cast('string'),
                    F.col('observation_source_concept_id').cast('integer'),
                    F.col('unit_source_value').cast('string'),
                    F.col('qualifier_source_value').cast('string'),
                    F.col('value_source_value').cast('string'),
                    F.col('observation_event_id').cast('integer'),
                    F.col('obs_event_field_concept_id').cast('integer')
                    )

# Row count of the resulting table (triggers a full Spark job).
df_observation.count()

                                                                                

31956048

## Salvando no PostgreSQL

In [28]:
# Append the observation rows to the `tuberculose.observation` table.
# NOTE(review): mode 'append' means re-running this cell inserts the rows again.
write_to_postgresl(
    df_observation,
    tb_name='tuberculose.observation',
    write_mode='append',
)

25/06/08 21:14:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/08 21:14:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/08 21:14:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/08 21:14:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/08 21:14:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

'Total time: 0:58:12.409746 and - Total rows: 31956048 - Total columns: 21'