## Configurações Pyspark

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

import pandas as pd

def write_to_postgresl(df, tb_name=None, write_mode=None):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC and report timing.

    The connection is taken from the notebook-level ``url`` and ``properties``
    globals.

    Args:
        df: DataFrame to write; must expose ``count()``, ``columns`` and
            ``write.jdbc(...)``.
        tb_name: Target table name. Required.
        write_mode: Save mode forwarded to ``df.write.jdbc`` (for example
            ``'append'`` or ``'overwrite'``). Required.

    Returns:
        str: Summary with the elapsed write time, row count and column count.

    Raises:
        Exception: If ``tb_name`` or ``write_mode`` is missing. Any error raised
            by the JDBC write is printed and then re-raised.
    """
    from datetime import datetime

    # Validate before touching the DataFrame so a bad call fails immediately
    # instead of after an expensive Spark job.
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    # The former default was the *string* 'None', which slipped past the
    # `is None` check; treat both spellings as "not provided".
    if write_mode is None or write_mode == 'None':
        raise Exception('Informe o mode de escrita: append ou overwrite')

    # Count once, outside the timed section, and reuse the value in the summary
    # (every count() re-executes the whole DataFrame plan).
    total_rows = df.count()

    start_time = datetime.now()
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
        # Re-raise: returning the summary below after a failed write would
        # report a load that never happened.
        raise
    total_time = str(datetime.now() - start_time)

    return f'Total time: {total_time} and - Total rows: {total_rows} - Total columns: {len(df.columns)}'



# Create the Spark session
# Resource and parallelism settings are passed as explicit configs; `spark.jars`
# points at the PostgreSQL JDBC driver jar used for the database writes.
# getOrCreate() returns an already-running session if the kernel has one.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)


import os

# JDBC connection settings read by write_to_postgresl(). Each value can be
# overridden through an environment variable so credentials do not have to be
# edited into the notebook; the fallbacks keep the previously hard-coded values
# so existing runs behave the same.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user" : os.environ.get("POSTGRES_USER", "postgres"),
    # FIXME(security): a real password is committed here as the fallback.
    # Set POSTGRES_PASSWORD, then rotate the password and drop this literal.
    "password" : os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver" : "org.postgresql.Driver"
}

## Configurações de exibição (Spark e Pandas)

In [14]:
# Spark SQL runtime options: eager evaluation of DataFrames in the REPL (with
# its truncation and row limits) and the field limit for plan string output.
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(_conf_key, _conf_value)

# pandas: remove the column and row display limits.
for _pd_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_pd_option, None)

### Lendo dados enriquecidos

In [16]:
# Load the enriched input dataset (Parquet) that the measurement table is built from.
df_input = spark.read.parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/scripts_omop_tb/basefinal_tb_limp_enriched')

### Measurement

In [17]:
# Unpartitioned window ordered by person and birth date, used to number rows.
# With no partition columns Spark processes the whole dataset in one partition.
window_measurement = Window.orderBy('person_id', 'dtnasc_sinasc')

In [24]:
# Build the measurement table: each input row is expanded into one output row
# per measurement code it carries. The numeric codes are presumably OMOP
# concept ids -- TODO confirm against the vocabulary tables.
df_measurement = (
    df_input
    # One nullable code column per source field; null means "no measurement".
    .withColumn('dtultmenst_code', F.when(F.col('dtultmenst_sinasc').isNotNull(), F.lit(4072438)))
    .withColumn('histopatol_code', F.when(F.col('histopatol').isNotNull(), F.lit(4098214)))
    .withColumn('teste_tube_code', F.when(F.col('teste_tube').isNotNull(), F.lit(4308939)))
    # bacilosc_e maps each listed value to its own code; 0 and any unlisted
    # value fall through to null.
    .withColumn('bacilosc_e_code', F.when(F.col('bacilosc_e') == 1, F.lit(44810800))
                                    .when(F.col('bacilosc_e') == 2, F.lit(44810879))
                                    .when(F.col('bacilosc_e') == 3, F.lit(4118638))
                                    .when(F.col('bacilosc_e') == 77, F.lit(36303209))
                                    .when(F.col('bacilosc_e') == 99, F.lit(4127662)))
    .withColumn('cultura_es_code', F.when(F.col('cultura_es').isNotNull(), F.lit(4015189)))
    .withColumn('semagestac_sinasc_code', F.when(F.col('semagestac_sinasc').isNotNull(), F.lit(3012266)))
    .withColumn('gravidez_sinasc_code', F.when(F.col('gravidez_sinasc').isNotNull(), F.lit(4077859)))
    # concat_ws skips nulls, so the list holds only the codes present on the
    # row; split + explode then yields one row per code. A row with no codes
    # yields a single empty-string code, which gets no date below and is
    # dropped by the final filter.
    .withColumn('measurement_concept_id_list', F.concat_ws(",",
                                                           F.col('dtultmenst_code'),
                                                           F.col('histopatol_code'),
                                                           F.col('teste_tube_code'),
                                                           F.col('bacilosc_e_code'),
                                                           F.col('cultura_es_code'),
                                                           F.col('semagestac_sinasc_code'),
                                                           F.col('gravidez_sinasc_code')))
    .withColumn('measurement_concept_id', F.explode(F.split(F.col('measurement_concept_id_list'), ',')))
    # The date column used depends on which measurement the row represents.
    .withColumn('measurement_date', F.when(F.col('measurement_concept_id').isin(44810800, 44810879, 4118638, 36303209, 4127662, 4308939, 4015189), F.col('dt_noti_at'))
                                     .when(F.col('measurement_concept_id').isin(3012266, 4077859), F.col('dtnasc_sinasc'))
                                     .when(F.col('measurement_concept_id').isin(4098214), F.col('dt_diag'))
                                     .when(F.col('measurement_concept_id').isin(4072438), F.col('dtultmenst_sinasc')))
    # Numeric value only for the two numeric measurements; gravidez_sinasc
    # values outside 1-3 (e.g. 0, 88, 99) stay null.
    .withColumn('value_as_number', F.when(F.col('measurement_concept_id').isin(3012266), F.col('semagestac_sinasc'))
                                    .when((F.col('measurement_concept_id').isin(4077859)) & (F.col('gravidez_sinasc').isin(1, 2, 3)), F.col('gravidez_sinasc')))
    # Coded result for the three categorical tests. Every branch is guarded by
    # the row's own measurement concept, so a result on one test can no longer
    # blank out the rows of the other tests. Value 0 (and anything unlisted)
    # maps to null by falling through.
    .withColumn('value_as_concept_id',
                F.when((F.col('histopatol') == 1) & (F.col('measurement_concept_id') == 4098214), F.lit(9191))
                 .when((F.col('histopatol') == 2) & (F.col('measurement_concept_id') == 4098214), F.lit(4090826))
                 .when((F.col('histopatol') == 3) & (F.col('measurement_concept_id') == 4098214), F.lit(1244617))
                 .when((F.col('histopatol') == 4) & (F.col('measurement_concept_id') == 4098214), F.lit(4090959))
                 .when((F.col('histopatol') == 5) & (F.col('measurement_concept_id') == 4098214), F.lit(4118638))
                 .when((F.col('histopatol') == 99) & (F.col('measurement_concept_id') == 4098214), F.lit(4127662))

                 .when((F.col('teste_tube') == 1) & (F.col('measurement_concept_id') == 4308939), F.lit(4305306))
                 .when((F.col('teste_tube') == 2) & (F.col('measurement_concept_id') == 4308939), F.lit(45878592))
                 .when((F.col('teste_tube') == 3) & (F.col('measurement_concept_id') == 4308939), F.lit(45880924))
                 .when((F.col('teste_tube') == 4) & (F.col('measurement_concept_id') == 4308939), F.lit(4118638))
                 .when((F.col('teste_tube') == 99) & (F.col('measurement_concept_id') == 4308939), F.lit(4127662))

                 .when((F.col('cultura_es') == 1) & (F.col('measurement_concept_id') == 4015189), F.lit(9191))
                 .when((F.col('cultura_es') == 2) & (F.col('measurement_concept_id') == 4015189), F.lit(9189))
                 .when((F.col('cultura_es') == 3) & (F.col('measurement_concept_id') == 4015189), F.lit(4090959))
                 .when((F.col('cultura_es') == 4) & (F.col('measurement_concept_id') == 4015189), F.lit(4118638))
                 .when((F.col('cultura_es') == 99) & (F.col('measurement_concept_id') == 4015189), F.lit(4127662)))
    .withColumn('measurement_type_concept_id', F.lit(32879))
    # Ids are assigned before the null-date filter below, so the surviving ids
    # can have gaps.
    .withColumn('measurement_id', F.row_number().over(window_measurement))
    # Columns with no source data are emitted as nulls. value_as_concept_id is
    # deliberately NOT in this list: resetting it here would discard the
    # mapping computed above.
    .withColumn('unit_source_value', F.lit(None))
    .withColumn('unit_concept_id', F.lit(None))
    .withColumn('measurement_datetime', F.lit(None))
    .withColumn('measurement_time', F.lit(None))
    .withColumn('operator_concept_id', F.lit(None))
    .withColumn('range_low', F.lit(None))
    .withColumn('range_high', F.lit(None))
    .withColumn('provider_id', F.lit(None))
    .withColumn('visit_detail_id', F.lit(None))
    .withColumn('measurement_source_value', F.lit(None))
    .withColumn('measurement_source_concept_id', F.lit(None))
    .withColumn('unit_source_concept_id', F.lit(None))
    .withColumn('value_source_value', F.when(F.col('measurement_concept_id').isin(4072438), F.col('dtultmenst_sinasc')))
    .withColumn('measurement_event_id', F.lit(None))
    .withColumn('meas_event_field_concept_id', F.lit(None))
    .filter(F.col('measurement_date').isNotNull())
    # Final projection: fixes the column order and casts every column.
    .select(
        F.col("measurement_id").cast('integer'),
        F.col("person_id").cast('integer'),
        F.col("measurement_concept_id").cast('integer'),
        F.col("measurement_date").cast('date'),
        F.col("measurement_datetime").cast('timestamp'),
        F.col("measurement_time").cast('string'),
        F.col("measurement_type_concept_id").cast('integer'),
        F.col("operator_concept_id").cast('integer'),
        F.col("value_as_number").cast('float'),
        F.col("value_as_concept_id").cast('integer'),
        F.col("unit_concept_id").cast('integer'),
        F.col("range_low").cast('float'),
        F.col("range_high").cast('float'),
        F.col("provider_id").cast('integer'),
        F.col("visit_occurrence_id").cast('integer'),
        F.col("visit_detail_id").cast('integer'),
        F.col("measurement_source_value").cast('string'),
        F.col("measurement_source_concept_id").cast('integer'),
        F.col("unit_source_value").cast('string'),
        F.col("unit_source_concept_id").cast('integer'),
        F.col("value_source_value").cast('string'),
        F.col("measurement_event_id").cast('integer'),
        F.col("meas_event_field_concept_id").cast('integer'),
    )
)
df_measurement.count()

                                                                                

41534339

### Salvando no Postgresql

In [25]:
# Load the measurement rows into PostgreSQL. The mode is 'append', so running
# this cell again adds the same rows a second time.
write_to_postgresl(
    df=df_measurement,
    tb_name='tuberculose.measurement',
    write_mode='append',
)

25/07/16 13:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/16 13:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/16 13:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/16 13:33:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/16 13:33:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

'Total time: 1:07:30.353026 and - Total rows: 41534339 - Total columns: 23'