## Configurações Pyspark

In [1]:
import os
from datetime import datetime

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F

def write_to_postgresl(df, tb_name=None, write_mode=None, jdbc_url=None, jdbc_properties=None):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC and report the cost.

    Parameters
    ----------
    df : DataFrame
        Object exposing ``write.jdbc(...)``, ``count()`` and ``columns``.
    tb_name : str
        Target table name, passed to ``df.write.jdbc`` as ``table``. Required.
    write_mode : str
        Save mode passed to ``df.write.jdbc`` as ``mode`` (e.g. ``'append'`` or
        ``'overwrite'``). Required; there is deliberately no usable default.
    jdbc_url : str, optional
        JDBC URL. When omitted, the notebook-level ``url`` variable is used.
    jdbc_properties : dict, optional
        JDBC connection properties. When omitted, the notebook-level
        ``properties`` variable is used.

    Returns
    -------
    str
        Summary with the elapsed write time, the row count and the column count.

    Raises
    ------
    ValueError
        If ``tb_name`` is missing or ``write_mode`` is missing/unsupported.
    Exception
        Any error raised by the JDBC write is re-raised after being printed, so
        a failed load can no longer be mistaken for a successful one.
    """
    # Save modes accepted by Spark's DataFrameWriter (matched case-insensitively).
    valid_modes = ('append', 'overwrite', 'ignore', 'error', 'errorifexists', 'default')

    if tb_name is None:
        raise ValueError('Informe o nome da tabela')
    # The old default was the *string* 'None', which slipped past an `is None`
    # check; validating against the accepted modes rejects both spellings.
    if write_mode is None or str(write_mode).lower() not in valid_modes:
        raise ValueError('Informe o mode de escrita: append ou overwrite')

    # Fall back to the notebook globals only when the caller gave no override.
    target_url = url if jdbc_url is None else jdbc_url
    connection_properties = properties if jdbc_properties is None else jdbc_properties

    start_time = datetime.now()
    try:
        df.write.jdbc(target_url, table=tb_name, mode=write_mode, properties=connection_properties)
    except Exception as e:
        print(f'Erro: {e}')
        raise
    total_time = str(datetime.now() - start_time)

    # Count once: every count() re-executes the whole lineage of `df`.
    total_rows = df.count()

    return f'Total time: {total_time} and - Total rows: {total_rows} - Total columns: {len(df.columns)}'



# Create the Spark session
# The PostgreSQL JDBC jar is registered through spark.jars; the "driver" entry
# in `properties` below names the driver class used for JDBC writes.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config("spark.jars", "/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)

# JDBC target read by write_to_postgresl. Each value can be overridden through
# an environment variable so connection details do not have to live in the
# notebook; the fallbacks reproduce the previously hardcoded values.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    # NOTE(review): the fallback still embeds the password in the notebook.
    # Set POSTGRES_PASSWORD and drop this default before sharing the file.
    "password": os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

25/07/17 19:57:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Configurações de exibição (Spark e Pandas)

In [2]:
# Spark SQL settings for notebook display: eager evaluation of DataFrames in
# the REPL plus the limit on fields shown in debug strings.
spark_display_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.truncate": 100,
    "spark.sql.repl.eagerEval.maxNumRows": 300,
    "spark.sql.debug.maxToStringFields": 100,
}
for setting_key, setting_value in spark_display_settings.items():
    spark.conf.set(setting_key, setting_value)

# pandas: remove the column and row display limits.
for pandas_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(pandas_option, None)

### Lendo dados enriquecidos

In [3]:
# Parquet dataset that every measurement row below is derived from.
enriched_parquet_path = '/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/scripts_omop_tb/basefinal_tb_limp_enriched'
df_input = spark.read.parquet(enriched_parquet_path)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

### Measurement

In [4]:
# Global (unpartitioned) ordering used to number measurement rows with
# row_number(). Spark warns that such a window moves all data to a single
# partition; it is kept because the ids must be one consecutive sequence.
# 'measurement_concept_id' is a tie-breaker: the rows exploded from one input
# row share person_id_infant and dtnasc_sinasc, so without it their relative
# order (and therefore their measurement_id) is arbitrary and may differ
# between executions of the same plan.
# NOTE(review): assumes person_id_infant identifies a single input row - confirm.
window_measurement = Window.orderBy('person_id_infant', 'dtnasc_sinasc', 'measurement_concept_id')

In [8]:
# Concept id emitted for each recorded 1-minute Apgar score (score -> concept id).
APGAR1_SCORE_CONCEPT_IDS = {
    0: 4014468, 1: 4015430, 2: 4015289, 3: 4014305, 4: 4015291, 5: 4014469,
    6: 4014306, 7: 4015292, 8: 4016052, 9: 4015431, 10: 4014470,
}
# Concept id emitted for each recorded 5-minute Apgar score (score -> concept id).
APGAR5_SCORE_CONCEPT_IDS = {
    0: 4014307, 1: 4016054, 2: 4014308, 3: 4016055, 4: 4014471, 5: 4014309,
    6: 4014310, 7: 4016056, 8: 4016465, 9: 4016466, 10: 4016467,
}
APGAR_SENTINEL_CONCEPT_ID = 4127662   # emitted for the non-score codes listed per column
APGAR_NULL_CONCEPT_ID = 40482639      # emitted when the Apgar column is null
# NOTE(review): apgar1 treats 19/89/99 as non-score codes while apgar5 only
# treats 99 that way; kept as in the original mapping - confirm it is intended.
APGAR1_SENTINEL_CODES = (19, 89, 99)
APGAR5_SENTINEL_CODES = (99,)

GRAVIDEZ_CONCEPT_ID = 4077859
PESO_CONCEPT_ID = 4264825
SEMAGESTAC_CONCEPT_ID = 3012266
PESO_UNIT_CONCEPT_ID = 8504           # unit attached to peso_sinasc rows only
MEASUREMENT_TYPE_CONCEPT_ID = 32879
MEASUREMENT_ID_OFFSET = 1000000000    # added to row_number() to form measurement_id


def _concept_if_present(source_column, concept_id):
    """Return a Column equal to ``concept_id`` where ``source_column`` is not null, else null."""
    return F.when(F.col(source_column).isNotNull(), F.lit(concept_id)).otherwise(None)


def _apgar_concept(source_column, score_concept_ids, sentinel_codes):
    """Build the CASE expression mapping an Apgar column to its concept id.

    Scores found in ``score_concept_ids`` map to their concept id, values in
    ``sentinel_codes`` map to APGAR_SENTINEL_CONCEPT_ID, nulls map to
    APGAR_NULL_CONCEPT_ID and anything else yields null.
    """
    score = F.col(source_column)
    branches = [(score == value, concept_id) for value, concept_id in score_concept_ids.items()]
    branches.append((score.isin(*sentinel_codes), APGAR_SENTINEL_CONCEPT_ID))
    branches.append((score.isNull(), APGAR_NULL_CONCEPT_ID))

    first_condition, first_concept_id = branches[0]
    mapped = F.when(first_condition, F.lit(first_concept_id))
    for condition, concept_id in branches[1:]:
        mapped = mapped.when(condition, F.lit(concept_id))
    return mapped.otherwise(None)


def _null_column(name, spark_type):
    """Return an all-null Column of ``spark_type`` named ``name``."""
    return F.lit(None).cast(spark_type).alias(name)


concept_id_col = F.col('measurement_concept_id')

df_measurement = (
    df_input
    # One candidate concept id per source variable (null when nothing applies).
    .withColumn('gravidez_sinasc_code', _concept_if_present('gravidez_sinasc', GRAVIDEZ_CONCEPT_ID))
    .withColumn('peso_sinasc_code', _concept_if_present('peso_sinasc', PESO_CONCEPT_ID))
    .withColumn('apgar1_sinasc_code',
                _apgar_concept('apgar1_sinasc', APGAR1_SCORE_CONCEPT_IDS, APGAR1_SENTINEL_CODES))
    .withColumn('apgar5_sinasc_code',
                _apgar_concept('apgar5_sinasc', APGAR5_SCORE_CONCEPT_IDS, APGAR5_SENTINEL_CODES))
    .withColumn('semagestac_sinasc_code', _concept_if_present('semagestac_sinasc', SEMAGESTAC_CONCEPT_ID))
    # concat_ws skips nulls, so the list only holds the concepts that apply;
    # explode then yields one output row per concept.
    .withColumn('measurement_concept_id_list', F.concat_ws(",",
                                                           F.col('gravidez_sinasc_code'),
                                                           F.col('peso_sinasc_code'),
                                                           F.col('apgar1_sinasc_code'),
                                                           F.col('apgar5_sinasc_code'),
                                                           F.col('semagestac_sinasc_code')))
    .withColumn('measurement_concept_id', F.explode(F.split(F.col('measurement_concept_id_list'), ',')))
    # Cast in a separate step (a generator cannot be nested in another
    # expression). When no concept applies the list is '' and the exploded
    # value is '', which becomes null here. Filtering now, before ids are
    # assigned, drops that row instead of writing a null concept id.
    .withColumn('measurement_concept_id', concept_id_col.cast('integer'))
    .filter(concept_id_col.isNotNull())
    # gravidez_sinasc restricted to 0-3; nulls and the 88/99 codes become null.
    .withColumn('gravidez_sinasc_value',
                F.when((F.col('gravidez_sinasc').isNull()) | F.col('gravidez_sinasc').isin(88, 99), F.lit(None))
                .when(F.col('gravidez_sinasc').isin(0, 1, 2, 3), F.col('gravidez_sinasc'))
                .otherwise(None))
    .select(
        (F.row_number().over(window_measurement) + F.lit(MEASUREMENT_ID_OFFSET))
        .cast('integer').alias('measurement_id'),
        F.col('person_id_infant').cast('integer').alias('person_id'),
        concept_id_col.alias('measurement_concept_id'),
        F.col('dtnasc_sinasc').cast('date').alias('measurement_date'),
        _null_column('measurement_datetime', 'timestamp'),
        _null_column('measurement_time', 'string'),
        F.lit(MEASUREMENT_TYPE_CONCEPT_ID).cast('integer').alias('measurement_type_concept_id'),
        _null_column('operator_concept_id', 'integer'),
        # Uses the cleaned gravidez value: the previous version computed
        # gravidez_sinasc_value but then read the raw column, so the 88/99
        # codes were written as numeric results.
        F.when(concept_id_col == GRAVIDEZ_CONCEPT_ID, F.col('gravidez_sinasc_value'))
        .when(concept_id_col == PESO_CONCEPT_ID, F.col('peso_sinasc'))
        .when(concept_id_col == SEMAGESTAC_CONCEPT_ID, F.col('semagestac_sinasc'))
        .otherwise(None)
        .cast('float').alias('value_as_number'),
        _null_column('value_as_concept_id', 'integer'),
        F.when(concept_id_col == PESO_CONCEPT_ID, F.lit(PESO_UNIT_CONCEPT_ID)).otherwise(None)
        .cast('integer').alias('unit_concept_id'),
        _null_column('range_low', 'float'),
        _null_column('range_high', 'float'),
        _null_column('provider_id', 'integer'),
        F.col('visit_occurrence_id_infant').cast('integer').alias('visit_occurrence_id'),
        _null_column('visit_detail_id', 'integer'),
        _null_column('measurement_source_value', 'string'),
        _null_column('measurement_source_concept_id', 'integer'),
        _null_column('unit_source_value', 'string'),
        _null_column('unit_source_concept_id', 'integer'),
        _null_column('value_source_value', 'string'),
        _null_column('measurement_event_id', 'integer'),
        _null_column('meas_event_field_concept_id', 'integer'),
    )
)

df_measurement.count()

                                                                                

81791937

### Escrevendo no Postgresql

In [9]:
# Load the measurement rows into PostgreSQL. The mode is 'append', so running
# this cell again adds the same rows to the table a second time.
write_to_postgresl(
    df_measurement,
    tb_name='tuberculose.measurement',
    write_mode='append',
)

25/07/17 19:58:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/17 19:58:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/17 19:58:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/17 19:58:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/17 19:58:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

'Total time: 5:54:02.754903 and - Total rows: 81791937 - Total columns: 23'