## Configurações Pyspark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

import pandas as pd

def write_to_postgresl(df, tb_name=None, write_mode='None'):
    df.count()
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    if write_mode is None:
        raise Exception('Informe o mode de escrita: append ou overwrite')
        
    from datetime import datetime
    start_time = datetime.now()
    
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
    end_time = datetime.now()
    
    total_time = str(end_time - start_time)
    
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



    # Criando a sessão do Spark
spark  = SparkSession.builder \
    .appName("Data Analysis") \
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')\
    .config("spark.executor.memory", "8g")\
    .config("spark.driver.memory", "8g")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.instances","8")\
    .config("spark.sql.shuffle.partitions","96")\
    .config("spark.default.parallelism","96")\
    .getOrCreate()


url = "jdbc:postgresql://localhost:5432/postgres"

properties = {
    "user" : "postgres",
    "password" : "cidacs",
    "driver" : "org.postgresql.Driver"
}

25/03/20 09:39:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/20 09:39:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/20 09:39:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Configurações Pandas

In [2]:
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 300)

spark.conf.set("spark.sql.debug.maxToStringFields", 100)

pd.set_option("display.max_columns", None) 
pd.set_option("display.max_rows", None)

### Lendo dados enriquecidos

In [3]:
df_input = (spark
            .read
            .parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/banco_original_enriched')
            .filter((F.col('id_cidacs_mae_sinasc')=='10324544718')&(F.col('tpteste1_mae')==1))
           )

                                                                                

### Measurement

In [5]:
window_measurement = Window.partitionBy().orderBy('person_id', 'dtnasc_sinasc')

In [6]:
df_measurement = (df_input.select('id_cidacs_mae_sinasc', 'tpteste1_mae', 'semagestac_sinasc', 
                                  'gravidez_sinasc', 'dtnasc_sinasc', 'person_id', 'mesprenat_sinasc', 'visit_occurrence_id')
                 .withColumn('person_id', F.col('person_id'))
                 .withColumn('semagestac_sinasc_code', F.when(F.col('semagestac_sinasc').isNotNull(), F.lit(3012266)).otherwise(None))
                 .withColumn('gravidez_sinasc_code', F.when(F.col('gravidez_sinasc').isNotNull(), F.lit(4077859)).otherwise(None))
                 .withColumn('tpteste1_mae_code', F.when(F.col('tpteste1_mae').isNotNull(), F.lit(4299241)).otherwise(None))
                 .withColumn('measurement_concept_id_list', F.concat_ws(",", F.col('semagestac_sinasc_code'), F.col('gravidez_sinasc_code'), F.col('tpteste1_mae_code'))) 
                 .withColumn('measurement_concept_id', F.explode(F.split(F.col('measurement_concept_id_list'), ',')))
                 .withColumn('date_mesprenat_sinasc', F.when(F.col('mesprenat_sinasc')<=9, F.add_months(F.col('dtnasc_sinasc'), -(9 - F.col('mesprenat_sinasc').cast('int'))))
                                     .when((F.col('mesprenat_sinasc')==99)|(F.col('mesprenat_sinasc').isNull()), F.lit('2099-12-31'))
                                     .when(F.col('mesprenat_sinasc')>9, F.trunc(F.col('dtnasc_sinasc'), 'month')))
                 .withColumn('measurement_date', F.when(F.col('measurement_concept_id').isin(3012266, 4077859), F.col('dtnasc_sinasc'))
                                                  .when(F.col('measurement_concept_id').isin(4299241), F.col('date_mesprenat_sinasc')))
                 .withColumn('value_as_number', F.when(F.col('measurement_concept_id').isin(3012266), F.col('semagestac_sinasc'))
                                                 .when(F.col('measurement_concept_id').isin(4299241), F.col('tpteste1_mae'))
                                                 .when((F.col('measurement_concept_id').isin(4077859))&(F.col('gravidez_sinasc').isin(0, 1, 2, 3)), F.col('gravidez_sinasc'))
                                                 .when((F.col('measurement_concept_id').isin(4077859))&(F.col('gravidez_sinasc').isin(88,99)), F.lit(None)).otherwise(None))
                 .withColumn('value_as_concept_id', F.when(F.col('tpteste1_mae')==0, F.lit(0))
                                                     .when(F.col('tpteste1_mae')==1, F.lit(9191))
                                                     .when(F.col('tpteste1_mae')==2, F.lit(9189))
                                                     .when(F.col('tpteste1_mae')==3, F.lit(4118638))
                                                     .when(F.col('tpteste1_mae')==88, F.lit(0))
                                                     .when(F.col('tpteste1_mae').isNull(), F.lit(0))
                            )
                 .withColumn('measurement_type_concept_id', F.lit(32879))
                 .withColumn('measurement_id', F.row_number().over(window_measurement))
                 .withColumn('unit_source_value', F.lit(None))
                 .withColumn('unit_concept_id', F.lit(None))
                 .withColumn('measurement_datetime', F.lit(None))
                 .withColumn('measurement_time', F.lit(None))
                 .withColumn('operator_concept_id', F.lit(None))
                 .withColumn('value_as_concept_id', F.lit(None))
                 .withColumn('range_low', F.lit(None))
                 .withColumn('range_high', F.lit(None))
                 .withColumn('provider_id', F.lit(None))
                 .withColumn('visit_occurrence_id', F.col('visit_occurrence_id'))
                 .withColumn('visit_detail_id', F.lit(None))
                 .withColumn('measurement_source_value', F.lit(None))
                 .withColumn('measurement_source_concept_id', F.lit(None))
                 .withColumn('unit_source_concept_id', F.lit(None))
                 .withColumn('value_source_value', F.lit(None))
                  .withColumn('measurement_event_id', F.lit(None))
                  .withColumn('meas_event_field_concept_id', F.lit(None))
                  ).select( F.col("measurement_id").cast('integer'),
                            F.col("person_id").cast('integer'),
                            F.col("measurement_concept_id").cast('integer'),
                            F.col("measurement_date").cast('date'),
                            F.col("measurement_datetime").cast('timestamp'),
                            F.col("measurement_time").cast('string'),
                            F.col("measurement_type_concept_id").cast('integer'),
                            F.col("operator_concept_id").cast('integer'),
                            F.col("value_as_number").cast('float'),
                            F.col("value_as_concept_id").cast('integer'),
                            F.col("unit_concept_id").cast('integer'),
                            F.col("range_low").cast('float'),
                            F.col("range_high").cast('float'),
                            F.col("provider_id").cast('integer'),
                            F.col("visit_occurrence_id").cast('integer'),
                            F.col("visit_detail_id").cast('integer'),
                            F.col("measurement_source_value").cast('string'),
                            F.col("measurement_source_concept_id").cast('integer'),
                            F.col("unit_source_value").cast('string'),
                            F.col("unit_source_concept_id").cast('integer'),
                            F.col("value_source_value").cast('string'),
                            F.col("measurement_event_id").cast('integer'),
                            F.col("meas_event_field_concept_id").cast('integer')
                            )
df_measurement.count()

                                                                                

3

In [7]:
df_measurement.toPandas()

25/03/20 09:40:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/20 09:40:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/20 09:40:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/20 09:40:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/20 09:40:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Unnamed: 0,measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,unit_source_concept_id,value_source_value,measurement_event_id,meas_event_field_concept_id
0,1,74836,3012266,2012-07-02,NaT,,32879,,40.0,,,,,,115959,,,,,,,,
1,2,74836,4077859,2012-07-02,NaT,,32879,,1.0,,,,,,115959,,,,,,,,
2,3,74836,4299241,2012-03-02,NaT,,32879,,1.0,,,,,,115959,,,,,,,,


### Salvando no Postgresql

In [7]:
write_to_postgresl(df_measurement, tb_name='measurement', write_mode='append')

25/03/14 07:53:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 07:53:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 07:53:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 07:53:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 07:53:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

'Total time: 0:33:48.812641 and - Total rows: 33028498 - Total columns: 23'

## Deprecated

### Salvando CSV 

In [None]:
# df_measurement.repartition(1).write.option("quoteAll",True).csv('/data/IDAF/SHARED/omop_tables_revisadas/measurement', header=True, mode='overwrite')

## SQL de insertion

In [None]:
# CREATE TABLE public.measurement (
# 			measurement_id integer NOT NULL,
# 			person_id integer NOT NULL,
# 			measurement_concept_id integer NOT NULL,
# 			measurement_date date NOT NULL,
# 			measurement_datetime TIMESTAMP NULL,
# 			measurement_time varchar(10) NULL,
# 			measurement_type_concept_id integer NOT NULL,
# 			operator_concept_id integer NULL,
# 			value_as_number NUMERIC NULL,
# 			value_as_concept_id integer NULL,
# 			unit_concept_id integer NULL,
# 			range_low NUMERIC NULL,
# 			range_high NUMERIC NULL,
# 			provider_id integer NULL,
# 			visit_occurrence_id integer NULL,
# 			visit_detail_id integer NULL,
# 			measurement_source_value varchar(50) NULL,
# 			measurement_source_concept_id integer NULL,
# 			unit_source_value varchar(50) NULL,
# 			unit_source_concept_id integer NULL,
# 			value_source_value varchar(50) NULL,
# 			measurement_event_id bigint NULL,
# 			meas_event_field_concept_id integer NULL );


# CREATE TABLE public.measurement_pyspark (
# 			measurement_id varchar,
# 			person_id varchar,
# 			measurement_concept_id varchar,
# 			measurement_date varchar,
# 			measurement_datetime varchar,
# 			measurement_time varchar,
# 			measurement_type_concept_id varchar,
# 			operator_concept_id varchar,
# 			value_as_number varchar,
# 			value_as_concept_id varchar,
# 			unit_concept_id varchar,
# 			range_low varchar,
# 			range_high varchar,
# 			provider_id varchar,
# 			visit_occurrence_id varchar,
# 			visit_detail_id varchar,
# 			measurement_source_value varchar,
# 			measurement_source_concept_id varchar,
# 			unit_source_value varchar,
# 			unit_source_concept_id varchar,
# 			value_source_value varchar,
# 			measurement_event_id varchar,
# 			meas_event_field_concept_id varchar);



  
# insert into public.measurement (
# measurement_id,
# person_id,
# measurement_concept_id,
# measurement_date,
# measurement_datetime,
# measurement_time,
# measurement_type_concept_id,
# operator_concept_id,
# value_as_number,
# value_as_concept_id,
# unit_concept_id,
# range_low,
# range_high,
# provider_id,
# visit_occurrence_id,
# visit_detail_id,
# measurement_source_value,
# measurement_source_concept_id,
# unit_source_value,
# unit_source_concept_id,
# value_source_value,
# measurement_event_id,
# meas_event_field_concept_id
#     )
# SELECT 
#     cast(case when measurement_id='' then null else measurement_id end as integer),
#     cast(case when person_id='' then null else person_id end as integer),
#     cast(case when measurement_concept_id='' then null else measurement_concept_id end as integer),
#     cast(case when measurement_date='' then null else measurement_date end as date),
#     cast(case when measurement_datetime='' then null else measurement_datetime end as timestamp),
#     cast(case when measurement_time='' then null else measurement_time end as varchar),
#     cast(case when measurement_type_concept_id='' then null else measurement_type_concept_id end as integer),
#     cast(case when operator_concept_id='' then null else operator_concept_id end as integer),
#     cast(case when value_as_number='' then null else value_as_number end as float),
#     cast(case when value_as_concept_id='' then null else value_as_concept_id end as integer),
#     cast(case when unit_concept_id='' then null else unit_concept_id end as integer),
#     cast(case when range_low='' then null else range_low end as float),
#     cast(case when range_high='' then null else range_high end as float),
#     cast(case when provider_id='' then null else provider_id end as integer),
#     cast(case when visit_occurrence_id='' then null else visit_occurrence_id end as integer),
#     cast(case when visit_detail_id='' then null else visit_detail_id end as integer),
#     measurement_source_value,
#     cast(case when measurement_source_concept_id='' then null else measurement_source_concept_id end as integer),
#     unit_source_value,
#     cast(case when unit_source_concept_id='' then null else unit_source_concept_id end as integer),
#     value_source_value,
#     cast(case when measurement_event_id='' then null else measurement_event_id end as integer),
#     cast(case when meas_event_field_concept_id='' then null else meas_event_field_concept_id end as integer)
# FROM public.measurement_pyspark;
    
    
    