### Configurações Pyspark

In [1]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F

def write_to_postgresl(df, tb_name=None, write_mode=None):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC.

    The connection is taken from the notebook-level ``url`` and
    ``properties`` globals.

    Args:
        df: Spark DataFrame to persist.
        tb_name: Destination table name. Required.
        write_mode: Spark save mode (for example ``'append'`` or
            ``'overwrite'``). Required.

    Returns:
        A summary string with the elapsed write time, the number of rows
        and the number of columns of ``df``.

    Raises:
        Exception: If ``tb_name`` or ``write_mode`` is missing, or if the
            JDBC write fails (the original error is re-raised).
    """
    from datetime import datetime

    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    # The former default was the *string* 'None', which slipped past the
    # ``is None`` check and was handed to Spark as a save mode. Both
    # spellings are treated as "not provided".
    if write_mode is None or write_mode == 'None':
        raise Exception('Informe o mode de escrita: append ou overwrite')

    start_time = datetime.now()
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        # Log the error, then propagate it: returning the usual summary after
        # a failed write would make the load look successful.
        print(f'Erro: {e}')
        raise
    total_time = str(datetime.now() - start_time)

    # df.count() runs a Spark job; it is evaluated once, here, after the
    # timing window has closed, so it does not inflate the reported time.
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



    # Criando a sessão do Spark
# Build (or reuse, if one is already running) the Spark session used by the
# whole notebook. The PostgreSQL JDBC driver jar is put on the classpath so
# DataFrames can be written over JDBC later on.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    # Resources: driver and executor memory, executor cores and instances.
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    # Partition counts used for SQL shuffles and for RDD operations.
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)


# JDBC connection settings for the target PostgreSQL database.
# Each value can be overridden through an environment variable so that
# credentials do not have to be stored in the notebook; the defaults keep the
# original local values, so behaviour is unchanged when nothing is set.
# NOTE(review): remove the default password once POSTGRES_PASSWORD is
# configured in the execution environment.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

25/03/31 08:34:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Configurações de exibição (Spark e Pandas)

In [2]:
# Spark display settings for the notebook: eager evaluation of DataFrames in
# the REPL (with its cell-truncation and row limits) and the field limit used
# when Spark renders schemas/plans as strings.
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(_conf_key, _conf_value)

# pandas: None removes the display limit on columns and on rows.
for _pd_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_pd_option, None)

### Leitura da base enriquecida

In [3]:
# Load the enriched source dataset. Parquet files carry their own schema, so
# the CSV-only ``header`` option that used to be passed here was dropped.
df_input = spark.read.parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/banco_original_enriched')

### Criação das colunas da tabela Person do OMOP

In [4]:
# Two-digit prefixes accepted as valid when matched against the first two
# characters of the municipality-of-residence code.
# NOTE(review): despite the name, these look like IBGE state (UF) codes rather
# than municipality codes — confirm.
cod_mun = [
    '11', '12', '13', '14', '15', '16', '17',
    '21', '22', '23', '24', '25', '26', '27', '28', '29',
    '31', '32', '33', '35',
    '41', '42', '43',
    '50', '51', '52', '53',
]

In [5]:
# Build the OMOP PERSON rows for the infants: exactly one output row per
# person_id_infant, with the columns cast to the types of the target table.
df_person = (
    df_input
    .withColumn('person_id', F.col('person_id_infant'))
    # sexo_sinasc -> gender concept. Codes 0 and 99, nulls and any unmapped
    # value all resolve to concept 0 through otherwise().
    .withColumn(
        'gender_concept_id',
        F.when(F.col('sexo_sinasc') == 1, 8507)
         .when(F.col('sexo_sinasc') == 2, 8532)
         .when(F.col('sexo_sinasc') == 88, 4214687)
         .otherwise(0),
    )
    # cod_raca_cor_pessoa_eq -> race concept. Codes '0' and '99', nulls and
    # any unmapped value all resolve to concept 0 through otherwise().
    .withColumn(
        'race_concept_id',
        F.when(F.col('cod_raca_cor_pessoa_eq') == '1', 8527)
         .when(F.col('cod_raca_cor_pessoa_eq') == '2', 38003598)
         .when(F.col('cod_raca_cor_pessoa_eq') == '3', 8515)
         .when(F.col('cod_raca_cor_pessoa_eq') == '4', 4212311)
         .when(F.col('cod_raca_cor_pessoa_eq') == '5', 19387526)
         .otherwise(0),
    )
    # Ethnicity: the first two characters of the municipality-of-residence
    # code are checked against the accepted prefixes in cod_mun.
    .withColumn(
        'ethnicity_concept_id',
        F.when(F.substring(F.col('codmunres_sinasc'), 1, 2).isin(cod_mun), 38003563)
         .otherwise(0),
    )
    # Group by person_id so that each infant appears only once.
    .groupBy('person_id')
    .agg(
        F.max('race_concept_id').alias('race_concept_id'),
        F.max('gender_concept_id').alias('gender_concept_id'),
        F.max('ethnicity_concept_id').alias('ethnicity_concept_id'),
        # Aggregate the birth date as a whole and split it afterwards.
        # Taking max(year), max(month) and max(day) independently could
        # combine parts of different records into a date found in no row.
        F.max(F.to_date('dtnasc_sinasc')).alias('birth_date'),
        F.max('location_id').alias('location_id'),  # same location as the mother
        F.max('id_cidacs_sinasc_v4').alias('person_source_value'),
    )
    .select(
        F.col('person_id').cast('integer'),
        F.col('gender_concept_id').cast('integer'),
        F.year('birth_date').cast('integer').alias('year_of_birth'),
        F.month('birth_date').cast('integer').alias('month_of_birth'),
        # dayofmonth is equivalent to F.day and also exists before PySpark 3.5.
        F.dayofmonth('birth_date').cast('integer').alias('day_of_birth'),
        F.lit(None).cast('timestamp').alias('birth_datetime'),
        F.col('race_concept_id').cast('integer'),
        F.col('location_id').cast('integer'),
        F.lit(None).cast('integer').alias('provider_id'),
        # care_site_id reuses the location identifier.
        F.col('location_id').cast('integer').alias('care_site_id'),
        F.col('person_source_value').cast('string'),
        F.lit(None).cast('string').alias('gender_source_value'),
        F.lit(None).cast('integer').alias('gender_source_concept_id'),
        F.lit(None).cast('string').alias('race_source_value'),
        F.lit(None).cast('integer').alias('race_source_concept_id'),
        F.lit(None).cast('string').alias('ethnicity_source_value'),
        F.lit(None).cast('integer').alias('ethnicity_source_concept_id'),
        F.col('ethnicity_concept_id').cast('integer'),
    )
)

df_person.count()

                                                                                

24690593

In [6]:
# Load the PERSON rows into the 'person' table. The save mode is 'append', so
# running this cell again presumably inserts the same rows a second time
# unless the table is emptied first — verify before re-running.
write_to_postgresl(df_person, 'person', 'append')

                                                                                

'Total time: 0:06:02.769420 and - Total rows: 24690593 - Total columns: 18'

### Deprecated

In [None]:
# df_person.repartition(1).write.csv('/data/IDAF/SHARED/omop_tables_revisadas/person_infant.csv', header=True, mode='overwrite')

### SQL de inserção

In [None]:
# -- CREATE TABLE public.person (
# -- 			person_id integer NOT NULL,
# -- 			gender_concept_id integer NOT NULL,
# -- 			year_of_birth integer NOT NULL,
# -- 			month_of_birth integer NULL,
# -- 			day_of_birth integer NULL,
# -- 			birth_datetime TIMESTAMP NULL,
# -- 			race_concept_id integer NOT NULL,
# -- 			ethnicity_concept_id integer NOT NULL,
# -- 			location_id integer NULL,
# -- 			provider_id integer NULL,
# -- 			care_site_id integer NULL,
# -- 			person_source_value varchar(50) NULL,
# -- 			gender_source_value varchar(50) NULL,
# -- 			gender_source_concept_id integer NULL,
# -- 			race_source_value varchar(50) NULL,
# -- 			race_source_concept_id integer NULL,
# -- 			ethnicity_source_value varchar(50) NULL,
# -- 			ethnicity_source_concept_id integer NULL );


# -- select distinct year_of_birth from public.person_pyspark

# insert into public.person (
# person_id, 
# gender_concept_id,
# year_of_birth,
# month_of_birth,
# day_of_birth,
# birth_datetime, 
# race_concept_id,
# ethnicity_concept_id,
# location_id,
# provider_id,
# care_site_id,
# person_source_value,
# gender_source_value,
# gender_source_concept_id,
# race_source_value,
# race_source_concept_id,
# ethnicity_source_value,
# ethnicity_source_concept_id
# )
# SELECT 
# cast(nullif(person_id, '') as integer), 
# cast(nullif(gender_concept_id, '') as integer),
# cast(nullif(year_of_birth, '') as integer),
# cast(nullif(month_of_birth, '') as integer),
# cast(nullif(day_of_birth, '') as integer),
# cast(nullif(birth_datetime, '') as timestamp), 
# cast(nullif(race_concept_id, '') as integer),
# cast(ethnicity_concept_id as integer),
# cast(nullif(location_id, '') as integer),
# cast(nullif(provider_id, '') as integer),
# cast(nullif(care_site_id, '') as integer),
# cast(person_source_value as varchar),
# cast(gender_source_value as varchar),
# cast(nullif(gender_source_concept_id, '') as integer),
# cast(race_source_value as varchar),
# cast(nullif(race_source_concept_id, '') as integer),
# cast(ethnicity_source_value as varchar),
# cast(nullif(ethnicity_source_concept_id, '') as integer)
# 	FROM public.person_pyspark;