### Configurações Pyspark

In [16]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

def write_to_postgresl(df, tb_name=None, write_mode='None'):
    """Write a DataFrame to PostgreSQL over JDBC and return a short summary.

    The connection uses the module-level ``url`` and ``properties``.

    Args:
        df: Object exposing ``count()``, ``columns`` and ``write.jdbc(...)``
            (presumably a Spark DataFrame).
        tb_name: Target table name, e.g. ``'schema.table'``. Required.
        write_mode: Save mode forwarded to ``df.write.jdbc`` (the error
            message suggests ``'append'`` or ``'overwrite'``). Required.

    Returns:
        A string with the elapsed write time, the row count and the number
        of columns.

    Raises:
        ValueError: If ``tb_name`` or ``write_mode`` was not provided.
        Exception: Any error raised by the JDBC write is printed and then
            re-raised, so a failed load is never reported as a success.
    """
    from datetime import datetime

    # Validate before touching the DataFrame so a bad call fails immediately
    # instead of after an expensive count.
    if tb_name is None:
        raise ValueError('Informe o nome da tabela')
    # The signature default is the *string* 'None' (kept for compatibility),
    # so both it and a real None mean "mode not provided".
    if write_mode is None or write_mode == 'None':
        raise ValueError('Informe o mode de escrita: append ou overwrite')

    # Count once, outside the timed section, and reuse the value in the
    # summary; every count() call is a separate job over the whole plan.
    total_rows = df.count()

    start_time = datetime.now()
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
        raise
    total_time = str(datetime.now() - start_time)

    return f'Total time: {total_time} and - Total rows: {total_rows} - Total columns: {len(df.columns)}'



# Create the Spark session
# Session settings, applied to the builder in this order: the PostgreSQL JDBC
# driver jar plus executor/driver sizing and parallelism.
_spark_settings = (
    ('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar'),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.executor.cores", "4"),
    ("spark.executor.instances", "8"),
    ("spark.sql.shuffle.partitions", "96"),
    ("spark.default.parallelism", "96"),
)

_session_builder = SparkSession.builder.appName("Data Analysis")
for _setting_key, _setting_value in _spark_settings:
    _session_builder = _session_builder.config(_setting_key, _setting_value)

# Reuses an already running session if one exists.
spark = _session_builder.getOrCreate()


# JDBC connection settings read by write_to_postgresl.
# Each value can be overridden through an environment variable so credentials
# do not have to live in the notebook; the defaults keep the previous
# behaviour when nothing is set.
# NOTE(review): the default password is still committed here - consider
# removing it once POSTGRES_PASSWORD is set in every environment.
url = os.environ.get("POSTGRES_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

### Configurações de exibição (Spark e Pandas)

In [17]:
# Notebook display settings for Spark DataFrames (eager evaluation in the
# REPL) and the limit on fields shown in debug strings.
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(_conf_key, _conf_value)

# pandas: None removes the limit on displayed columns and rows.
for _pd_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_pd_option, None)

### Leitura da base enriquecida

In [18]:
# Load the enriched base (Parquet) that the OMOP tables below are derived from.
df_input = spark.read.parquet(
    '/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/scripts_omop_tb/basefinal_tb_limp_enriched'
)

### Criação das colunas da tabela Person do OMOP

In [19]:
# Two-character prefixes matched against the first two characters of
# `codmunres_sinasc` when deriving ethnicity_concept_id.
# NOTE(review): despite the name these look like state (UF) codes rather than
# municipality codes - confirm.
cod_mun = (
    '11 12 13 14 15 16 17 '
    '21 22 23 24 25 26 27 28 29 '
    '31 32 33 35 '
    '41 42 43 '
    '50 51 52 53'
).split()

In [20]:
# Build the OMOP `person` table: one row per person_id (one row per mother).

# Source race/colour code -> race_concept_id. Any other value (including the
# '0' and '99' codes) falls through to concept 0.
_race_code = F.col('racacormae_sinasc')
_race_concept_id = (F.when(_race_code == '1', 8527)
                     .when(_race_code == '2', 38003598)
                     .when(_race_code == '3', 8515)
                     .when(_race_code == '4', 4212311)
                     .when(_race_code == '5', 19387526)
                     .otherwise(0))


def _typed_null(column_name, data_type):
    """Return an all-NULL column of the given type for fields the source lacks."""
    return F.lit(None).cast(data_type).alias(column_name)


df_person = (df_input
             .withColumn('race_concept_id', _race_concept_id)
             .withColumn('sg_uf_mae', F.substring(F.col('codmunres_sinasc'), 1, 2))
             .withColumn('ethnicity_concept_id', F.when(F.col('sg_uf_mae').isin(cod_mun), 38003563).otherwise(0))
             .groupBy('person_id')
             .agg(  # Group by person_id so the result has one row per mother
                 F.max('race_concept_id').alias('race_concept_id'),
                 F.max('ethnicity_concept_id').alias('ethnicity_concept_id'),
                 # Aggregate the full date and split it afterwards: taking the
                 # max of year, month and day independently can combine parts
                 # of different dates into one that never occurred.
                 F.max('dt_nascimento_calc_mae').alias('birth_date'),
                 F.max('location_id').alias('location_id'),
                 F.max('id_cidacs_mae_sinasc').alias('person_source_value'))
             .select(
                 F.col('person_id').cast('integer'),
                 F.lit(8532).cast('integer').alias('gender_concept_id'),  # only female
                 F.year('birth_date').cast('integer').alias('year_of_birth'),
                 F.month('birth_date').cast('integer').alias('month_of_birth'),
                 # dayofmonth is available in every Spark version; F.day only from 3.5
                 F.dayofmonth('birth_date').cast('integer').alias('day_of_birth'),
                 _typed_null('birth_datetime', 'timestamp'),
                 F.col('race_concept_id').cast('integer'),
                 F.col('location_id').cast('integer'),
                 _typed_null('provider_id', 'integer'),
                 # Same rule as location_id.
                 # NOTE(review): the intent was "last place where the mother
                 # was attended", but max(location_id) picks the largest id,
                 # not the most recent one - confirm.
                 F.col('location_id').cast('integer').alias('care_site_id'),
                 F.col('person_source_value').cast('string'),
                 _typed_null('gender_source_value', 'string'),
                 _typed_null('gender_source_concept_id', 'integer'),
                 _typed_null('race_source_value', 'string'),
                 _typed_null('race_source_concept_id', 'integer'),
                 _typed_null('ethnicity_source_value', 'string'),
                 _typed_null('ethnicity_source_concept_id', 'integer'),
                 F.col('ethnicity_concept_id').cast('integer')
             )
            )

df_person.count()

                                                                                

12670996

In [22]:
# Load the person rows into PostgreSQL.
# NOTE(review): mode 'append' presumably adds the rows again if this cell is
# re-run - confirm before re-executing.
write_to_postgresl(
    df_person,
    tb_name='tuberculose.person',
    write_mode='append',
)

                                                                                

'Total time: 0:03:54.511339 and - Total rows: 12670996 - Total columns: 18'