## Configurações do PySpark

In [117]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

def write_to_postgresl(df, tb_name=None, write_mode=None):
    """Write a Spark DataFrame to PostgreSQL over JDBC and report how long it took.

    The connection uses the notebook-level ``url`` and ``properties`` variables,
    which must be defined before this function is called.

    Args:
        df: Spark DataFrame to persist.
        tb_name: Destination table name (required).
        write_mode: Save mode passed to ``DataFrameWriter.jdbc``, e.g.
            ``'append'`` or ``'overwrite'`` (required).

    Returns:
        A summary string with the elapsed write time, the number of rows and
        the number of columns of ``df``.

    Raises:
        Exception: If ``tb_name`` or ``write_mode`` is missing, or if the JDBC
            write fails (the error is printed and then re-raised so that a
            failed write is never reported as a success).
    """
    # Validate the arguments first so that a bad call fails before any Spark job runs.
    # The default for write_mode used to be the string 'None', which made the
    # check below unreachable.
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    if write_mode is None:
        raise Exception('Informe o mode de escrita: append ou overwrite')

    from datetime import datetime
    start_time = datetime.now()

    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
        # Propagate the failure instead of falling through to the success summary.
        raise
    end_time = datetime.now()

    total_time = str(end_time - start_time)

    # df.count() triggers a Spark job; it is evaluated once, only for the summary.
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



# Create the Spark session
# Spark session for this notebook. The PostgreSQL JDBC driver jar is registered
# through 'spark.jars' so that DataFrame.write.jdbc can reach the database.
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)

# JDBC connection settings used by write_to_postgresl.
# Each value can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the fallbacks are the
# values the notebook has always used.
# NOTE(review): the fallback password is kept only for backward compatibility;
# prefer exporting POSTGRES_PASSWORD and removing the literal.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}



In [118]:
import re
from pyspark.sql.types import ArrayType, StringType

def dividir_em_blocos(s):
    """Split a string into consecutive blocks of at most four characters.

    Args:
        s: The string to split; may be ``None`` or empty.

    Returns:
        A list of blocks in their original order (the last block may be
        shorter than four characters), or ``None`` when ``s`` is empty or
        ``None``.
    """
    if not s:
        return None
    # Each regex match is one block of 1 to 4 characters.
    return [bloco.group(0) for bloco in re.finditer('.{1,4}', s)]

# Spark UDF wrapper around dividir_em_blocos; the result column is array<string>.
dividir_udf = F.udf(f=dividir_em_blocos, returnType=ArrayType(StringType()))

## Configurações de exibição (Spark e Pandas)

In [119]:
# Spark display settings: eager evaluation of DataFrames in the notebook,
# plus the field limit used when Spark renders plans/schemas as text.
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(_conf_key, _conf_value)

# Pandas display settings: no limit on the number of rendered columns or rows.
for _pd_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_pd_option, None)

### Lendo dados enriquecidos

In [120]:
# Enriched input dataset (Parquet) from which the condition rows are derived.
df_input = spark.read.parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/banco_original_enriched')

In [121]:
# Vocabulary mapping file: tab-separated text with a header row.
df_vocab_map = (
    spark.read
    .option('header', True)
    .option('sep', '\t')
    .csv('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/cte_vocab_map_standard.csv')
)

# Register it for Spark SQL under the name used by the queries in the next cells.
df_vocab_map.createOrReplaceTempView('cte_vocab_map_standard')
df_vocab_map.count()

5284745

In [122]:
# ICD10 source codes whose mapping targets a standard ('S') concept in the
# 'Condition' domain. Dots are removed from source_code (e.g. 'Q21.0' -> 'Q210'),
# presumably to match the undotted codes found in the input data — confirm.
# The table alias `csv` is referenced later as 'csv.source_concept_id' when
# df_condition_occurrence is built, so it must not be renamed here in isolation.
df_vocab_map_standard = spark.sql("""
        SELECT 
            REPLACE(source_code, '.', '') as source_code,
            target_concept_id,
            source_vocabulary_id,
            source_concept_id
        FROM cte_vocab_map_standard csv
        WHERE true 
        and target_domain_id = 'Condition'
        and target_standard_concept = 'S'
        and source_vocabulary_id = 'ICD10'
""")
# Row count shown as the cell output (also forces the query to run).
df_vocab_map_standard.count()

                                                                                

15762

In [123]:
# All ICD10 source codes (no domain / standard-concept filter) with their
# source_concept_id; dots are removed from source_code as in the query above.
# The table alias here is `csv2`, distinct from the `csv` alias of
# df_vocab_map_standard.
df_vocab_map_source = spark.sql("""
        SELECT 
            REPLACE(source_code, '.', '') as source_code,
            source_concept_id            
        FROM cte_vocab_map_standard csv2
        WHERE source_vocabulary_id = 'ICD10'
""")
# Row count shown as the cell output (also forces the query to run).
df_vocab_map_source.count()



19103

### Condition Occurrence

In [124]:
# Unpartitioned window ordered by infant and birth date, used to number the
# condition rows. With no partition Spark moves all rows to a single partition
# for this step (see the WindowExec warnings in the write cell's output).
window_condition_occurrence = Window.orderBy('person_id_infant', 'dtnasc_sinasc')

In [127]:
# Build the condition_occurrence rows from the enriched input:
#   1. turn the presentation code and the anomaly codes of each record into one
#      array and explode it (one row per candidate code);
#   2. look the code up in the ICD10 vocabulary maps;
#   3. fill the remaining condition_occurrence columns, cast them to their final
#      types and keep only rows with a non-null, non-zero condition_concept_id.
df_condition_occurrence = (
    df_input
    .withColumn('person_id', F.col('person_id_infant'))
#     .filter((F.col('id_cidacs_sinasc_v4').isin(26754812495)) | (F.col('person_id').isin(27109637, 26526944)))
    # Presentation code -> fixed concept id; 0, 9 and any other value become 0.
    .withColumn('cond_concept_tpapresent_sinasc', F.when(F.col('tpapresent_sinasc')==1, 4312344)
                                                   .when(F.col('tpapresent_sinasc')==2, 4195566)
                                                   .when(F.col('tpapresent_sinasc')==3, 81358)
                                                   .when(F.col('tpapresent_sinasc').isin(0, 9), 0)
                                                   .otherwise(0))
    # Wrap it in a single-element array so it can be concatenated with the anomaly codes.
    .withColumn('cond_concept_tpapresent_sinasc', F.array(F.col('cond_concept_tpapresent_sinasc')))
    # Non-empty anomaly strings are split into 4-character blocks; otherwise the
    # raw value is kept as a single-element array.
    .withColumn('cond_concept_codanomal_sinasc', F.when(F.col('codanomal_sinasc')!='', dividir_udf(df_input['codanomal_sinasc'])).otherwise(F.array(F.col('codanomal_sinasc'))))
    .withColumn('cond_concept_codanomal_sinasc', F.concat(F.col('cond_concept_tpapresent_sinasc'), F.col('cond_concept_codanomal_sinasc')))
    # One output row per element of the combined array.
    .withColumn('cond_concept_codanomal_sinasc_exploded', F.explode(F.col('cond_concept_codanomal_sinasc')))
    # The exploded value is used only when idanomal_sinasc == 1; every other case yields 0.
    # NOTE(review): the exploded array also carries the presentation concept, so
    # that concept is replaced by 0 as well (and later filtered out) whenever
    # idanomal_sinasc != 1 — confirm this is intended.
    .withColumn('condition_concept_id_source', F.when(F.col('idanomal_sinasc').isin(0, 2, 88), 0)
                                                 .when(F.col('idanomal_sinasc')==1, F.col('cond_concept_codanomal_sinasc_exploded'))
                                                 .otherwise(0))
    .join(df_vocab_map_standard, F.col('condition_concept_id_source')==df_vocab_map_standard.source_code, how='left')
    # NOTE(review): no column of df_vocab_map_source is read after this join
    # (condition_source_concept_id below comes from the `csv` alias, i.e. the
    # df_vocab_map_standard query). If source_code is not unique in
    # df_vocab_map_source this join can multiply rows — confirm whether
    # 'csv2.source_concept_id' was the intended source column.
    .join(df_vocab_map_source, F.col('condition_concept_id_source')==df_vocab_map_source.source_code, how='left')
    .withColumn('condition_concept_id', F.col('target_concept_id'))
#     .withColumn('condition_concept_id', F.when(
#                                 F.substring(F.col('condition_concept_id_source'),1,1).isin('Q'), 
#                 F.col('target_concept_id')).otherwise(F.col('condition_concept_id_source')))
    # 'csv' is the table alias used in the SQL that builds df_vocab_map_standard.
    .withColumn('condition_source_concept_id', F.col('csv.source_concept_id'))
    .withColumn('condition_start_date', F.col('dtnasc_sinasc'))
    .withColumn('condition_type_concept_id', F.lit(32879))
    # Columns with no source data are created as NULL and typed in the select below.
    .withColumn('condition_start_datetime', F.lit(None))
    .withColumn('condition_end_date', F.lit(None))
    .withColumn('condition_end_datetime', F.lit(None))
    .withColumn('stop_reason', F.lit(None))
    .withColumn('condition_status_concept_id', F.lit(None))
    .withColumn('provider_id', F.lit(None))
    .withColumn('visit_occurrence_id', F.col('visit_occurrence_id_infant'))
    .withColumn('visit_detail_id', F.lit(None))
    .withColumn('condition_source_value', F.col('condition_concept_id_source'))
    .withColumn('condition_status_source_value', F.lit(None))
    # Sequential id offset by 1,000,000,000. The offset is an integer literal
    # (it used to be a string, relying on implicit string-to-number coercion).
    # NOTE(review): rows of the same infant tie on the window's ordering keys, so
    # the id given to each of them is not guaranteed to be stable across runs.
    .withColumn('condition_occurrence_id', F.row_number().over(window_condition_occurrence)+F.lit(1000000000))
    # When no vocabulary row matched and the source value is one of the fixed
    # concept ids assigned above (or '0'), use that value as the concept id.
    .withColumn('condition_concept_id', F.when((F.col('condition_source_concept_id').isNull())
                                                     &(F.col('condition_source_value').isin('0', '4312344', '4195566', '81358'))
                                                     ,F.col('condition_source_value')).otherwise(F.col('condition_concept_id')))
    .select(
                F.col('condition_occurrence_id').cast('integer'),
                F.col('person_id').cast('integer'),
                F.col('condition_concept_id').cast('integer'), # SNOMED
                F.col('condition_start_date').cast('date'),
                F.col('condition_start_datetime').cast('timestamp'),
                F.col('condition_end_date').cast('date'),
                F.col('condition_end_datetime').cast('timestamp'),
                F.col('condition_type_concept_id').cast('integer'),
                F.col('condition_status_concept_id').cast('integer'),
                F.col('stop_reason').cast('string'),
                F.col('provider_id').cast('integer'),
                F.col('visit_occurrence_id').cast('integer'),
                F.col('visit_detail_id').cast('integer'),
                F.col('condition_source_value').cast('string'),
                F.col('condition_source_concept_id').cast('integer'), # ICD-10 (CID-10)
                F.col('condition_status_source_value').cast('string'))

    # Drop rows that ended up without a usable concept id.
    .filter(F.col('condition_concept_id').isNotNull())
    .filter(F.col('condition_concept_id')!=0)

)


# Row count shown as the cell output (also forces the pipeline to run).
df_condition_occurrence.count()

                                                                                

236690

                                                                                

In [107]:
# # Códigos CID que não tem cadastro no Vocabulário
# df = (df_condition_occurrence
#  .select('condition_source_concept_id', 'condition_source_value')
 
#  .filter((F.col('condition_source_concept_id').isNull())
#              &(F.col('condition_source_value').isNotNull())
#              &(F.col('condition_source_value')!='0'))
# #       .limit(10)
#  .select('condition_source_value').distinct()
# #  .toPandas()
# )
# df.toPandas()

In [128]:
# Original note: not yet written to the database, because the ICD (CID) code
# cannot be a string; it must be an integer.
# NOTE(review): the recorded output of this cell shows that the call was executed,
# so the note above may be stale — confirm.
# NOTE(review): write_mode='append' presumably adds the rows again every time this
# cell is re-run — confirm the target table's state before re-executing.
write_to_postgresl(df_condition_occurrence, tb_name='condition_occurrence', write_mode='append')

25/03/19 12:16:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 12:16:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 12:16:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 12:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 12:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 12:17:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 1

'Total time: 0:04:21.617083 and - Total rows: 236690 - Total columns: 16'

## Deprecated

### Salvando CSV 

In [None]:
# df_condition_occurrence.repartition(1).write.option("quoteAll",True).csv('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/omop_tables_revisadas/condition_occurrence', header=True, mode='overwrite')

## SQL de inserção

In [None]:
# -- CREATE TABLE public.condition_occurrence (
# -- 			condition_occurrence_id integer NOT NULL,
# -- 			person_id integer NOT NULL,
# -- 			condition_concept_id integer NOT NULL,
# -- 			condition_start_date date NOT NULL,
# -- 			condition_start_datetime TIMESTAMP NULL,
# -- 			condition_end_date date NULL,
# -- 			condition_end_datetime TIMESTAMP NULL,
# -- 			condition_type_concept_id integer NOT NULL,
# -- 			condition_status_concept_id integer NULL,
# -- 			stop_reason varchar(20) NULL,
# -- 			provider_id integer NULL,
# -- 			visit_occurrence_id integer NULL,
# -- 			visit_detail_id integer NULL,
# -- 			condition_source_value varchar(50) NULL,
# -- 			condition_source_concept_id integer NULL,
# -- 			condition_status_source_value varchar(50) NULL );


# -- CREATE TABLE public.condition_occurrence_pyspark (
# -- 			condition_occurrence_id varchar,
# -- 			person_id varchar,
# -- 			condition_concept_id varchar,
# -- 			condition_start_date varchar,
# -- 			condition_start_datetime varchar,
# -- 			condition_end_date varchar,
# -- 			condition_end_datetime varchar,
# -- 			condition_type_concept_id varchar,
# -- 			condition_status_concept_id varchar,
# -- 			stop_reason varchar,
# -- 			provider_id varchar,
# -- 			visit_occurrence_id varchar,
# -- 			visit_detail_id varchar,
# -- 			condition_source_value varchar,
# -- 			condition_source_concept_id varchar,
# -- 			condition_status_source_value varchar);


# insert into public.condition_occurrence (
# condition_occurrence_id,
# person_id,
# condition_concept_id,
# condition_start_date,
# condition_start_datetime,
# condition_end_date ,
# condition_end_datetime ,
# condition_type_concept_id ,
# condition_status_concept_id ,
# stop_reason ,
# provider_id ,
# visit_occurrence_id ,
# visit_detail_id ,
# condition_source_value ,
# condition_source_concept_id ,
# condition_status_source_value 
# )
# SELECT 
# cast(condition_occurrence_id as integer),
# cast(person_id as integer),
# cast(case when condition_concept_id = '' then null else condition_concept_id end as integer),
# cast(case when  condition_start_date = '' then null else condition_start_date end as date),
# cast(case when condition_start_datetime = '' then null else condition_start_datetime end as timestamp),
# cast(case when condition_end_date ='' then null else condition_end_date end as date) ,
# cast(case when condition_end_datetime = '' then null else condition_end_datetime end as timestamp) ,
# cast(case when condition_type_concept_id = '' then null else condition_type_concept_id end as integer) ,
# cast(case when condition_status_concept_id = '' then null else condition_status_concept_id end as integer) ,
# stop_reason ,
# cast(case when provider_id = '' then null else provider_id end as integer) ,
# cast(case when visit_occurrence_id = '' then null else visit_occurrence_id end as integer) ,
# cast(case when visit_detail_id ='' then null else visit_detail_id end as integer) ,
# condition_source_value ,
# cast(case when condition_source_concept_id = '' then null else condition_source_concept_id end as integer) ,
# condition_status_source_value
# 	FROM public.condition_occurrence_pyspark;


