### Configurações Pyspark

In [1]:
import os
from datetime import datetime

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F

def write_to_postgresl(df, tb_name=None, write_mode='None'):
    """Write a Spark DataFrame to PostgreSQL over JDBC and report the outcome.

    The connection is taken from the notebook-level ``url`` and ``properties``
    variables.

    Args:
        df: Spark DataFrame to persist.
        tb_name: Name of the target table. Required.
        write_mode: Save mode forwarded to ``df.write.jdbc`` (for example
            ``'append'`` or ``'overwrite'``). Required; the default value is
            only a "not provided" marker.

    Returns:
        str: Summary with the elapsed write time, the row count and the
        column count of ``df``.

    Raises:
        Exception: If ``tb_name`` or ``write_mode`` was not provided, or
            re-raised from the JDBC write when it fails.
    """
    if tb_name is None:
        raise Exception('Informe o nome da tabela')
    # The signature default is the *string* 'None', so a plain `is None` test
    # never caught a missing mode; treat both spellings as "not provided".
    if write_mode is None or write_mode == 'None':
        raise Exception('Informe o mode de escrita: append ou overwrite')

    start_time = datetime.now()
    try:
        df.write.jdbc(url, table=tb_name, mode=write_mode, properties=properties)
    except Exception as e:
        print(f'Erro: {e}')
        # Propagate the failure: returning the usual summary would make a
        # failed load look like a successful one.
        raise
    total_time = str(datetime.now() - start_time)

    # Rows are counted once, after the write; the count is not part of the
    # measured write time.
    return f'Total time: {total_time} and - Total rows: {df.count()} - Total columns: {len(df.columns)}'



# Create the Spark session
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    # JDBC driver jar handed to Spark via spark.jars.
    .config('spark.jars', '/data/IDAF/DATABASECONNECTOR_JAR_FOLDER/postgresql-42.2.18.jar')
    # Resource sizing.
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    # Partition counts.
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)


# JDBC connection settings read by write_to_postgresl.
# Each value can be overridden through an environment variable so that
# credentials do not have to be stored in the notebook; the fallbacks keep the
# values this notebook used before.
# NOTE(review): the password fallback is still a hard-coded secret. Remove it
# once POSTGRES_PASSWORD is provided by the runtime environment.
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")

properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "cidacs"),
    "driver": "org.postgresql.Driver",
}

25/05/08 10:05:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/08 10:05:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Configurações de exibição (Spark e Pandas)

In [2]:
# Display tuning for the notebook session.
# Spark runtime options: REPL eager evaluation and the field limit used when
# printing plans/schemas.
for _conf_key, _conf_value in [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
]:
    spark.conf.set(_conf_key, _conf_value)

# pandas: None lifts the display limit for both columns and rows.
for _pd_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_pd_option, None)

### Lendo dados enriquecidos

In [6]:
# Load the enriched source dataset (Parquet) that the cells below read from.
df_input = spark.read.parquet('/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/banco_original_enriched')

In [8]:
# Number of input rows that have a value in tra_dt_sc.
df_input.where(F.col('tra_dt_sc').isNotNull()).select('tra_dt_sc').count()



44953

### Drug exposure

In [4]:
## The number of drug exposures is very low because rows without a start date
## (tra_dt_sc) are dropped.
# Builds the 23-column drug_exposure frame from df_input. Every derived column
# depends only on person_id, tpesquema_mae and tra_dt_sc.
# NOTE(review): the numeric literals below are presumably OMOP concept ids and
# dose amounts for each tpesquema_mae code - confirm against the vocabulary.
df_drug_exposure = (
    df_input
    .select('person_id', 'tpesquema_mae', 'tra_dt_sc')
    .withColumn('drug_exposure_start_date', F.to_date(F.col('tra_dt_sc'), 'yyyy-MM-dd'))
    # Drop rows without a start date BEFORE numbering: the window below has no
    # partition, so Spark sorts every row it receives on a single partition.
    # Filtering first keeps that sort small and makes the ids run 1..N without gaps.
    .filter(F.col('drug_exposure_start_date').isNotNull())
    # Order by all three source columns so the numbering is reproducible: rows
    # that still tie are identical in every output column.
    .withColumn(
        'drug_exposure_id',
        F.row_number().over(Window.orderBy('person_id', 'tra_dt_sc', 'tpesquema_mae')),
    )
    # Codes 88/99 and anything unmapped (including null) fall through to 0.
    .withColumn(
        'drug_concept_id',
        F.when(F.col('tpesquema_mae').isin(1, 2, 3), 1728416)
        .when(F.col('tpesquema_mae').isin(4), 1738521)
        .when(F.col('tpesquema_mae').isin(5), 4118638)
        .otherwise(0),
    )
    # End date = start + 1 / 7 / 14 days depending on the code; every other
    # code ends on the start date itself.
    .withColumn(
        'drug_exposure_end_date',
        F.when(F.col('tpesquema_mae') == 1, F.date_add('tra_dt_sc', 1))
        .when(F.col('tpesquema_mae') == 2, F.date_add('tra_dt_sc', 7))
        .when(F.col('tpesquema_mae').isin(3, 4), F.date_add('tra_dt_sc', 14))
        .otherwise(F.col('tra_dt_sc')),
    )
    # Code 0 has no quantity; codes 1-4 have fixed amounts; everything else is 0.
    .withColumn(
        'quantity',
        F.when(F.col('tpesquema_mae') == 0, F.lit(None).cast('int'))
        .when(F.col('tpesquema_mae') == 1, F.lit(2400000).cast('int'))
        .when(F.col('tpesquema_mae') == 2, F.lit(4800000).cast('int'))
        .when(F.col('tpesquema_mae') == 3, F.lit(7200000).cast('int'))
        .when(F.col('tpesquema_mae') == 4, F.lit(100).cast('int'))
        .otherwise(0),
    )
    # NOTE(review): unmapped or null codes end up as the string '0' here while
    # codes 0/5/88/99 are null - confirm whether '0' is intended.
    .withColumn(
        'dose_unit_source_value',
        F.when(F.col('tpesquema_mae').isin(1, 2, 3), F.lit(8718).cast('int'))
        .when(F.col('tpesquema_mae') == 4, F.lit(8576).cast('int'))
        .when(
            (F.col('tpesquema_mae') == 0)
            | (F.col('tpesquema_mae') == 5)
            | (F.col('tpesquema_mae') == 88)
            | (F.col('tpesquema_mae') == 99),
            F.lit(None).cast('int'),
        )
        .otherwise(0),
    )
    # Final projection: column order and types of the target table. Columns
    # with no source data are emitted as typed nulls.
    .select(
        F.col('drug_exposure_id').cast('integer'),
        F.col('person_id').cast('integer'),
        F.col('drug_concept_id').cast('integer'),
        F.col('drug_exposure_start_date').cast('date'),
        F.lit(None).cast('timestamp').alias('drug_exposure_start_datetime'),
        F.col('drug_exposure_end_date').cast('date'),
        F.lit(None).cast('timestamp').alias('drug_exposure_end_datetime'),
        F.lit(None).cast('date').alias('verbatim_end_date'),
        F.lit(32879).cast('integer').alias('drug_type_concept_id'),
        F.lit(None).cast('string').alias('stop_reason'),
        F.lit(None).cast('integer').alias('refills'),
        F.col('quantity').cast('float'),
        F.lit(None).cast('integer').alias('days_supply'),
        F.lit(None).cast('string').alias('sig'),
        F.lit(None).cast('integer').alias('route_concept_id'),
        F.lit(None).cast('string').alias('lot_number'),
        F.lit(None).cast('integer').alias('provider_id'),
        F.lit(None).cast('integer').alias('visit_occurrence_id'),
        F.lit(None).cast('integer').alias('visit_detail_id'),
        F.lit(None).cast('string').alias('drug_source_value'),
        F.lit(None).cast('integer').alias('drug_source_concept_id'),
        F.lit(None).cast('string').alias('route_source_value'),
        F.col('dose_unit_source_value').cast('string'),
    )
)

df_drug_exposure.count()

                                                                                

44953

## Salvando no postgresql

In [7]:
# Load the drug_exposure frame into PostgreSQL using save mode 'append'.
write_to_postgresl(
    df_drug_exposure,
    tb_name='drug_exposure',
    write_mode='append',
)

25/03/14 08:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 08:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 08:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 08:39:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/14 08:39:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

'Total time: 0:00:23.827029 and - Total rows: 44953 - Total columns: 23'

### Deprecated

### Salvando CSV 

In [None]:
# df_drug_exposure.repartition(1).write.option("quoteAll",True).csv('/data/IDAF/SHARED/omop_tables_revisadas/drug_exposure', header=True, mode='overwrite')

## SQL de inserção

In [None]:
# CREATE TABLE public.drug_exposure (
# 			drug_exposure_id integer NOT NULL,
# 			person_id integer NOT NULL,
# 			drug_concept_id integer NOT NULL,
# 			drug_exposure_start_date date NOT NULL,
# 			drug_exposure_start_datetime TIMESTAMP NULL,
# 			drug_exposure_end_date date NOT NULL,
# 			drug_exposure_end_datetime TIMESTAMP NULL,
# 			verbatim_end_date date NULL,
# 			drug_type_concept_id integer NOT NULL,
# 			stop_reason varchar(20) NULL,
# 			refills integer NULL,
# 			quantity NUMERIC NULL,
# 			days_supply integer NULL,
# 			sig TEXT NULL,
# 			route_concept_id integer NULL,
# 			lot_number varchar(50) NULL,
# 			provider_id integer NULL,
# 			visit_occurrence_id integer NULL,
# 			visit_detail_id integer NULL,
# 			drug_source_value varchar(50) NULL,
# 			drug_source_concept_id integer NULL,
# 			route_source_value varchar(50) NULL,
# 			dose_unit_source_value varchar(50) NULL );



# CREATE TABLE public.drug_exposure_pyspark (
# 			drug_exposure_id varchar,
# 			person_id varchar,
# 			drug_concept_id varchar,
# 			drug_exposure_start_date varchar,
# 			drug_exposure_start_datetime varchar,
# 			drug_exposure_end_date varchar,
# 			drug_exposure_end_datetime varchar,
# 			verbatim_end_date varchar,
# 			drug_type_concept_id varchar,
# 			stop_reason varchar,
# 			refills varchar,
# 			quantity varchar,
# 			days_supply varchar,
# 			sig varchar,
# 			route_concept_id varchar,
# 			lot_number varchar,
# 			provider_id varchar,
# 			visit_occurrence_id varchar,
# 			visit_detail_id varchar,
# 			drug_source_value varchar,
# 			drug_source_concept_id varchar,
# 			route_source_value varchar,
# 			dose_unit_source_value varchar)


# insert into public.drug_exposure (
# drug_exposure_id,
# person_id,
# drug_concept_id,
# drug_exposure_start_date,
# drug_exposure_start_datetime,
# drug_exposure_end_date,
# drug_exposure_end_datetime,
# verbatim_end_date,
# drug_type_concept_id,
# stop_reason,
# refills,
# quantity,
# days_supply,
# sig,
# route_concept_id,
# lot_number,
# provider_id,
# visit_occurrence_id,
# visit_detail_id,
# drug_source_value,
# drug_source_concept_id,
# route_source_value,
# dose_unit_source_value
# )
# SELECT 
# cast(case when drug_exposure_id='' then null else drug_exposure_id end as integer),
# cast(case when person_id='' then null else person_id end as integer),
# cast(case when drug_concept_id='' then null else drug_concept_id end as integer),
# cast(case when drug_exposure_start_date='' then null else drug_exposure_start_date end as date),
# cast(case when drug_exposure_start_datetime='' then null else drug_exposure_start_datetime end as timestamp),
# cast(case when drug_exposure_end_date='' then null else drug_exposure_end_date end as date),
# cast(case when drug_exposure_end_datetime='' then null else drug_exposure_end_datetime end as timestamp),
# cast(case when verbatim_end_date='' then null else verbatim_end_date end as date),
# cast(case when drug_type_concept_id='' then null else drug_type_concept_id end as integer),
# stop_reason,
# cast(case when refills='' then null else refills end as integer),
# cast(case when quantity='' then null else quantity end as decimal),
# cast(case when days_supply='' then null else days_supply end as integer),
# sig,
# cast(case when route_concept_id='' then null else route_concept_id end as integer),
# case when lot_number='' then null else lot_number end,
# cast(case when provider_id='' then null else provider_id end as integer),
# cast(case when visit_occurrence_id='' then null else visit_occurrence_id end as integer),
# cast(case when visit_detail_id='' then null else visit_detail_id end as integer),
# drug_source_value,
# cast(case when drug_source_concept_id='' then null else drug_source_concept_id end as integer),
# route_source_value,
# dose_unit_source_value
# 	FROM public.drug_exposure_pyspark;
