### Imports e Configs

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd



In [2]:
# Create the Spark session (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("Data Analysis")
    # Memory and CPU sizing for the executors and the driver.
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "8")
    # 96 = 3 x (8 executors x 4 cores); presumably chosen to give ~3 tasks
    # per core -- confirm before changing the executor sizing above.
    .config("spark.sql.shuffle.partitions", "96")
    .config("spark.default.parallelism", "96")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/17 14:15:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Spark-side notebook settings: eager evaluation of DataFrames in the REPL
# (with its truncate / max-rows limits) and the debug to-string field limit.
for conf_key, conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 100),
    ("spark.sql.repl.eagerEval.maxNumRows", 300),
    ("spark.sql.debug.maxToStringFields", 100),
):
    spark.conf.set(conf_key, conf_value)

# pandas-side display settings: None removes the column / row display limits.
for pandas_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(pandas_option, None)

### Lendo dados enriquecidos

In [4]:
# Load the enriched dataset from parquet into a Spark DataFrame.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere unless the same location exists.
df_input = spark.read.parquet('/home/ricardo.neto/Documents/banco_original_enriched')



In [5]:
# Total number of rows in the enriched input DataFrame.
df_input.count()

                                                                                

24696018

In [6]:
# Column names available in the enriched input DataFrame.
df_input.columns

['id_cidacs_sinasc_v4',
 'id_cidacs_mae_sinasc',
 'apgar1_sinasc',
 'apgar5_sinasc',
 'cod_raca_cor_pessoa_eq',
 'codanomal_sinasc',
 'codmunres_sinasc',
 'codocupmae_sinasc',
 'consprenat_sinasc',
 'consultas_sinasc',
 'dt_notific_mae',
 'dtnasc_sinasc',
 'dtnascmae_sinasc',
 'dtobito_sim.y',
 'escmae_sinasc',
 'estcivmae_sinasc',
 'gravidez_sinasc',
 'id_agravo_mae',
 'idademae_sinasc',
 'idanomal_sinasc',
 'mesprenat_sinasc',
 'obitograv_sim.y',
 'obitopuerp_sim.y',
 'parto_sinasc',
 'peso_sinasc',
 'qtdfilmort_sinasc',
 'qtdfilvivo_sinasc',
 'semagestac_sinasc',
 'sexo_sinasc',
 'tipobito_sim.y',
 'tpapresent_sinasc',
 'tpconfirma_mae',
 'tpesquema_mae',
 'tpevidenci_mae',
 'tpteste1_mae',
 'tra_dt_sc',
 'person_id',
 'person_id_infant',
 'location_id',
 'dt_nascimento_calc_mae']

### Death

In [7]:
# Build the rows for the `death` table from the enriched input.
# Open questions left by the original author (still unresolved):
#   - Are these mothers who died during childbirth?
#   - Is `dtobito_sim.y` the date of death of the mother or of the child?
# NOTE(review): rows are keyed by `person_id`; confirm it identifies the same
# person the death date refers to.
df_death = (
    df_input
    .select(
        'person_id',
        # The column name contains a dot, so it must be back-quoted.
        F.to_date(F.col('`dtobito_sim.y`'), 'yyyy-MM-dd').alias('death_date'),
        F.lit(None).cast('timestamp').alias('death_datetime'),
        # Fixed type concept id for every row -- presumably an OMOP type
        # concept; TODO confirm what 32849 stands for.
        F.lit(32849).alias('death_type_concept_id'),
        # No cause-of-death information is populated here: typed NULLs only.
        F.lit(None).cast('int').alias('cause_concept_id'),
        F.lit(None).cast('string').alias('cause_source_value'),
        F.lit(None).cast('int').alias('cause_source_concept_id'),
    )
    # Keep only records that actually carry a (parseable) death date.
    .where(F.col('death_date').isNotNull())
)

# Number of death records produced.
df_death.count()

284291

### Salvando CSV 

In [8]:
# Export the death rows as CSV: one partition (hence a single part file),
# every value quoted, header row included, replacing any previous export.
(
    df_death
    .repartition(1)
    .write
    .option("quoteAll", True)
    .mode('overwrite')
    .csv('/home/ricardo.neto/Documents/df_death', header=True)
)

                                                                                

## SQL de inserção

In [None]:
-- CREATE TABLE public.death (
-- 			person_id integer NOT NULL,
-- 			death_date date NOT NULL,
-- 			death_datetime TIMESTAMP NULL,
-- 			death_type_concept_id integer NULL,
-- 			cause_concept_id integer NULL,
-- 			cause_source_value varchar(50) NULL,
-- 			cause_source_concept_id integer NULL );



-- CREATE TABLE public.death_pyspark (
-- 			person_id varchar,
-- 			death_date varchar,
-- 			death_datetime varchar,
-- 			death_type_concept_id varchar,
-- 			cause_concept_id varchar,
-- 			cause_source_value varchar,
-- 			cause_source_concept_id varchar);


-- Copy the staged rows (all-varchar table public.death_pyspark, loaded from
-- the CSV export) into the typed public.death table.
-- The CSV is written with every value quoted, so missing values may arrive as
-- empty strings rather than NULL; nullif(col, '') turns them back into NULL
-- before casting. This is applied to every nullable column, including
-- cause_source_value, which would otherwise be stored as '' instead of NULL.
-- person_id and death_date are NOT NULL in the target and are cast directly.
insert into public.death (
	person_id,
	death_date,
	death_datetime,
	death_type_concept_id,
	cause_concept_id,
	cause_source_value,
	cause_source_concept_id
)
SELECT
	cast(person_id as integer),
	cast(death_date as date),
	cast(nullif(death_datetime, '') as timestamp),
	cast(nullif(death_type_concept_id, '') as integer),
	cast(nullif(cause_concept_id, '') as integer),
	nullif(cause_source_value, ''),
	cast(nullif(cause_source_concept_id, '') as integer)
	FROM public.death_pyspark;
			

In [1]:
import os

In [4]:
# Directory to inspect -- presumably where the `observation_batch` CSV export
# is written; verify against the job that produces it.
path = '/data/IDAF/PROJETOS/PARCERIA_CIDACS_PHDC/omop_scripts_base_16mi_karine/csv/observation_batch'

# Names of the entries currently present in that directory.
files = os.listdir(path)

In [5]:
# Show the directory listing.
files

['_temporary']