In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import types 
from pyspark.sql.types import DateType, StringType
from pyspark.sql.functions import regexp_extract, udf
from pyspark.sql.functions import year, month, col, sum, udf, substring, split, regexp_replace, when, lower, upper, countDistinct
from pyspark.sql.functions import isnan, count, lit
import glob as gb
import glob
#from tqdm import tqdm
import re 
from functools import reduce
import numpy as np
import sys
import pandas as pd

In [2]:
sc = SparkContext()
sc.stop()

conf = SparkConf().setAppName("SRAG_cases")

conf = (conf.setMaster("local[*]")
       .set("spark.executor.memory", "3GB")
       .set("spark.driver.memory", "20GB"))

sc = SparkContext(conf = conf)
spark = SparkSession(sc)
spark

In [3]:
#define paths to files
spark = SparkSession.builder.appName("SRAG").getOrCreate()

# Read data

## Get path names

In [4]:
path = '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG'
filenames = glob.glob(path + "/*.csv")
filenames

['/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD11.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD10.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD12.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD13.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD17.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD16.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD14.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD15.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD18.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD19.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD22-12-12-2022.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESO

## Read and get dataframe columns for variables' harmonization 

In [5]:
dataframes = []
lst_df_columns = []
for f in filenames:
    
    print(f)
    
    #Get dataframe delimiter
    delimiter=spark.createDataFrame(sc.textFile(f).take(1),StringType()).\
    withColumn("chars",regexp_extract(col("value"),"(,|;|\\|)",1)).\
    select("chars").\
    collect()[0][0]
    
    print(delimiter)
    
    df = spark.read.\
    option("delimiter",delimiter).\
    option("header",True).\
    csv(f)
    
    dataframes.append(df)
    lst_df_columns.append([f[-17:]] + df.columns)
    
# Old method
#lst_df_columns = []
#for f in filenames:
#    print(f)
#    df = pd.read_csv(f, low_memory = False, encoding="iso-8859-1")  
#    df = spark.read.format("csv").option("header",True).load(f)
#    lst_df_columns.append([f[-17:]] + df.columns)

/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD11.csv
,
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD10.csv
,
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD12.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD13.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD17.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD16.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD14.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD15.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD18.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD19.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD22-12-12-2022.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD09

### Save columns names

In [None]:
#sc.parallelize([lst_df_columns]).saveAsTextFile("/Users/julianeoliveira/Documents/Projects/AESOP/Documentation - Data on Respiratory diseases/dic_srag.csv")

### Search variable function

In [6]:
def check_variables(variable): 

    for value in lst_df_columns:
        print(value[0], variable in value)

In [7]:
check_variables('NU_ANO')

SRAG/INFLUD11.csv True
SRAG/INFLUD10.csv False
SRAG/INFLUD12.csv True
SRAG/INFLUD13.csv True
SRAG/INFLUD17.csv True
SRAG/INFLUD16.csv True
SRAG/INFLUD14.csv True
SRAG/INFLUD15.csv True
SRAG/INFLUD18.csv True
SRAG/INFLUD19.csv False
22-12-12-2022.csv False
SRAG/INFLUD09.csv True
20-12-12-2022.csv False
21-12-12-2022.csv False


In [8]:
dataframes[0].select(countDistinct('NU_ANO')).show()

+----------------------+
|count(DISTINCT NU_ANO)|
+----------------------+
|                     2|
+----------------------+



In [10]:
dataframes[0].groupBy('NU_ANO').count().show()

+------+-----+
|NU_ANO|count|
+------+-----+
|  2012|   82|
|  2011| 4333|
+------+-----+



In [11]:
dataframes[1] = dataframes[1].withColumn("ano", lit(2010))

dataframes[9] = dataframes[9].withColumn("ano", lit(2019))

dataframes[10] = dataframes[10].withColumn("ano", lit(2022))

dataframes[12] = dataframes[12].withColumn("ano", lit(2020))

dataframes[13] = dataframes[13].withColumn("ano", lit(2021))

In [None]:
dataframes[1].select('ano').count()

# Initial counts

In [12]:
for i in range(0,len(dataframes)):
    print(lst_df_columns[i][0], dataframes[i].count())

SRAG/INFLUD11.csv 4415
SRAG/INFLUD10.csv 11318
SRAG/INFLUD12.csv 21163
SRAG/INFLUD13.csv 36563
SRAG/INFLUD17.csv 29551
SRAG/INFLUD16.csv 54380
SRAG/INFLUD14.csv 18996
SRAG/INFLUD15.csv 14553
SRAG/INFLUD18.csv 47756
SRAG/INFLUD19.csv 48528
22-12-12-2022.csv 516626
SRAG/INFLUD09.csv 88354
20-12-12-2022.csv 1200995
21-12-12-2022.csv 1733910


In [13]:
def count_distinct_variable(lst_dfs,var):
    for i in range(0,len(lst_dfs)):
        print(lst_df_columns[i][0], lst_dfs[i].select(countDistinct(var)).show())

In [14]:
def count_group_variable(lst_dfs,var):
    for i in range(0,len(lst_dfs)):
        print(lst_df_columns[i][0], lst_dfs[i].groupBy(var).count().show(25))

In [15]:
def count_null(df):
    Dict_Null = {col:df.filter(df[col].isNull()).count() for col in df.columns}
    print(Dict_Null)


# Clean and extract variables from datasets

In [16]:
cols_to_select = ['sg_uf_not', 'id_municip','sg_uf', 'id_mn_resi',
                 'dt_notific','sem_not', 'nu_ano','dt_sin_pri','sem_pri',
                  'classi_fin','criterio', 'co_mun_not']

#id_municip - Município onde está localizada a Unidade Sentinela que realizou a notificação. 
# This variable is substituted by 'co_mun_not' for the years of 2019 and so

# Number of states that reported a case
#count_distinct_variable(dataframes,"sg_uf_not")

# Number of cases by UF of notification
#count_group_variable(dataframes,"sg_uf_not")

In [None]:
check_variables('CO_MUN_NOT')

## Convert columns name to lowercase

In [17]:
for j in range(0, len(dataframes)):
    df = dataframes[j]
    for col in df.columns:
        df = df.withColumnRenamed(col, col.lower())
        dataframes[j] = df  

### Rename columns

#### Municipality of residence

In [18]:
for i in range(0,9):
    if "id_mn_resi" in dataframes[i].columns:
        dataframes[i] = dataframes[i].withColumnRenamed("id_mn_resi", "codmunres")

In [19]:
dataframes[11] = dataframes[11].withColumnRenamed("id_mn_resi", "codmunres")

In [20]:
for i in range(len(dataframes)):
    if "co_mun_res" in dataframes[i].columns:
        dataframes[i] = dataframes[i].withColumnRenamed("co_mun_res", "codmunres")

#### Municipality of notification

In [21]:
for i in range(0,9):
    if "id_municip" in dataframes[i].columns:
        dataframes[i] = dataframes[i].withColumnRenamed("id_municip", "codmunnot")

In [22]:
dataframes[11] = dataframes[11].withColumnRenamed("id_municip", "codmunnot")

In [23]:
for i in range(len(dataframes)):
    if "co_mun_not" in dataframes[i].columns:
        dataframes[i] = dataframes[i].withColumnRenamed("co_mun_not", "codmunnot")

#### Year

In [24]:
for i in range(len(dataframes)):
    if "nu_ano" in dataframes[i].columns:
        dataframes[i] = dataframes[i].withColumnRenamed("nu_ano", "ano")

In [None]:
'codmunnot' in dataframes[0].columns

# Select variables from dataframes

In [25]:
dfs = []

for i in range(len(dataframes)):
    
    cols = ['ano','sg_uf','sg_uf_not','codmunres','codmunnot','dt_sin_pri','dt_notific','sem_not','classi_fin','criterio']
    
    df_new = dataframes[i].select(*cols)
    
    dfs.append(df_new)

In [26]:
dfs[1].show()

+----+-----+---------+---------+---------+----------+----------+-------+----------+--------+
| ano|sg_uf|sg_uf_not|codmunres|codmunnot|dt_sin_pri|dt_notific|sem_not|classi_fin|criterio|
+----+-----+---------+---------+---------+----------+----------+-------+----------+--------+
|2010|   11|       11|   110002|   110020|15/04/2010|21/04/2010| 201016|       3.0|    null|
|2010|   11|       11|   110002|   110020|30/04/2010|04/05/2010| 201018|       1.0|     1.0|
|2010|   11|       11|   110002|   110020|03/05/2010|11/05/2010| 201019|       3.0|     1.0|
|2010|   11|       11|   110004|   110020|08/03/2010|22/03/2010| 201012|       1.0|     2.0|
|2010|   11|       11|   110010|   110020|28/01/2010|02/02/2010| 201005|       3.0|     1.0|
|2010|   11|       11|   110010|   110010|14/03/2010|17/03/2010| 201011|       3.0|     2.0|
|2010|   11|       11|   110010|   110010|03/03/2010|07/03/2010| 201010|       3.0|    null|
|2010|   11|       11|   110010|   110010|18/04/2010|22/04/2010| 20101

In [27]:
#function to combine all dfs
def unionAll(dfs):
    return reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs)

In [28]:
final_data = unionAll(dfs)

In [29]:
final_data.show()

+----+-----+---------+---------+---------+----------+----------+-------+----------+--------+
| ano|sg_uf|sg_uf_not|codmunres|codmunnot|dt_sin_pri|dt_notific|sem_not|classi_fin|criterio|
+----+-----+---------+---------+---------+----------+----------+-------+----------+--------+
|2011|   11|       11|   110020|   110020|15/02/2011|21/02/2011| 201108|       3.0|     1.0|
|2011|   11|       53|   110020|   530010|29/03/2011|07/04/2011| 201114|       3.0|     1.0|
|2011|   11|       11|   110020|   110020|28/04/2011|04/05/2011| 201118|       3.0|     1.0|
|2011|   11|       11|   110020|   110020|23/08/2011|27/08/2011| 201134|       1.0|     1.0|
|2011|   11|       11|   110012|   110020|18/08/2011|31/08/2011| 201135|       3.0|     1.0|
|2011|   11|       11|   110020|   110020|01/08/2011|05/08/2011| 201131|       3.0|     1.0|
|2011|   11|       11|   110011|   110020|19/08/2011|19/09/2011| 201138|       3.0|    null|
|2011|   11|       11|   110011|   110020|06/10/2011|11/10/2011| 20114

In [30]:
final_data.count()

3827108

# Save data

In [31]:
final_data.coalesce(1).write.format("csv").save("/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/Filtered_raw/srag_2009_2022.csv")