In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import types 
from pyspark.sql.types import DateType, StringType
from pyspark.sql.functions import regexp_extract, udf
from pyspark.sql.functions import year, month, col, sum, udf, substring, split, regexp_replace, when, lower, upper, countDistinct
import glob as gb
import glob
#from tqdm import tqdm
import re 
from functools import reduce
import numpy as np
import sys
import pandas as pd

In [2]:
sc = SparkContext()
sc.stop()

conf = SparkConf().setAppName("SRAG_cases")

conf = (conf.setMaster("local[*]")
       .set("spark.executor.memory", "3GB")
       .set("spark.driver.memory", "20GB"))

sc = SparkContext(conf = conf)
spark = SparkSession(sc)
spark

In [3]:
#define paths to files
spark = SparkSession.builder.appName("SRAG").getOrCreate()

# Read data

## Get path names

In [4]:
path = '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG'
filenames = glob.glob(path + "/*.csv")
filenames

['/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD11.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD10.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD12.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD13.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD17.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD16.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD14.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD15.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD18.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD19.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD22-12-12-2022.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESO

## Read and get dataframe columns for variables' harmonization 

In [5]:
dataframes = []
lst_df_columns = []
for f in filenames:
    
    print(f)
    
    #Get dataframe delimiter
    delimiter=spark.createDataFrame(sc.textFile(f).take(1),StringType()).\
    withColumn("chars",regexp_extract(col("value"),"(,|;|\\|)",1)).\
    select("chars").\
    collect()[0][0]
    
    print(delimiter)
    
    df = spark.read.\
    option("delimiter",delimiter).\
    option("header",True).\
    csv(f)
    
    dataframes.append(df)
    lst_df_columns.append([f[-17:]] + df.columns)
    
# Old method
#lst_df_columns = []
#for f in filenames:
#    print(f)
#    df = pd.read_csv(f, low_memory = False, encoding="iso-8859-1")  
#    df = spark.read.format("csv").option("header",True).load(f)
#    lst_df_columns.append([f[-17:]] + df.columns)

/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD11.csv
,
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD10.csv
,
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD12.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD13.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD17.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD16.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD14.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD15.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD18.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD19.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD22-12-12-2022.csv
;
/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD09

### Save columns names

In [None]:
#sc.parallelize([lst_df_columns]).saveAsTextFile("/Users/julianeoliveira/Documents/Projects/AESOP/Documentation - Data on Respiratory diseases/dic_srag.csv")

### Search variable function

In [6]:
def check_variables(variable): 

    for value in lst_df_columns:
        print(value[0], variable in value)

In [7]:
check_variables('CO_MUN_NOT')

SRAG/INFLUD11.csv False
SRAG/INFLUD10.csv False
SRAG/INFLUD12.csv False
SRAG/INFLUD13.csv False
SRAG/INFLUD17.csv False
SRAG/INFLUD16.csv False
SRAG/INFLUD14.csv False
SRAG/INFLUD15.csv False
SRAG/INFLUD18.csv False
SRAG/INFLUD19.csv True
22-12-12-2022.csv True
SRAG/INFLUD09.csv False
20-12-12-2022.csv True
21-12-12-2022.csv True


# Initial counts

In [8]:
for i in range(0,len(dataframes)):
    print(lst_df_columns[i][0], dataframes[i].count())

SRAG/INFLUD11.csv 4415
SRAG/INFLUD10.csv 11318
SRAG/INFLUD12.csv 21163
SRAG/INFLUD13.csv 36563
SRAG/INFLUD17.csv 29551
SRAG/INFLUD16.csv 54380
SRAG/INFLUD14.csv 18996
SRAG/INFLUD15.csv 14553
SRAG/INFLUD18.csv 47756
SRAG/INFLUD19.csv 48528
22-12-12-2022.csv 516626
SRAG/INFLUD09.csv 88354
20-12-12-2022.csv 1200995
21-12-12-2022.csv 1733910


In [9]:
def count_distinct_variable(lst_dfs,var):
    for i in range(0,len(lst_dfs)):
        print(lst_df_columns[i][0], lst_dfs[i].select(countDistinct(var)).show())

In [10]:
def count_group_variable(lst_dfs,var):
    for i in range(0,len(lst_dfs)):
        print(lst_df_columns[i][0], lst_dfs[i].groupBy(var).count().show(25))

# Clean and extract variables from datasets

In [11]:
cols_to_select = ['sg_uf_not', 'id_municip','seg_uf', 'id_mn_resi',
                 'dt_notific','sem_not', 'nu_ano','dt_sin_pri','sem_pri',
                  'classi_fin','criterio', 'co_mun_not']

#id_municip - Município onde está localizada a Unidade Sentinela que realizou a notificação. 
# This variable is substituted by 'co_mun_not' for the years of 2019 and so

## Convert columns name to lowercase

In [12]:
for j in range(0, len(dataframes)):
    df = dataframes[j]
    for col in df.columns:
        df = df.withColumnRenamed(col, col.lower())
        dataframes[j] = df  

In [None]:
#Change variable names
for df in dataframes:
    if "id_mn_resi" in df.columns:
        df = df.withColumnRenamed("id_mn_resi", "codmunres")
        
    if "co_mun_res" in df.columns:
        df = df.withColumnRenamed("co_mun_res", "codmunres")

In [None]:
# Number of states that reported a case
#count_distinct_variable(dataframes,"sg_uf_not")

In [None]:
# Number of cases by UF of notification
#count_group_variable(dataframes,"sg_uf_not")

In [None]:
dfs = []

for df in dataframes:
    
    cols = ['sg_uf_not','id_municip']
    
    df_new = df.select(*cols)
    
    dfs.append(df_new)

In [None]:
dfs[0].show()

### 'co_mun_not' and  'id_municip'

In [11]:
filenames

['/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD11.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD10.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD12.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD13.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD17.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD16.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD14.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD15.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD18.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD19.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESOP datalake/SRAG/INFLUD22-12-12-2022.csv',
 '/Users/julianeoliveira/Documents/Projects/AESOP/AESO

In [24]:
dataframes[9].select('co_mun_not').show()

+----------+
|co_mun_not|
+----------+
|    310620|
|    355030|
|    261160|
|    350950|
|    260410|
|    261160|
|    500270|
|    261160|
|    261160|
|    261160|
|    352590|
|    261160|
|    330455|
|    261160|
|    261160|
|    330455|
|    261160|
|    261160|
|    261160|
|    354980|
+----------+
only showing top 20 rows



In [40]:
dataframes[13].select('id_municip').show()

+-------------------+
|         id_municip|
+-------------------+
|             MACEIO|
|       CAMPO GRANDE|
|        JOAO PESSOA|
|           CASCAVEL|
|          GUARULHOS|
|          SAO PAULO|
|SAO JOSE DOS CAMPOS|
|            ITURAMA|
|            JACAREI|
|             CUIABA|
|          FORTALEZA|
|     RIO DE JANEIRO|
|          SAO PAULO|
|           LONDRINA|
|              SOUSA|
|           CAMPINAS|
|           SOROCABA|
|           SANTAREM|
|          SAO PAULO|
|          PESQUEIRA|
+-------------------+
only showing top 20 rows



In [20]:
data19.select(countDistinct('flag')).show()

+--------------------+
|count(DISTINCT flag)|
+--------------------+
|                   1|
+--------------------+



### We will extract the variables for fist analysis

In [None]:
#Define functions used
#function to combine all dfs
def unionAll(dfs):
    return reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs)

#function to correct date values
def correcting_data(x):
    try:
        if len(x) == 7:
            return x[0:9] + 0 + x[-1]
        else:
            return x
    except:
        return None

In [None]:
#Convert to udf all function that will be used
udf_correcting_data = udf(correcting_data, StringType())

In [None]:
#dataframes[0] = dataframes[0].withColumn("DT_SIN_PRI", df['DT_SIN_PRI'].cast(StringType()))

In [None]:
#Create a empty list to store results of transformations
dfs = []
#Standardize each file
for j in range(0, len(dataframes)):
    df = dataframes[j]
    
    # Convert columns name to lowercase
    for col in df.columns:
        df = df.withColumnRenamed(col, col.lower())
        
    #Change variable names
    if "id_mn_resi" in df.columns:
        df = df.withColumnRenamed("id_mn_resi", "codmunres")
        
    if "co_mun_res" in df.columns:
        df = df.withColumnRenamed("co_mun_res", "codmunres")
        
    #Group data according to variables used
    
    #list with columns that will be used to aggregate data
    cols_to_group = ["dt_sin_pri", "sem_pri" ,
                 "dt_notific", "sem_not", "sg_uf_not",
                 "classi_fin", "criterio", 
                 "id_municip", "cs_sexo", "cs_gestant", "cs_raca", "cs_escol_n",
                 "codmunres", "dt_nasc", "nu_idade_n", "comuninf"]
    
    dfs[j] = df.select(cols_to_group).groupBy(cols_to_group).count()
        

In [None]:
dataframes[0].select('ID_MN_RESI').show()

In [None]:
 #Change variable names
        if "id_mn_resi" in df.columns:
            df = df.withColumnRenamed("id_mn_resi", "codmunres")

In [None]:
filenames[10]

In [None]:
df = spark.read.csv(filenames[0], sep =',', header = True)

In [None]:
df.head() 

In [None]:
dataframes[10].toPandas().head()