# Rio Grande do Sul COVID cases Data Analysis
---

In [2]:
import dateutil.utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
import scipy
from unidecode import unidecode
import seaborn as sns

## Step 1: Merging all data-sets

In [5]:
# Read the six raw export parts with identical parser settings: semicolon
# separated, Latin-1 encoded, malformed lines skipped. The individual frames
# stay bound to d1..d6 as before.
d1, d2, d3, d4, d5, d6 = (
    pd.read_csv(
        f"../db/dados-rs-{part}.csv",
        delimiter=";",
        on_bad_lines='skip',
        encoding='latin1',
        low_memory=True,
    )
    for part in range(1, 7)
)

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# every part keeps its own 0-based labels, so labels repeat across parts and
# label-based access such as df.at[label, column] addresses several rows at once.
df = pd.concat([d1, d2, d3, d4, d5, d6], ignore_index=True)

# The merged file is written to the current working directory (not ../db);
# the row index is not stored.
df.to_csv('dados-rs.csv', index=False)

# CBO - Classificação Brasileira de Ocupações

          ÿid           dataNotificacao        dataInicioSintomas  \
0  HE8B8WiBpk  2020-08-25T03:00:37.280Z  2020-08-24T03:00:00.000Z   
1  IfUV4kqF1I  2020-08-17T03:00:48.914Z  2020-08-03T03:00:00.000Z   
2  auCqY2SQ4w  2020-06-26T17:37:33.237Z  2020-06-10T03:00:00.000Z   
3  R8MGrEmw1z  2020-06-26T03:00:00.000Z  2020-06-26T03:00:00.000Z   
4  w6ClV1iTLg  2020-08-21T03:00:51.066Z  2020-08-11T03:00:00.000Z   

  dataNascimento       sintomas profissionalSaude  \
0      undefined         Outros               Não   
1      undefined         Outros               Não   
2      undefined         Outros               Não   
3      undefined  Tosse, Outros               Sim   
4      undefined         Outros               Não   

                                                 cbo  \
0                                                NaN   
1                                                NaN   
2                                                NaN   
3  5151 - Outro tipo de agente de saúde ou

## Step 2: Data Processing

In [None]:
# Read the merged CSV back from the working directory and preview the first rows
df = pd.read_csv(
    "dados-rs.csv",
    delimiter=",",
    on_bad_lines='skip',
    encoding='utf-8',
    low_memory=True,
)
print(df.head(5))

In [6]:
# Clean the merged frame `df` and add one boolean column per reported symptom,
# then save the result to ../db/dados-rs-clean.csv.
#
# Some columns hold placeholder strings (e.g. 'undefined', 'Indefinido') rather
# than real missing values, so pandas' null checks do not detect them.

# Columns judged not useful for this analysis are dropped.
df = df.drop(columns=["cbo", "paisOrigem", "estado", "estadoIBGE", "origem", "excluido", "validado",
                      "dataNascimento"])

# Turn the 'Indefinido' placeholder in `sexo` into a real missing value.
# Series.replace returns a new Series, so the result must be assigned back;
# calling it without assignment leaves the column unchanged.
df["sexo"] = df["sexo"].replace("Indefinido", np.nan)

# Parse the date columns; values that cannot be parsed become NaT.
for date_column in ("dataNotificacao", "dataTeste", "dataEncerramento", "dataInicioSintomas"):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Rows with no symptom description (44510 when this was written) are dropped.
df = df[df["sintomas"].notna()]

# There are inconsistencies in the age data (e.g. ages up to 320). Since people
# aged 100+ are less than 0.0009% of the RS population, rows above 100 are dropped.
# NOTE: rows with a missing age are removed as well, because NaN <= 100 is False.
df = df[df["idade"] <= 100]

# Continue on an independent frame with a fresh, unique 0..n-1 index so the
# column assignments below are unambiguous and do not act on a filtered view.
df = df.reset_index(drop=True)

# Normalise spelling variants inside the symptom text. The patterns are applied
# as regular-expression substitutions and the result is assigned back
# explicitly instead of relying on an in-place edit of a column accessor.
df["sintomas"] = (
    df["sintomas"]
    .replace(regex=['Dispineia', 'Dispinéia', 'Dispnéia', 'Dificuldade De Respirar'], value='Dispneia')
    .replace(regex=['Outros: Paciente assintomático'], value='Assintomático')
    .replace(regex=['Dor De Garganta'], value='Dor de Garganta')
)

# Distinct symptom names in order of first appearance: each description is
# split on commas, surrounding whitespace is stripped and empty pieces ignored.
symptom_text = df["sintomas"].astype(str)
sintomas = list(dict.fromkeys(
    piece.strip()
    for description in symptom_text
    for piece in description.split(",")
    if piece.strip()
))

# A symptom named like an existing column cannot get its own column.
clashing = [name for name in sintomas if name in df.columns]
if clashing:
    raise ValueError(f"symptom names collide with existing columns: {clashing}")

# One boolean column per symptom, computed column-wise instead of row by row.
# NOTE(review): this is a literal substring test on the whole description, so a
# short name is also flagged for rows that only contain it inside a longer
# entry (e.g. 'Outros' inside 'Outros: ...') - confirm that this is intended.
flags = pd.DataFrame(
    {name: symptom_text.str.contains(name, regex=False) for name in sintomas},
    index=df.index,
)

# Place the symptom columns starting at column position 4. Real booleans are
# stored; to_csv still writes them as True/False.
df = pd.concat([df.iloc[:, :4], flags, df.iloc[:, 4:]], axis=1)

# Save the cleaned dataset (row index omitted).
df.to_csv('../db/dados-rs-clean.csv', index=False)

In [None]:
# Read the cleaned dataset back from ../db
df = pd.read_csv(
    "../db/dados-rs-clean.csv",
    delimiter=",",
    on_bad_lines='skip',
    encoding='utf-8',
    low_memory=True,
)

In [3]:
# Order the rows by notification date (the frame is modified in place),
# then preview the first ten rows
df.sort_values(by='dataNotificacao', inplace=True)
print(df.head(n=10))

FileNotFoundError: [Errno 2] No such file or directory: 'dados-rs.csv'

In [None]:
# Write the sorted frame to its own CSV file, without the row index
df.to_csv(path_or_buf='../db/dados-rs-clean-sorted.csv', index=False)

In [None]:
# Read the cleaned, date-sorted dataset back from ../db
df = pd.read_csv(
    "../db/dados-rs-clean-sorted.csv",
    delimiter=",",
    on_bad_lines='skip',
    encoding='utf-8',
    low_memory=True,
)

In [None]:
# Reduce the four date columns to plain calendar dates (yyyy-MM-dd).
# Values that cannot be parsed are coerced to NaT. Bracket indexing is used so
# a missing column fails with a KeyError naming it, rather than attribute-style
# assignment, which can set an attribute on the frame instead of a column.
for date_column in ("dataNotificacao", "dataTeste", "dataEncerramento", "dataInicioSintomas"):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce').dt.date

In [None]:
# Keep only the rows that have a notification date
has_notification_date = pd.notnull(df.dataNotificacao)
df = df[has_notification_date]

In [None]:
# Add the ISO-8601 week number and ISO year of each notification date.
# Both values come from the same ISO calendar, so a date in the first or last
# days of a calendar year can carry the ISO year of the neighbouring year.
# The calendar is computed once for the whole column instead of twice per row.
iso_calendar = pd.to_datetime(df["dataNotificacao"]).dt.isocalendar()

# 31 and 32 are the intended column positions. They are clamped to the current
# column count because the number of symptom columns depends on the data, and
# DataFrame.insert raises when the position is past the end. If the columns
# already exist (cell re-run) they are overwritten rather than inserted again.
for position, column, iso_field in ((31, "WeekNumber", "week"), (32, "YearNumber", "year")):
    # Plain int64 values, as the per-row computation produced; this raises if a
    # notification date is missing, so such rows must be dropped beforehand.
    values = iso_calendar[iso_field].astype("int64")
    if column in df.columns:
        df[column] = values
    else:
        df.insert(min(position, df.shape[1]), column, values)

# Step 3: Data Analysis
## 26. Has there been a significant surge in the number of COVID-19 cases on dates close to public (national and state) holidays?

In [None]:
# Distribution of notifications per ISO week for 2020.
# `df1` must be defined before it is plotted; with the assignment commented
# out the plot call fails with a NameError.
df1 = df.loc[df.YearNumber == 2020]

# No `hue` is passed: the frame has no 'species' column (that argument comes
# from the seaborn example dataset). discrete=True gives one bar per week number.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df1, x="WeekNumber", discrete=True, kde=True, ax=ax)
ax.set(
    title="COVID-19 notifications per week in Rio Grande do Sul (2020)",
    xlabel="ISO week number",
    ylabel="Number of notifications",
)
plt.show()

# Draft kept for reference: daily counts between weeks 13 and 16 of 2020.
#week = df.groupby(df.dataNotificacao.where((df.WeekNumber > 12) & (df.WeekNumber < 17) & (df.YearNumber == 2020)))['ÿid'].count()
#week.plot(kind='line', figsize=(10,5), legend = None)

## 31.
## 34. What is the most used COVID test in the state? Proportionally, what is the percentage of positive results? How about negative ones?