# Data pre-processing

### Pre-Processing of COVID-19 data in Piauí

This Jupyter notebook generates four pickle files — X_train.pickle, y_train.pickle, X_test.pickle and y_test.pickle — containing the pre-processed data from the dataset of COVID-19 cases in the state of Piauí. The target (y) of the dataset is the number of deaths (the `deaths` attribute).

## Imports

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Loading the data

In [2]:
# Read the full case dataset, then keep only the rows whose `state`
# column equals 'PI' (Piauí).
data = pd.read_csv('caso.csv')
is_piaui = data['state'].eq('PI')
data_piaui = data.loc[is_piaui]
data_piaui

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,is_last,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
2049353,2022-02-16,PI,,state,360563,7580,700,True,3273227.0,3281480.0,22.0,10987.81647,0.0210
2049354,2022-02-15,PI,,state,359519,7571,699,False,3273227.0,3281480.0,22.0,10956.00156,0.0211
2049355,2022-02-14,PI,,state,358516,7558,698,False,3273227.0,3281480.0,22.0,10925.43608,0.0211
2049356,2022-02-13,PI,,state,357810,7545,697,False,3273227.0,3281480.0,22.0,10903.92140,0.0211
2049357,2022-02-12,PI,,state,357107,7535,696,False,3273227.0,3281480.0,22.0,10882.49814,0.0211
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191916,2020-04-27,PI,Água Branca,city,6,0,5,False,17411.0,17470.0,2200202.0,34.34459,0.0000
2191917,2020-04-26,PI,Água Branca,city,4,0,4,False,17411.0,17470.0,2200202.0,22.89639,0.0000
2191918,2020-04-25,PI,Água Branca,city,4,0,3,False,17411.0,17470.0,2200202.0,22.89639,0.0000
2191919,2020-04-24,PI,Água Branca,city,4,0,2,False,17411.0,17470.0,2200202.0,22.89639,0.0000


## Handling missing data

In [3]:
# Summarise dtypes and non-null counts to spot columns with missing values.
data_piaui.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142568 entries, 2049353 to 2191920
Data columns (total 13 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   date                            142568 non-null  object 
 1   state                           142568 non-null  object 
 2   city                            141868 non-null  object 
 3   place_type                      142568 non-null  object 
 4   confirmed                       142568 non-null  int64  
 5   deaths                          142568 non-null  int64  
 6   order_for_place                 142568 non-null  int64  
 7   is_last                         142568 non-null  bool   
 8   estimated_population_2019       142568 non-null  float64
 9   estimated_population            142568 non-null  float64
 10  city_ibge_code                  142568 non-null  float64
 11  confirmed_per_100k_inhabitants  142349 non-null  float64
 12  death_rat

In [4]:
# Discard every row that is missing a value in either of these two columns.
required_columns = ["city", "confirmed_per_100k_inhabitants"]
data_piaui_t = data_piaui.dropna(subset=required_columns)

In [5]:
# Show the frame after removing the rows with missing values.
data_piaui_t

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,is_last,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
2050053,2022-02-15,PI,Acauã,city,228,2,619,True,7084.0,7102.0,2200053.0,3210.36328,0.0088
2050054,2022-02-14,PI,Acauã,city,228,2,618,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
2050055,2022-02-13,PI,Acauã,city,228,2,617,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
2050056,2022-02-12,PI,Acauã,city,228,2,616,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
2050057,2022-02-11,PI,Acauã,city,228,2,615,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191916,2020-04-27,PI,Água Branca,city,6,0,5,False,17411.0,17470.0,2200202.0,34.34459,0.0000
2191917,2020-04-26,PI,Água Branca,city,4,0,4,False,17411.0,17470.0,2200202.0,22.89639,0.0000
2191918,2020-04-25,PI,Água Branca,city,4,0,3,False,17411.0,17470.0,2200202.0,22.89639,0.0000
2191919,2020-04-24,PI,Água Branca,city,4,0,2,False,17411.0,17470.0,2200202.0,22.89639,0.0000


In [6]:
# Reset the indexes
data_piaui_t = data_piaui_t.reset_index(drop=True)
data_piaui_t

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,is_last,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
0,2022-02-15,PI,Acauã,city,228,2,619,True,7084.0,7102.0,2200053.0,3210.36328,0.0088
1,2022-02-14,PI,Acauã,city,228,2,618,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
2,2022-02-13,PI,Acauã,city,228,2,617,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
3,2022-02-12,PI,Acauã,city,228,2,616,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
4,2022-02-11,PI,Acauã,city,228,2,615,False,7084.0,7102.0,2200053.0,3210.36328,0.0088
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141644,2020-04-27,PI,Água Branca,city,6,0,5,False,17411.0,17470.0,2200202.0,34.34459,0.0000
141645,2020-04-26,PI,Água Branca,city,4,0,4,False,17411.0,17470.0,2200202.0,22.89639,0.0000
141646,2020-04-25,PI,Água Branca,city,4,0,3,False,17411.0,17470.0,2200202.0,22.89639,0.0000
141647,2020-04-24,PI,Água Branca,city,4,0,2,False,17411.0,17470.0,2200202.0,22.89639,0.0000


In [7]:
# Re-check dtypes and non-null counts after dropping incomplete rows.
data_piaui_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141649 entries, 0 to 141648
Data columns (total 13 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   date                            141649 non-null  object 
 1   state                           141649 non-null  object 
 2   city                            141649 non-null  object 
 3   place_type                      141649 non-null  object 
 4   confirmed                       141649 non-null  int64  
 5   deaths                          141649 non-null  int64  
 6   order_for_place                 141649 non-null  int64  
 7   is_last                         141649 non-null  bool   
 8   estimated_population_2019       141649 non-null  float64
 9   estimated_population            141649 non-null  float64
 10  city_ibge_code                  141649 non-null  float64
 11  confirmed_per_100k_inhabitants  141649 non-null  float64
 12  death_rate      

In [8]:
# The target (y) is the `deaths` column: take an independent copy of it,
# then remove it from the feature frame.
data_piaui_alvo = data_piaui_t.loc[:, "deaths"].copy()
data_piaui_t = data_piaui_t.drop(columns=["deaths"])
data_piaui_alvo

0         2
1         2
2         2
3         2
4         2
         ..
141644    0
141645    0
141646    0
141647    0
141648    0
Name: deaths, Length: 141649, dtype: int64

## Handling categorical data

In [9]:
# Encode the two non-numeric feature columns (`is_last`, `city`) as integer codes.
#
# One encoder is fitted per column so that each mapping stays available
# afterwards (via classes_ / inverse_transform). Re-fitting a single shared
# encoder would discard the `is_last` mapping as soon as `city` is fitted.
#
# NOTE(review): LabelEncoder assigns codes by sorted class order, so `city`
# becomes an arbitrary ordinal feature — consider one-hot encoding if the
# downstream model is sensitive to that ordering.
data_piaui_categoria = data_piaui_t['is_last']
data_piaui_categoria1 = data_piaui_t['city']

le_is_last = LabelEncoder()
le_city = LabelEncoder()
data_piaui_categoria_le = le_is_last.fit_transform(data_piaui_categoria)
data_piaui_categoria_le1 = le_city.fit_transform(data_piaui_categoria1)

# Keep the original name bound to the city encoder, which is the state `le`
# ended up in before this change.
le = le_city

In [10]:
# Wrap each encoded array in a single-column DataFrame so it can be
# concatenated with the rest of the features.
is_last = pd.DataFrame({'is_last': data_piaui_categoria_le})
city = pd.DataFrame({'city': data_piaui_categoria_le1})

In [11]:
# Preview the integer-encoded `city` column.
city.head()

Unnamed: 0,city
0,0
1,0
2,0
3,0
4,0


In [12]:
# Preview the integer-encoded `is_last` column.
is_last.head()

Unnamed: 0,is_last
0,1
1,0
2,0
3,0
4,0


In [13]:
# Remove the original `is_last` and `city` columns from the feature frame;
# their integer-encoded versions live in the `is_last` and `city` frames.
raw_categoricals = ['is_last', 'city']
data_piaui_t = data_piaui_t.drop(columns=raw_categoricals)
data_piaui_t

Unnamed: 0,date,state,place_type,confirmed,order_for_place,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
0,2022-02-15,PI,city,228,619,7084.0,7102.0,2200053.0,3210.36328,0.0088
1,2022-02-14,PI,city,228,618,7084.0,7102.0,2200053.0,3210.36328,0.0088
2,2022-02-13,PI,city,228,617,7084.0,7102.0,2200053.0,3210.36328,0.0088
3,2022-02-12,PI,city,228,616,7084.0,7102.0,2200053.0,3210.36328,0.0088
4,2022-02-11,PI,city,228,615,7084.0,7102.0,2200053.0,3210.36328,0.0088
...,...,...,...,...,...,...,...,...,...,...
141644,2020-04-27,PI,city,6,5,17411.0,17470.0,2200202.0,34.34459,0.0000
141645,2020-04-26,PI,city,4,4,17411.0,17470.0,2200202.0,22.89639,0.0000
141646,2020-04-25,PI,city,4,3,17411.0,17470.0,2200202.0,22.89639,0.0000
141647,2020-04-24,PI,city,4,2,17411.0,17470.0,2200202.0,22.89639,0.0000


In [14]:
# Join the remaining features with the two encoded columns side by side
# (column-wise concatenation aligned on the row index).
encoded_parts = [data_piaui_t, is_last, city]
data_piuai_encoded = pd.concat(encoded_parts, axis="columns")
data_piuai_encoded

Unnamed: 0,date,state,place_type,confirmed,order_for_place,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate,is_last,city
0,2022-02-15,PI,city,228,619,7084.0,7102.0,2200053.0,3210.36328,0.0088,1,0
1,2022-02-14,PI,city,228,618,7084.0,7102.0,2200053.0,3210.36328,0.0088,0,0
2,2022-02-13,PI,city,228,617,7084.0,7102.0,2200053.0,3210.36328,0.0088,0,0
3,2022-02-12,PI,city,228,616,7084.0,7102.0,2200053.0,3210.36328,0.0088,0,0
4,2022-02-11,PI,city,228,615,7084.0,7102.0,2200053.0,3210.36328,0.0088,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
141644,2020-04-27,PI,city,6,5,17411.0,17470.0,2200202.0,34.34459,0.0000,0,223
141645,2020-04-26,PI,city,4,4,17411.0,17470.0,2200202.0,22.89639,0.0000,0,223
141646,2020-04-25,PI,city,4,3,17411.0,17470.0,2200202.0,22.89639,0.0000,0,223
141647,2020-04-24,PI,city,4,2,17411.0,17470.0,2200202.0,22.89639,0.0000,0,223


In [15]:
# Remove the fields that are not used as model features.
# NOTE(review): `death_rate` is kept as a feature; presumably it is derived
# from `deaths` (the target) and `confirmed` — verify, as that would leak
# the target into the inputs.
unused_columns = ['date', 'state', 'place_type', 'city_ibge_code']
data_piuai_encoded = data_piuai_encoded.drop(columns=unused_columns)

In [16]:
# Preview the final feature columns.
data_piuai_encoded.head()

Unnamed: 0,confirmed,order_for_place,estimated_population_2019,estimated_population,confirmed_per_100k_inhabitants,death_rate,is_last,city
0,228,619,7084.0,7102.0,3210.36328,0.0088,1,0
1,228,618,7084.0,7102.0,3210.36328,0.0088,0,0
2,228,617,7084.0,7102.0,3210.36328,0.0088,0,0
3,228,616,7084.0,7102.0,3210.36328,0.0088,0,0
4,228,615,7084.0,7102.0,3210.36328,0.0088,0,0


In [18]:
# Save the encoded feature table (the `deaths` target was removed earlier)
# to CSV, without writing the row index as a column.
data_piuai_encoded.to_csv('data_piaui.csv', index=False)

## Splitting data into training and testing

In [19]:
# Hold out 20% of the rows for testing. Rows are shuffled before splitting,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_piuai_encoded,
    data_piaui_alvo,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

In [20]:
# Print the training features before scaling.
print(X_train)

        confirmed  order_for_place  estimated_population_2019  \
50911          97              289                     2558.0   
78026         137              274                     6441.0   
21449         133              129                     8329.0   
117126        643              324                    12711.0   
76385           1               17                     5349.0   
...           ...              ...                        ...   
73349        1152              439                     5295.0   
109259         64              140                     6432.0   
50057         186              531                     4463.0   
5192          892              554                     6788.0   
128037        268              117                     7989.0   

        estimated_population  confirmed_per_100k_inhabitants  death_rate  \
50911                 2560.0                      3789.06250      0.0412   
78026                 6449.0                      2124.36037      0

## Feature Scaling

In [21]:
# Standardise each feature: z = (x - u) / s, where u is the mean and s the
# standard deviation. Both are learned from the training split only and then
# reused unchanged on the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [22]:
# Print the training features after scaling.
print(X_train)

[[-0.1510606  -0.16062652 -0.20670927 ...  0.18862142 -0.04054729
  -0.48962709]
 [-0.14372219 -0.24228777 -0.14313121 ... -0.44246783 -0.04054729
   0.17512196]
 [-0.14445603 -1.03167983 -0.11221815 ... -0.32758508 -0.04054729
  -1.21621327]
 ...
 [-0.13473262  1.15684162 -0.17551787 ... -0.03042169 -0.04054729
  -0.50508637]
 [-0.00520953  1.28205553 -0.13744962 ... -0.23567887 -0.04054729
  -1.60269528]
 [-0.11968886 -1.09700883 -0.11778512 ... -0.38579234 -0.04054729
   1.3964051 ]]


In [23]:
# Persist the four splits as pickle files.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises; a bare open() passed straight
# to pickle.dump leaves the handle open until garbage collection.
splits_to_save = {
    'X_train.pickle': X_train,
    'X_test.pickle': X_test,
    'y_train.pickle': y_train,
    'y_test.pickle': y_test,
}
for pickle_path, split in splits_to_save.items():
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(split, pickle_file)

In [24]:
# Number of rows in the training split.
len(X_train)

113319