# 00.Importing Libraries

In [1]:
!pip install metpy --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path as path

from metpy.calc import heat_index
from metpy.units import units

# 01.Importing Datasets

In [3]:
datasetDATASUSpath = path(r'datasetDATASUS.csv')
datasetINMETpath = path(r'datasetINMET.csv')

datasetDATASUS = pd.read_csv(datasetDATASUSpath)
datasetINMET = pd.read_csv(datasetINMETpath)

# 02.Concatenating Datasets

In [4]:
# Concatenating datasets
dataset = pd.merge(datasetDATASUS, datasetINMET , on='date', how='outer')

# verifying shape of the datset
print(f'Shape of the final dataset: {dataset.shape}')

# dropping nan to keep only overlapping vales
dataset.dropna(inplace = True)
print(f'Shape of the final dataset after dropin NANs: {dataset.shape}')
# print summary of stats of the dataframe
print(dataset.info())

dataset.head(2)

Shape of the final dataset: (308, 23)
Shape of the final dataset after dropin NANs: (286, 23)
<class 'pandas.core.frame.DataFrame'>
Index: 286 entries, 21 to 306
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   date                               286 non-null    object 
 1   hospitalizations                   286 non-null    float64
 2   hospitalizations_lag_3             286 non-null    float64
 3   average_stay_lag_3                 286 non-null    float64
 4   deaths_lag_3                       286 non-null    float64
 5   mortality_rate_lag_3               286 non-null    float64
 6   hospitalizations_lag_3_12m_window  286 non-null    float64
 7   average_stay_lag_3_12m_window      286 non-null    float64
 8   deaths_lag_3_12m_window            286 non-null    float64
 9   mortality_rate_lag_3_12m_window    286 non-null    float64
 10  heat_index                      

Unnamed: 0,date,hospitalizations,hospitalizations_lag_3,average_stay_lag_3,deaths_lag_3,mortality_rate_lag_3,hospitalizations_lag_3_12m_window,average_stay_lag_3_12m_window,deaths_lag_3_12m_window,mortality_rate_lag_3_12m_window,...,heat_index99,heat_index95,temperature_mean,temperature_max,temperature_min,wind_speed_mean,humidity_mean,lag_heat_index_1,lag_heat_index_2,lag_heat_index_3
21,2000-12,4266.0,4365.0,6.4,293.0,6.71,4277.5,5.9,245.333333,5.728333,...,37.8966,33.538,23.575,35.4,13.4,1.826613,70.836022,21.132403,19.318992,15.300694
22,2001-01,4032.0,4388.0,6.3,275.0,6.27,4257.25,5.925,246.25,5.775833,...,37.8471,34.6685,25.136962,35.4,17.0,1.777016,75.375,24.154261,21.132403,19.318992


In [5]:
#Conver date column to datetime object
dataset['date'] = pd.to_datetime(dataset['date'], format = '%Y-%m')

# Reducing the time range
#Filtering out COVID period - final dataset will only go from January 2001 to December 2020
dataset[(dataset['date'] >= pd.Timestamp(2001,1,1)) & (dataset['date'] < pd.Timestamp(2020,1,1))]

Unnamed: 0,date,hospitalizations,hospitalizations_lag_3,average_stay_lag_3,deaths_lag_3,mortality_rate_lag_3,hospitalizations_lag_3_12m_window,average_stay_lag_3_12m_window,deaths_lag_3_12m_window,mortality_rate_lag_3_12m_window,...,heat_index99,heat_index95,temperature_mean,temperature_max,temperature_min,wind_speed_mean,humidity_mean,lag_heat_index_1,lag_heat_index_2,lag_heat_index_3
22,2001-01-01,4032.0,4388.0,6.3,275.0,6.27,4257.250000,5.925000,246.250000,5.775833,...,37.8471,34.6685,25.136962,35.4,17.0,1.777016,75.375000,24.154261,21.132403,19.318992
23,2001-02-01,3888.0,4329.0,6.1,258.0,5.96,4256.583333,5.933333,248.083333,5.819167,...,37.7548,35.3940,25.691071,34.2,20.2,1.916518,76.424107,26.316156,24.154261,21.132403
24,2001-03-01,3658.0,4266.0,6.1,242.0,5.67,4265.166667,5.966667,250.833333,5.873333,...,36.7670,34.4155,24.985349,34.4,16.3,1.629704,76.383065,27.119464,26.316156,24.154261
25,2001-04-01,4022.0,4032.0,5.9,218.0,5.41,4263.833333,5.975000,250.666667,5.871667,...,31.0030,28.0215,21.038333,30.8,11.9,1.455139,81.152778,26.113387,27.119464,26.316156
26,2001-05-01,4332.0,3888.0,5.8,217.0,5.58,4242.500000,5.991667,250.500000,5.896667,...,24.5396,23.2800,17.072984,27.3,10.8,1.699059,82.583333,21.383903,26.113387,27.119464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,2019-08-01,3610.0,3209.0,6.1,256.0,7.98,3208.666667,6.500000,310.416667,9.650000,...,28.9057,26.0325,15.573118,32.7,5.2,1.410887,74.543011,13.207366,18.511556,19.420833
246,2019-09-01,3518.0,3239.0,6.4,327.0,10.10,3204.666667,6.491667,310.583333,9.667500,...,32.9849,26.8015,16.913750,36.0,7.1,1.453056,76.058333,15.129073,13.207366,18.511556
247,2019-10-01,3623.0,3375.0,6.4,357.0,10.58,3205.416667,6.475000,308.083333,9.590833,...,36.6327,30.2535,20.594489,37.9,12.6,1.644758,78.755376,16.676639,15.129073,13.207366
248,2019-11-01,3516.0,3610.0,6.4,362.0,10.03,3226.833333,6.441667,310.916667,9.611667,...,34.8625,31.0765,22.237778,33.4,12.6,1.582500,73.569444,20.828575,16.676639,15.129073


# 03.Save Final Dataset

In [6]:
dataset.to_csv('dataset.csv', index=False)

In [9]:
for c in dataset.columns: print(c)

date
hospitalizations
hospitalizations_lag_3
average_stay_lag_3
deaths_lag_3
mortality_rate_lag_3
hospitalizations_lag_3_12m_window
average_stay_lag_3_12m_window
deaths_lag_3_12m_window
mortality_rate_lag_3_12m_window
heat_index
heat_index_sum
heat_index_std
heat_index99
heat_index95
temperature_mean
temperature_max
temperature_min
wind_speed_mean
humidity_mean
lag_heat_index_1
lag_heat_index_2
lag_heat_index_3


In [10]:
dataset

Unnamed: 0,date,hospitalizations,hospitalizations_lag_3,average_stay_lag_3,deaths_lag_3,mortality_rate_lag_3,hospitalizations_lag_3_12m_window,average_stay_lag_3_12m_window,deaths_lag_3_12m_window,mortality_rate_lag_3_12m_window,...,heat_index99,heat_index95,temperature_mean,temperature_max,temperature_min,wind_speed_mean,humidity_mean,lag_heat_index_1,lag_heat_index_2,lag_heat_index_3
21,2000-12-01,4266.0,4365.0,6.4,293.0,6.71,4277.500000,5.900000,245.333333,5.728333,...,37.8966,33.5380,23.575000,35.4,13.4,1.826613,70.836022,21.132403,19.318992,15.300694
22,2001-01-01,4032.0,4388.0,6.3,275.0,6.27,4257.250000,5.925000,246.250000,5.775833,...,37.8471,34.6685,25.136962,35.4,17.0,1.777016,75.375000,24.154261,21.132403,19.318992
23,2001-02-01,3888.0,4329.0,6.1,258.0,5.96,4256.583333,5.933333,248.083333,5.819167,...,37.7548,35.3940,25.691071,34.2,20.2,1.916518,76.424107,26.316156,24.154261,21.132403
24,2001-03-01,3658.0,4266.0,6.1,242.0,5.67,4265.166667,5.966667,250.833333,5.873333,...,36.7670,34.4155,24.985349,34.4,16.3,1.629704,76.383065,27.119464,26.316156,24.154261
25,2001-04-01,4022.0,4032.0,5.9,218.0,5.41,4263.833333,5.975000,250.666667,5.871667,...,31.0030,28.0215,21.038333,30.8,11.9,1.455139,81.152778,26.113387,27.119464,26.316156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,2024-05-01,3172.0,3176.0,6.4,264.0,8.31,3490.333333,6.950000,316.416667,9.045833,...,33.1822,27.3330,16.684946,32.5,7.1,1.345027,85.837366,22.136556,25.621075,27.636221
303,2024-06-01,3640.0,3390.0,6.7,277.0,8.17,3503.083333,6.916667,316.500000,9.015833,...,29.7834,26.8025,17.371111,31.5,4.6,1.265139,80.636111,16.708185,22.136556,25.621075
304,2024-07-01,3689.0,3329.0,6.5,308.0,9.25,3510.166667,6.891667,321.500000,9.150000,...,26.8167,21.7780,13.736022,28.9,3.4,1.123387,81.887097,17.283889,16.708185,22.136556
305,2024-08-01,3846.0,3172.0,6.7,313.0,9.87,3497.666667,6.866667,319.500000,9.127500,...,28.9485,24.4525,15.594758,31.1,4.5,1.394892,78.956989,13.313495,17.283889,16.708185
