# Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pylab as plt

In [3]:
from statsmodels.tsa.arima_model import ARMA

In [4]:
from sklearn.metrics import mean_squared_error as mse

In [5]:
import warnings
# Suppress every warning for the rest of the session. This hides all
# categories, so warnings raised while fitting models further down will
# not be shown either.
warnings.simplefilter('ignore')

In [6]:
# Default size (width, height) applied to every figure created after this cell.
plt.rcParams['figure.figsize']=(20, 20)

# Functions

## Data cleaning

In [7]:
def clean_data():
    """Clean the module-level ``db1`` frame and rebind ``db1`` to the result.

    Steps applied:
      * keep only rows with ``num_infections > 0`` and renumber the index;
      * parse the ``date`` column into pandas datetimes;
      * rewrite "Name, Prefix" autonomous-region labels into natural order;
      * collapse bilingual province labels to a single name;
      * relabel the ``'NC'`` code in ``sex`` and ``age_interval``.

    Takes no arguments and returns nothing: it reads the global ``db1``
    and replaces it with the cleaned frame.
    """
    global db1

    region_names = {
        'Valenciana, Comunidad': 'Comunidad Valenciana',
        'Madrid, Comunidad de': 'Comunidad de Madrid',
        'Murcia, Región de': 'Región de Murcia',
        'Navarra, Comunidad Foral de': 'Comunidad Foral de Navarra',
        'Asturias, Principado de': 'Principado de Asturias',
    }
    province_names = {
        'Alicante/Alacant': 'Alicante',
        'Castellón/Castelló': 'Castellón',
        'Araba/Álava': 'Araba',
        'Valencia/València': 'Valencia',
    }
    # The 'UKNOWN' spelling is kept unchanged: cells outside this view may
    # filter on this exact value.
    unknown_code = {'NC': 'UKNOWN'}

    # drop=True discards the old index instead of materialising it as an
    # 'index' column that would then have to be dropped again.
    kept = db1[db1.num_infections > 0].reset_index(drop=True)

    # Mapping-based replace only touches exact matches; every other value
    # (including missing ones) passes through untouched.
    db1 = kept.assign(
        date=pd.to_datetime(kept.date),
        autonomous_region=kept.autonomous_region.replace(region_names),
        province=kept.province.replace(province_names),
        sex=kept.sex.replace(unknown_code),
        age_interval=kept.age_interval.replace(unknown_code),
    )

## Cumulative

In [8]:
def cumulative(dataframe):
    """Return a copy of ``dataframe`` extended with running totals.

    For each of ``num_infections``, ``num_hosp``, ``num_uci`` and
    ``num_dead`` a ``cumu_<name>`` column holding the cumulative sum is
    appended on the right. The input frame itself is not modified.
    """
    # Output label -> source column, in the order the new columns appear.
    running_labels = {
        'cumu_num_infections': 'num_infections',
        'cumu_num_hosp': 'num_hosp',
        'cumu_num_uci': 'num_uci',
        'cumu_num_dead': 'num_dead',
    }
    running_totals = pd.DataFrame(
        {label: dataframe[source].cumsum() for label, source in running_labels.items()}
    )
    return pd.concat([dataframe, running_totals], axis = 1)

## Relative frequencies

In [9]:
def freq_rel(dataframe):
    """Add percentage-share columns to ``dataframe`` in place.

    For each count column (``num_infections``, ``num_hosp``, ``num_uci``,
    ``num_dead``) a new column is written holding every row's share of the
    column total: the fraction is rounded to 3 decimals and then multiplied
    by 100.

    The frame is mutated; nothing is returned.
    """
    # Output label -> count column it is derived from.
    share_columns = {
        'Number of infection (%)': 'num_infections',
        'Number of hospitalisation (%)': 'num_hosp',
        'Number of Intensive Care Unit (%)': 'num_uci',
        'Number of deaths (%)': 'num_dead',
    }
    for label, source in share_columns.items():
        counts = dataframe[source]
        # The column total is computed once here rather than once per row.
        dataframe[label] = (counts / counts.sum()).round(3) * 100

In [10]:
def organise(column):
    """Aggregate the global ``db1`` by ``column`` and add percentage shares.

    Parameters
    ----------
    column : label or list of labels
        Grouping key(s) passed to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the summed numeric columns, extended in
        place by ``freq_rel`` with its percentage columns.
    """
    # numeric_only=True keeps the aggregation to the count columns; without
    # it, recent pandas versions try to sum the datetime/text columns too.
    grouped = db1.groupby(column).sum(numeric_only=True)
    freq_rel(grouped)

    return grouped

## Moving averages: 7-day moving average

In [11]:
def mov_7_ave(dataframe):
    """Append 7-row rolling means of columns 1-4 to ``dataframe``.

    The source columns are taken by position (1 through 4), not by name,
    and the results are stored under the ``ave_7_*`` labels below. The
    frame is modified in place and also returned.
    """
    rolling_labels = [
        'ave_7_num_infections',
        'ave_7_num_hosp',
        'ave_7_num_uci',
        'ave_7_num_dead',
    ]
    # Positions start at 1; column 0 is skipped.
    for position, label in enumerate(rolling_labels, start=1):
        source = dataframe.iloc[:, position]
        dataframe[label] = source.rolling(window=7).mean()

    return dataframe

## Evolution

In [12]:
def evolution(dataframe):
    """Build the per-date evolution table from ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Case records with a ``date`` column and the numeric count columns.

    Returns
    -------
    pandas.DataFrame
        One row per date with the summed numeric columns, extended by
        ``cumulative`` and then by ``mov_7_ave``.
    """
    # Aggregate the frame that was passed in (not the global db1), and sum
    # numeric columns only so text columns cannot leak into the result and
    # shift the positional columns mov_7_ave relies on.
    bydate = dataframe.groupby('date').sum(numeric_only=True).reset_index()
    bydate = cumulative(bydate)
    bydate = mov_7_ave(bydate)

    return bydate

# Data

In [13]:
# Load the raw case records; the path is relative to the notebook's working directory.
db1 =pd.read_csv('covid-19-sample.csv')

In [14]:
# Rebinds the global db1 to its cleaned version (takes no arguments, returns nothing).
clean_data()

In [15]:
# Per-date totals with cumulative sums and 7-day rolling means; predict() reads this global.
bydate = evolution(db1)

In [22]:
# Settings per forecastable series:
#   column - smoothed column of the global `bydate` that is modelled
#   unit   - label printed after the error value
#   prefix - prefix of the returned column name (suffix is the horizon)
#   orders - hand-picked ARMA (p, q) order for each supported horizon in days
_PREDICT_SPECS = {
    'Number of infections': {
        'column': 'ave_7_num_infections',
        'unit': 'daily infections',
        'prefix': 'pred_num_infections',
        'orders': {1: (4, 2), 3: (4, 12), 5: (19, 11), 7: (19, 7)},
    },
    'Number of hospitalisation': {
        'column': 'ave_7_num_hosp',
        'unit': 'daily hospitalisations',
        'prefix': 'pred_num_hosp',
        'orders': {1: (2, 2), 3: (6, 5), 5: (2, 8), 7: (19, 11)},
    },
    'Number of Intensive Care Unit': {
        'column': 'ave_7_num_uci',
        'unit': 'daily intensive care unit',
        'prefix': 'pred_num_uci',
        'orders': {1: (2, 4), 3: (5, 5), 5: (4, 5), 7: (2, 6)},
    },
    'Number of deaths': {
        'column': 'ave_7_num_dead',
        'unit': 'daily deaths',
        'prefix': 'pred_num_dead',
        'orders': {1: (2, 3), 3: (4, 5), 5: (9, 6), 7: (2, 7)},
    },
}


def predict(name, days):
    """Fit an ARMA model on one smoothed series of ``bydate`` and score it.

    The last ``days`` observations are held out, the model is fitted on the
    rest, and the root of the mean squared error between the out-of-sample
    forecast and the held-out values is printed.

    Parameters
    ----------
    name : str
        One of the keys of ``_PREDICT_SPECS`` ('Number of infections',
        'Number of hospitalisation', 'Number of Intensive Care Unit',
        'Number of deaths').
    days : int
        Hold-out horizon; must be one of 1, 3, 5 or 7.

    Returns
    -------
    pandas.DataFrame
        The model's in-sample predictions in a single column named
        ``<prefix>_<days>`` (e.g. ``pred_num_uci_3``).

    Raises
    ------
    ValueError
        If ``name`` or ``days`` is not a supported value.

    Notes
    -----
    Reads the module-level ``bydate`` frame.
    NOTE(review): ``ARMA`` is imported from ``statsmodels.tsa.arima_model``,
    which newer statsmodels releases no longer provide - confirm the
    statsmodels version this notebook is meant to run on.
    """
    spec = _PREDICT_SPECS.get(name)
    if spec is None:
        raise ValueError(
            'unknown series {!r}; expected one of {}'.format(name, sorted(_PREDICT_SPECS))
        )

    order = spec['orders'].get(days)
    if order is None:
        raise ValueError(
            'unsupported horizon {!r} for {!r}; expected one of {}'.format(
                days, name, sorted(spec['orders']))
        )

    series = bydate[['date', spec['column']]].set_index('date')
    # Skip the first six rows: a 7-row rolling mean has no value there.
    series = series[6:]
    train, test = series[:-days], series[-days:]

    model = ARMA(train, order = order).fit(disp=False)

    # Out-of-sample forecast covering exactly the held-out rows: it starts
    # right after the training data and ends at the last observation.
    forecast = model.predict(len(train), len(series) - 1)
    print('rsme:', mse(forecast, test)**0.5, spec['unit'])

    return pd.DataFrame(model.predict(), columns = ['{}_{}'.format(spec['prefix'], days)])


In [17]:
pred_num_infections_1 = predict('Number of infections', 1)

rsme: 0.8948312256152349 daily infections


In [18]:
pred_num_infections_3 = predict('Number of infections', 3)

rsme: 102.2607415602929 daily infections


In [20]:
pred_num_infections_5 = predict('Number of infections', 5)

rsme: 357.45878260447734 daily infections
