In [None]:
# warnings
import warnings
warnings.filterwarnings('ignore')

# import modules
import pandas as pd
import numpy as np
import datetime
import glob
import os
from datetime import timedelta

# sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, minmax_scale
from sklearn.model_selection import train_test_split

# machine learning
from lightgbm import LGBMRegressor
from fbprophet import Prophet


# plots
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
matplotlib.style.use('seaborn-ticks')

## `read COVID-19 datasets`

In [None]:
# Glob pattern matching every CSV file in the local data directory
path = './data/*.csv'
files = glob.glob(path)
print(files)

In [None]:
# Load every CSV into a dict keyed by its file name (no directory, no extension).
datasets = {}
for f in files:
    # os.path.basename handles the running platform's path separators; splitting
    # on '\\' only worked for Windows-style paths and gave an empty key for
    # POSIX paths such as './data/x.csv'.
    filename = os.path.basename(f).split('.')[0]
    d = pd.read_csv(f, encoding='utf-8')
    datasets[filename] = d

In [None]:
datasets.keys()

In [None]:
key = 'covid_19_data'
datasets[key].head()

In [None]:
# Missing counts become 0, then the three count columns are cast to integers
cols = ['Confirmed', 'Deaths', 'Recovered']
datasets[key][cols] = datasets[key][cols].fillna(0).astype(int)

datasets[key].head(1)

In [None]:
# build date: parse the 'ObservationDate' column into a datetime 'Date' column
datasets[key]['Date'] = pd.to_datetime(datasets[key]['ObservationDate'])

## `create dataframe`

In [None]:
# Collapse runs of whitespace and trim both ends of every country name
datasets[key]['Country/Region'] = datasets[key]['Country/Region'].map(
    lambda name: ' '.join(name.split())
)

In [None]:
# Total confirmed cases per country and day (rows sharing country/date are summed)
df = (
    datasets[key]
    .groupby(['Country/Region', 'Date'])
    .agg({'Confirmed': 'sum'})
    .reset_index()
)

df.shape

In [None]:
# fix names: map alternative spellings onto one canonical country name
# NOTE(review): the renaming happens after the per-country aggregation, so if
# both spellings of a country occur on the same date the frame would hold two
# rows for that (country, date) — confirm against the data.
canonical_names = {
    'US': 'United States',
    'Mainland China': 'China',
    'UK': 'United Kingdom',
    'Czechia': 'Czech Republic',
    'Taiwan*': 'Taiwan',
    'Viet Nam': 'Vietnam',
    'occupied Palestinian territory': 'Palestine',
    "('St. Martin',)": 'St. Martin',
}
for old_name, new_name in canonical_names.items():
    df.loc[df['Country/Region'] == old_name, 'Country/Region'] = new_name

In [None]:
# remove the 'Cruise Ship' and 'Others' entries from the country list
is_excluded = df['Country/Region'].isin(['Cruise Ship', 'Others'])
df = df[~is_excluded].copy()

In [None]:
# Order rows by country, then chronologically, and renumber the index
df = (
    df
    .sort_values(by=['Country/Region', 'Date'])
    .reset_index(drop=True)
)

In [None]:
# Fix dates -> add the missing days so that every country covers the same
# daily calendar, then make the cumulative counts non-decreasing and scale them.
store_frames = []

# create time frame: one Timestamp per day from the earliest to the latest date
default = list(
    pd.date_range(
        start=df['Date'].min(),
        end=df['Date'].max(),
        freq='D'
    )
)

# iterate over countries
for country in df['Country/Region'].unique():
    # explicit copy: the assignments below must never write into a view of df
    d = df[df['Country/Region'] == country].copy()
    first_date = d['Date'].min()

    # check if series match in length
    match = len(d) == len(default)
    if not match:

        # temporal dataframe holding the full calendar for this country
        tmp = pd.DataFrame({'Date': default})
        tmp['Country/Region'] = country

        # merge frames (left join on the shared Date and Country/Region columns)
        d = tmp.merge(d, how='left') \
            .reset_index(drop=True)

        # days before the country's first observation get 0 cases; .loc writes
        # into d itself (the previous chained `d[col].iloc[row] = 0` depended on
        # pandas returning a view)
        d.loc[d['Date'] < first_date, 'Confirmed'] = 0

        # later gaps carry the last known cumulative count forward
        d['Confirmed'] = d['Confirmed'].ffill().astype(int)

    # Fix confirmed cases: a cumulative count may not decrease, so every value
    # is replaced by the running maximum
    d['Confirmed'] = d['Confirmed'].cummax()

    # Min Max scale
    d['Confirmed_scale'] = minmax_scale(d['Confirmed'])

    # store frame
    store_frames.append(d)

# concat frames
df = pd.concat(store_frames, sort=True) \
    .sort_values(by=['Country/Region', 'Date']) \
    .reset_index(drop=True)

# add outbreak
df['Outbreak'] = 'COVID-19'
df.shape

In [None]:
df.head()

In [None]:
# N countries
df['Country/Region'].unique().shape

## `add new data  ---> manually`

In [None]:
df[df['Country/Region'] == 'India'][['Date', 'Confirmed']].tail(1)

In [None]:
sorted(df.columns.tolist())

In [None]:
def add_new_data(d, main):
    '''
    Append manually supplied observations to an existing frame.

    Parameters
    ----------
    d : data accepted by ``pd.DataFrame`` (e.g. a list of dicts)
        New rows. Must provide a 'Date' column, which is parsed with
        ``pd.to_datetime``, and a 'Country/Region' column.
    main : pandas.DataFrame
        Existing rows. A copy is concatenated, so ``main`` is not modified.

    Returns
    -------
    pandas.DataFrame
        All rows sorted by 'Country/Region' then 'Date', with a fresh
        0..n-1 index.
    '''
    extra_rows = pd.DataFrame(d)
    extra_rows['Date'] = pd.to_datetime(extra_rows['Date'])

    combined = pd.concat([main.copy(), extra_rows], sort=True)
    combined = combined.sort_values(by=['Country/Region', 'Date'])
    return combined.reset_index(drop=True)

In [None]:
# Manually entered observation that is appended to the downloaded data.
# NOTE(review): 'Confirmed_scale' is typed in as 0 instead of being recomputed
# with minmax_scale like the rows built earlier — confirm nothing reads it.
# NOTE(review): verify the 'Confirmed' figure against the source for this date.
new_data = [
    {
        'Date': '2020-04-07',
        'Country/Region': 'India',
        'Confirmed': 103942,
        'Confirmed_scale': 0,
        'Outbreak': 'COVID-19'
    }
]

In [None]:
# NOTE: not idempotent — df is overwritten with the extended frame, so running
# this cell a second time appends the same row again.
df = add_new_data(new_data, df)
df.shape

## `Prophet`

In [None]:
# Daily new cases for India, derived from the cumulative counts
sample = df.loc[df['Country/Region'] == 'India'].copy()
sample['cases'] = sample['Confirmed'].diff().fillna(0)

# cap = mean number of daily new cases after 2020-03-31, multiplied by 30
recent_cases = sample.loc[sample['Date'] > '2020-03-31', 'cases']
cap = recent_cases.mean() * 30
print(cap)

In [None]:
# Date and cumulative count for India only, with a fresh 0..n-1 index
india_rows = df['Country/Region'] == 'India'
sample = df.loc[india_rows, ['Date', 'Confirmed']].reset_index(drop=True)
sample.head()

In [None]:
sample.tail(1)

In [None]:
sample['Confirmed'].plot()

In [None]:
# Move the columns under the names 'ds' (time) and 'y' (target) used by the
# model below; pop() removes the old column while returning its values.
sample['ds'] = sample.pop('Date')
sample['y'] = sample.pop('Confirmed')

In [None]:
# Bounds for the logistic-growth model fitted below (growth='logistic'):
# every row gets the same lower bound 0 and the same upper bound `cap`.
sample['floor'] = 0
sample['cap'] = cap

In [None]:
# Model settings: logistic trend, interval width 0.98, additive seasonality
# with only the daily component enabled (weekly and yearly are switched off).
prophet_settings = dict(
    growth='logistic',
    interval_width=0.98,
    daily_seasonality=True,
    weekly_seasonality=False,
    yearly_seasonality=False,
    seasonality_mode='additive',
)
M = Prophet(**prophet_settings)

M.fit(sample)

In [None]:
# History plus 30 future periods, carrying the same bounds used for fitting
future = M.make_future_dataframe(periods=30).assign(floor=0, cap=cap)
future.shape

In [None]:
forecast = M.predict(future)

In [None]:
forecast.head()

In [None]:
sample.tail(2)

In [None]:
# Keep the date and the fitted trend, restricted to positive trend values.
# NOTE(review): this reads the 'trend' column of the forecast, not a column that
# includes the seasonal component — confirm that is the intended prediction.
cnfrm = forecast[['ds', 'trend']]
cnfrm = cnfrm.loc[cnfrm['trend'] > 0]
cnfrm.loc[cnfrm['ds'] > '2020-04-07'].head()

In [None]:
# Forecast rows after the last observed day, indexed by date, with the trend
# truncated to integers in a single 'predicted' column.
preds = cnfrm.loc[cnfrm['ds'] > '2020-04-07'].copy()
preds.index = pd.DatetimeIndex(preds['ds'], name='index')
preds['predicted'] = preds['trend'].astype(int)
preds = preds.drop(columns=['ds', 'trend'])

# NOTE(review): this sets a plain attribute on the DataFrame; it does not look
# like it changes any column label — confirm what it is meant to do.
preds.name = 'ML Model'

# sample
preds.head()

In [None]:
# Observed values indexed by date, stored as integers in a 'trend' column
actual = sample[['ds', 'y']].rename(columns={'y': 'trend'})
actual.index = pd.DatetimeIndex(actual['ds'], name='index')
actual = actual.drop(columns='ds')

actual.name = 'Historical data'
actual['trend'] = actual['trend'].astype(int)

# sample
actual.head()

`PLOT`

In [None]:
# Observed history and forecast on one set of axes
fig, ax = plt.subplots(figsize=(16, 9))

actual.plot(ax=ax, marker='o', linewidth=2.5)
preds.plot(ax=ax, marker='o', color='#FDA50F', linewidth=2)
# One legend call carrying both settings: a second plt.legend(...) call builds a
# new legend and discarded the loc='upper left' given to the first one.
ax.legend(loc='upper left', fontsize=20)
plt.tight_layout()
ax.set_xlabel('')
plt.box(False);

In [None]:
# Save next to the input data, like the notebook's other outputs, instead of a
# machine-specific absolute path that only exists on the author's computer.
preds.to_excel('./data/IndiaForecast30days.xlsx')

# OLD FORECAST

## `Get SARS data`

In [None]:
sars_data = datasets['sars_2003_complete_dataset_clean']
sars_data.shape

In [None]:
sars_data['Outbreak'] = 'SARS_2003'

In [None]:
sars_data.head()

In [None]:
# rename columns so the SARS frame uses the names 'Confirmed' and 'Country/Region'
column_names = {
    'Cumulative number of case(s)': 'Confirmed',
    'Country': 'Country/Region',
}
sars_data.rename(columns=column_names, inplace=True)

# parse the date strings
sars_data['Date'] = pd.to_datetime(sars_data['Date'])
sars_data.head()

In [None]:
# fix countries names: collapse repeated whitespace and trim both ends
sars_data['Country/Region'] = sars_data['Country/Region'].map(
    lambda name: ' '.join(name.split())
)

In [None]:
# fix names: map the SARS dataset's spellings onto the shorter country names
sars_canonical_names = {
    'Taiwan, China': 'Taiwan',
    'Hong Kong SAR, China': 'Hong Kong',
    'Russian Federation': 'Russia',
    'Viet Nam': 'Vietnam',
    'Macao SAR, China': 'Macau',
    'Republic of Korea': 'South Korea',
}
for old_name, new_name in sars_canonical_names.items():
    sars_data.loc[sars_data['Country/Region'] == old_name, 'Country/Region'] = new_name

In [None]:
sars_data_filter = sars_data[['Country/Region', 'Date', 'Confirmed', 'Outbreak']]
sars_data_filter.shape

In [None]:
# fix cumulative number of cases: per country, force the counts to be
# non-decreasing and add a min-max scaled copy
sars_frames = []
for c in sars_data_filter['Country/Region'].unique():
    # explicit copy so the column assignments below do not target a slice of
    # sars_data_filter
    d = sars_data_filter[sars_data_filter['Country/Region'] == c].copy()

    # change values in Confirmed cases: running maximum, in the existing row order.
    # NOTE(review): cummax skips missing values instead of restarting after
    # them — only matters if 'Confirmed' can contain NaN here.
    d['Confirmed'] = d['Confirmed'].cummax()

    # Min Max scale
    d['Confirmed_scale'] = minmax_scale(d['Confirmed'])

    # store new frames
    sars_frames.append(d)

# concat new frames
sars = pd.concat(sars_frames, sort=True) \
    .sort_values(by=['Country/Region', 'Date']) \
    .reset_index(drop=True)

In [None]:
sars.shape

In [None]:
sars.head()

In [None]:
sars[sars['Country/Region'] == 'Australia']

## `Build datasets`

In [None]:
# Stack the COVID-19 and SARS frames, ordered by country and date
_df_ = (
    pd.concat([df, sars], sort=True)
    .sort_values(by=['Country/Region', 'Date'])
    .reset_index(drop=True)
)

_df_.shape

In [None]:
# remove countries with minor confirmation cases:
# keep the countries whose largest 'Confirmed' value exceeds 50
t = _df_.groupby('Country/Region').agg({'Confirmed': 'max'})
t = t[t['Confirmed'] > 50]
t.shape

In [None]:
# t[[]] keeps only t's index (the retained countries) and no columns, so this
# merge acts as a filter: rows of _df_ whose country is not in t's index are
# dropped and no column is added.
_df_ = pd.merge(_df_, t[[]], left_on=['Country/Region'], right_index=True)

In [None]:
_df_.head()

    # sort data
    _df_ = _df_.sort_values(by=['Country/Region', 'Date']) \
        .reset_index(drop=True)

    # remove extra row in China
    _df_ = _df_.drop(562)

In [None]:
# sort data
data = _df_.sort_values(by=['Country/Region', 'Date']) \
    .reset_index(drop=True)

In [None]:
# save dataset
data.to_excel('./data/countries.xlsx', index=False)

## `Build new dataset`

In [None]:
data = df.copy()

In [None]:
data.head()

In [None]:
data.tail(2)

In [None]:
# Histogram of the raw counts (left) and of the rescaled counts (right)
fig, ax = plt.subplots(figsize=(16, 6), ncols=2)
s0 = data['Confirmed']
s0.plot.hist(ax=ax[0]);

# sklearn preprocessing: linear map that sends 0 -> 0 and 2e5 -> 1
transformer = MinMaxScaler(feature_range=(0, 1)).fit(np.asarray([0, 2e5]).reshape(-1, 1))
# keep s0's index so that assigning s1 back to `data` lines up row by row even
# when `data` does not have a default 0..n-1 index
s1 = pd.Series(
    transformer.transform(s0.values.reshape(-1, 1)).reshape(-1),
    index=s0.index
)
s1.plot.hist(ax=ax[1])

In [None]:
# add transformed data to dataset
# NOTE: the assignment aligns on index labels, so s1's index has to match
# data's index for the values to land on the right rows.
data['Confirmed_transformed'] = s1

## `Date preprocessing`

In [None]:
# Calendar features derived from the observation date
data['Month'] = data['Date'].dt.month
# ISO week number taken from each timestamp; Series.dt.week is deprecated and
# is no longer available in recent pandas releases
data['Week'] = data['Date'].map(lambda ts: ts.isocalendar()[1])
data['Day'] = data['Date'].dt.day

### `merge new data to countries`

In [None]:
countries_data = datasets['countries of the world']
countries_data.shape

In [None]:
# clean countries names: collapse repeated whitespace and trim both ends
countries_data['Country'] = countries_data['Country'].map(
    lambda name: ' '.join(name.split())
)

In [None]:
# find countries not match: names present in `data` but absent from countries_data
# The lookup set is built once; previously unique().tolist() was recomputed and
# scanned linearly for every candidate name.
known_countries = set(countries_data['Country'].unique().tolist())
countries_ls = [
    i for i in data['Country/Region'].unique().tolist()
    if i not in known_countries
]
len(countries_ls)

In [None]:
# NOTE(review): countries_ls was built from names that are NOT in
# countries_data['Country'], so this filter appears to remove nothing — confirm
# whether it is still needed.
countries_data = countries_data[~ countries_data['Country'].isin(countries_ls)]

In [None]:
data = data[~ data['Country/Region'].isin(countries_ls)]

    countries_ls

    # change country name
    countries_data.loc[countries_data['Country'] == 'Korea, South', 'Country'] = 'South Korea'

    # find countries not match
    countries_ls = [
        i for i in data['Country/Region'].unique().tolist()
        if i not in countries_data['Country'].unique().tolist()
    ]
    len(countries_ls)

In [None]:
# match countries: names of countries_data that also occur in `data`
# The lookup set is built once instead of recomputing unique().tolist() for
# every candidate name.
data_countries = set(data['Country/Region'].unique().tolist())
match_countries = [
    i for i in countries_data['Country'].unique().tolist()
    if i in data_countries
]
len(match_countries)

In [None]:
countries_df = countries_data[countries_data['Country'].isin(match_countries)].copy()
countries_df.shape

In [None]:
# rename country field
countries_df['Country/Region'] = countries_df['Country']

In [None]:
# merge
data = data.merge(countries_df, on='Country/Region')
data.shape

In [None]:
# fix Pop. Density (per sq. mi.) and Net migration
# Both columns hold numbers written with a decimal comma. Only string cells are
# converted; missing values and already-converted numbers pass through
# unchanged, so the cell no longer raises on NaN or when it is run twice.
for col in ['Pop. Density (per sq. mi.)', 'Net migration']:
    data[col] = [
        float(v.replace(',', '.')) if isinstance(v, str) else v
        for v in data[col]
    ]

### `Lags`

> Lag is expressed in a time unit (here days, since the data holds one row per country per day) and corresponds to the amount of data history we allow the model to use when making the prediction.

In [None]:
# Per-country lag features: for every lag i in n_lags..1 add the value i rows
# back, the n_lags-row rolling mean of that shifted series, and the lagged
# transformed value. Shifting is done inside each country group, so values never
# leak from one country into the next.
n_lags = 5
for i in range(n_lags, 0, -1):
    confirmed_by_country = data.groupby('Country/Region')['Confirmed']
    data[f'Confirmed_Lag_{i}'] = confirmed_by_country.shift(i)
    data[f'Confirmed_Rolling_Mean_Lag_{i}'] = confirmed_by_country.transform(
        lambda s: s.shift(i).rolling(n_lags).mean()
    )
    data[f'Confirmed_Transformed_Lag_{i}'] = (
        data.groupby('Country/Region')['Confirmed_transformed'].shift(i)
    )

In [None]:
data.shape

In [None]:
# get specific columns
# lag columns on the raw scale (plain lags and rolling means), alphabetically sorted
X_mask_lags = sorted(
    c for c in data.columns
    if 'Lag' in c and 'Transformed' not in c
)
# lag columns of the transformed series, in frame order
X_mask_tranformed_lags = [c for c in data.columns if 'Transformed_Lag' in c]

In [None]:
# fillna columns --> 0 (rows without enough history get 0 as lag value)
for lag_columns in (X_mask_lags, X_mask_tranformed_lags):
    data[lag_columns] = data[lag_columns].fillna(0)

In [None]:
# example
data[X_mask_lags].tail(6)

### `Encoding`

In [None]:
# country: integer code per country name
encoding_country = LabelEncoder()
data['Country_encoding'] = encoding_country.fit_transform(data['Country/Region'])

# region (disabled)
# encoding_region = LabelEncoder().fit(data['Region'])
# data['Region_encoding'] = encoding_region.transform(data['Region'])

# outbreak: integer code per outbreak label
encoding_outbreak = LabelEncoder()
data['Outbreak_encoding'] = encoding_outbreak.fit_transform(data['Outbreak'])

In [None]:
data.shape

In [None]:
# resave data
data.to_excel('./data/countries_lags.xlsx', index=False)

## `Machine Learning --> LGBM`

In [None]:
# Keep only rows with more than 1000 confirmed cases, then split 80/20 with a
# fixed seed.
# NOTE(review): shuffle=True mixes the dates of each country across both parts,
# so validation rows can be older than training rows — confirm this is intended
# for a forecasting model.
train, valid = train_test_split(data.loc[data['Confirmed'] > 1000], test_size=0.2, shuffle=True, random_state=7)

In [None]:
train.shape

In [None]:
valid.shape

In [None]:
model_lgbm = LGBMRegressor(n_estimators=500, metric='mae')

In [None]:
# X_mask_cat = ['Month', 'Week', 'Outbreak_encoding', 'Region_encoding']
X_mask_cat = ['Month', 'Week', 'Outbreak_encoding']

# X_mask_lags holds the plain lags first and the rolling-mean lags second;
# each half is put in descending name order (lag n_lags first, lag 1 last)
plain_lag_cols = sorted(X_mask_lags[:n_lags], reverse=True)
rolling_lag_cols = sorted(X_mask_lags[n_lags:], reverse=True)
X_cols = X_mask_cat + plain_lag_cols + rolling_lag_cols

# training target
Y = train['Confirmed']

In [None]:
# Fit on the training rows and evaluate on the validation rows.
# NOTE(review): early_stopping_rounds=500 equals the n_estimators=500 set when
# the model was created, so early stopping presumably never triggers before the
# final round.
# NOTE(review): early_stopping_rounds / verbose are passed to fit() here; newer
# LightGBM releases expect callbacks instead — confirm the installed version.
model_lgbm.fit(
    X=train[X_cols],
    y=Y,
    eval_set=(valid[X_cols], valid['Confirmed']),
    early_stopping_rounds=500,
    verbose=10
)

## `Exploratory`

    tmp = data[data['Outbreak'] == 'COVID-19']
    g = sns.FacetGrid(tmp, col='Country/Region', hue='Country/Region',
                      sharey=False, col_wrap=5)
    g.map(plt.plot, 'Date', 'Confirmed')
    g.set_xticklabels(rotation=90);

## `Machine Learning --> Prediction`

In [None]:
print (sorted(data['Country/Region'].unique().tolist()))

In [None]:
# Country to forecast, with a quick look at its COVID-19 curve
country = 'United States'
is_country_covid = (
    (data['Country/Region'] == country)
    & (data['Outbreak'] == 'COVID-19')
)
data.loc[is_country_covid, 'Confirmed'].plot()

In [None]:
# number of days to forecast
pred_steps = 20

# historical data: the COVID-19 rows of the selected country
is_country_covid = (
    (data['Country/Region'] == country)
    & (data['Outbreak'] == 'COVID-19')
)
history = data.loc[is_country_covid]

# last row of the history, the starting point of the forecast
history_ending = history.iloc[-1]

In [None]:
# pred_steps consecutive days, starting the day after the last observation
first_pred_day = history_ending['Date'] + timedelta(days=1)
dt_rng = pd.date_range(
    start=first_pred_day,
    periods=pred_steps,
    freq='D'
).values

In [None]:
# month, week and day of every forecast date
pred_dates = pd.Series(dt_rng)
pred_months = pred_dates.apply(lambda dt: dt.month)
pred_weeks = pred_dates.apply(lambda dt: dt.week)
pred_days = pred_dates.apply(lambda dt: dt.day)

In [None]:
# X_mask_cat & X_mask_lags --> get arrays from values
# Feature values of the last historical row. pred_lags follows the order of
# X_mask_lags (ascending names: plain lags 1..n_lags, then rolling-mean lags
# 1..n_lags), whereas X_cols used for training lists each half in descending
# order — the cells below reorder pred_lags before predicting.
pred_cat = history_ending[X_mask_cat].values
pred_lags = history_ending[X_mask_lags].values

# last observed cumulative count
y = history_ending['Confirmed']

In [None]:
# final value
print (f'Final value --> {y}')

In [None]:
print ('History ending cat', pred_cat)

In [None]:
print ('History ending lags', pred_lags)

### `Specific process - changing lags`

*`lags int`*

In [None]:
np.asarray(sorted(pred_lags[:n_lags]))

In [None]:
np.roll(np.asarray(sorted(pred_lags[:n_lags])), -1)

In [None]:
# Reorder the plain lags and shift the window by one step. sorted() serves as
# the reordering: it relies on the lag values being non-decreasing in time, so
# that ascending value order equals oldest-first order. np.roll(..., -1) then
# moves the oldest value into the last slot.
# NOTE: not idempotent — running this cell again rotates the window once more.
pred_lags[:n_lags] = np.roll(np.asarray(sorted(pred_lags[:n_lags])), -1)

In [None]:
pred_lags[:n_lags]

In [None]:
pred_lags[n_lags - 1]

In [None]:
# The last plain-lag slot takes y, the last observed value
pred_lags[n_lags - 1] = y

In [None]:
pred_lags[:n_lags]

*`lags mean`*

In [None]:
np.asarray(sorted(pred_lags[n_lags:]))

In [None]:
np.roll(np.asarray(sorted(pred_lags[n_lags:])), -1)

In [None]:
# Same reorder-and-shift for the rolling-mean half: sorted() again assumes the
# values are non-decreasing in time; np.roll(..., -1) moves the first value into
# the last slot.
# NOTE: not idempotent — running this cell again rotates the window once more.
pred_lags[n_lags:] = np.roll(np.asarray(sorted(pred_lags[n_lags:])), -1)

In [None]:
pred_lags[n_lags:]

In [None]:
pred_lags[-1]

In [None]:
np.mean(pred_lags[:n_lags])

In [None]:
# The last rolling-mean slot takes the mean of the n_lags plain lag values
pred_lags[-1] = np.mean(pred_lags[:n_lags])

In [None]:
pred_lags[n_lags:]

In [None]:
# build np zeros array
pred = np.zeros(pred_steps)

`pred_cat ---> ['Month', 'Week', 'Outbreak_encoding']` (the columns listed in `X_mask_cat`)

In [None]:
# Recursive forecast: predict one day, feed the prediction back into the lag
# window, repeat for pred_steps days.
for i in range(pred_steps):
    # calendar features of the day being predicted
    pred_cat[0] = pred_months[i]
    pred_cat[1] = pred_weeks[i]

    # one feature row: categorical values followed by the lag window; the arrays
    # come from a single mixed-type row, so they are cast to float explicitly
    features = np.hstack([pred_cat, pred_lags]).astype(float).reshape(1, -1)
    y = model_lgbm.predict(features)[0]
    print(f'Predicted was: {y}')

    # Lag: drop the oldest value and store the prediction as the newest one
    pred_lags[:n_lags] = np.roll(pred_lags[:n_lags], -1)
    pred_lags[n_lags-1] = y

    # rolling_mean: shift the window; the newest slot is the mean of the n_lags
    # plain lag values (pred_lags[:n_lags]), matching the window preparation
    # done before the loop. The previous code averaged the rolling-mean slots
    # themselves (pred_lags[n_lags:]).
    pred_lags[n_lags:] = np.roll(pred_lags[n_lags:], -1)
    pred_lags[-1] = np.mean(pred_lags[:n_lags])

    pred[i] = y

In [None]:
preds = pd.Series(data=pred, index=dt_rng, name='LGBM Regressor')

In [None]:
history[['Date', 'Confirmed']].tail()

In [None]:
preds

## `Plot Forecast`

In [None]:
fig, ax = plt.subplots(figsize=(16, 9))

# observed cumulative counts, indexed by date
observed = history.set_index(['Date'])['Confirmed']
hist = observed.plot(ax=ax, marker='o')
# forecast drawn on the same axes
preds.plot(ax=ax, marker='.', color='#FDA50F')
plt.legend(loc='upper left')
plt.tight_layout()
ax.set_xlabel('')
plt.box(False);

## `Save data`

In [None]:
# save output
output_path = f'./data/20200321Outputs/output_{"_".join(country.split(","))}.xlsx'
# create the output folder on first use; to_excel does not create directories
os.makedirs(os.path.dirname(output_path), exist_ok=True)
preds.to_excel(output_path)