In [38]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy.optimize
import matplotlib.dates as mdates
from datetime import datetime, timedelta
pd.set_option('display.max_rows', 500)

In [2]:
confirmed_df = pd.read_csv(r'../csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv')
confirmed_df.rename(columns={'Province/State': 'Province_State', 'Country/Region': 'Country_Region'}, inplace=True)
confirmed_df.head()

Unnamed: 0,Province_State,Country_Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/6/20,3/7/20,3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20
0,,Thailand,15.0,101.0,2,3,5,7,8,8,...,48,50,50,50,53,59,70,75,82,114
1,,Japan,36.0,138.0,2,1,2,2,4,4,...,420,461,502,511,581,639,639,701,773,839
2,,Singapore,1.2833,103.8333,0,1,3,3,4,5,...,130,138,150,150,160,178,178,200,212,226
3,,Nepal,28.1667,84.25,0,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,,Malaysia,2.5,112.5,0,0,0,3,4,4,...,83,93,99,117,129,149,149,197,238,428


In [None]:
confirmed_indexed_df = confirmed_df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])

In [None]:
confirmed_indexed_df.head()

In [None]:
confirmed_indexed_df.columns

In [None]:
confirmed_arr = confirmed_indexed_df.loc['Israel', :].values[3:]
# confirmed_arr = confirmed_indexed_df.loc['Italy', :].values[3:]
# confirmed_arr = confirmed_indexed_df.loc['Taiwan*', :].values[3:]
confirmed_arr

In [None]:
plt.figure()
plt.plot(confirmed_arr, '--*')
plt.show()

In [None]:
def exponential(x, a, k, b):
    return a*np.exp(x*k) + b

In [None]:
confirmed_arr.shape

In [None]:
y_array_exp = confirmed_arr[30:]
x_array = np.array(range(len(y_array_exp)))
y_array_exp.shape, x_array.shape

In [None]:
popt_exponential, pcov_exponential = scipy.optimize.curve_fit(exponential, x_array, y_array_exp)

In [None]:
a, k, b = popt_exponential
a, k, b 

In [None]:
np.exp(k), np.exp(k*7)

In [None]:
estimated = exponential(x_array, a, k, b)
estimated

In [None]:
plt.figure()
plt.plot(estimated)
# plt.hold(True)
plt.plot(y_array_exp, '--*')

In [None]:
%matplotlib notebook
plt.figure(figsize=(9, 8))
for iteration in range(0, 8):
    plt.subplot(331 + iteration)
    n_days = int(iteration * 7)
    x_long_term = range(len(y_array_exp) + n_days)
    y_forecast = exponential(x_long_term, a, k, b)
    max_val = y_forecast.max() 
    plt.plot(estimated)
    plt.plot(y_array_exp, '--*')
    plt.plot(y_forecast, '--b')
    plt.title(f'weeks from now: {iteration}\ncases:{int(max_val):,}', y=.5)

In [None]:
confirmed_indexed_df.head()

In [None]:
a = confirmed_indexed_df.iloc[0].to_frame('count').reset_index()
a.rename(columns={'index': 'date'}, inplace=True)
a.head()

In [None]:
confirmed_indexed_df.head()

In [None]:
def get_dates_count_df(a_series):
    df = a_series.to_frame('count').reset_index()
    df.rename(columns={'index': 'date'}, inplace=True)
    return df

In [None]:
confirmed_indexed_df.apply(get_dates_count_df, axis=1)

In [None]:
confirmed_indexed_df

In [None]:
confirmed_indexed_df.iloc[0]

In [None]:
grpby = confirmed_df.groupby(['Country/Region', 'Province/State', 'Lat', 'Long'])

In [None]:
next(iter(grpby))[1]

In [None]:
a_series = confirmed_df.iloc[0]
dates = a_series.index[4:].values
counts = a_series.values[4:]
dates, counts

In [None]:
df = pd.DataFrame({'dates': dates, 'counts': counts})
df['Province/State'] = a_series['Province/State'] 
df['Country/Region'] = a_series['Country/Region'] 
df['Lat'] = a_series['Lat'] 
df['Long'] = a_series['Long'] 
df.head()

In [None]:
a_series.head()

# phase 2

In [3]:
def create_df(a_series):
    dates = a_series.index[4:].values
    counts = a_series.values[4:]
    df = pd.DataFrame({'date': dates, 'count': counts})
    df['Province_State'] = a_series['Province_State'] 
    df['Country_Region'] = a_series['Country_Region'] 
    df['Lat'] = a_series['Lat'] 
    df['Long'] = a_series['Long'] 
    return df

In [4]:
confirmed_df_2 = pd.concat(confirmed_df.apply(create_df, axis=1).values).set_index(['Country_Region', 'Province_State', 'Lat', 'Long'])
confirmed_df_2['date'] = pd.to_datetime(confirmed_df_2['date'])
confirmed_df_2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1
Thailand,,15.0,101.0,2020-01-22,2
Thailand,,15.0,101.0,2020-01-23,3
Thailand,,15.0,101.0,2020-01-24,5
Thailand,,15.0,101.0,2020-01-25,7
Thailand,,15.0,101.0,2020-01-26,8


In [5]:
country_df = confirmed_df_2.query('Country_Region == "Israel"')
country_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1
Israel,,31.0,35.0,2020-01-22,0
Israel,,31.0,35.0,2020-01-23,0
Israel,,31.0,35.0,2020-01-24,0
Israel,,31.0,35.0,2020-01-25,0
Israel,,31.0,35.0,2020-01-26,0


In [6]:
%matplotlib notebook
country_df.plot(x='date', y='count', marker='*')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fe6d1232d50>

In [12]:
def exponential(x, a, k, b):
    return a*np.exp(x*k) + b

In [13]:
non_zero_country_df = country_df[country_df['count'] > 0]
y_array_exp = non_zero_country_df['count'].values
ref_date = datetime(2020, 1, 1)
x_array = (non_zero_country_df['date'] - ref_date).dt.days.values.astype(float)

In [14]:
plt.figure()
plt.plot(x_array, y_array_exp, '--*')
plt.show()

<IPython.core.display.Javascript object>

In [23]:
x_array.shape

(24,)

In [28]:
sample = 22
popt_exponential, pcov_exponential = scipy.optimize.curve_fit(exponential, x_array[:sample], y_array_exp[:sample], p0=
(0.0021819439144763727, 0.2196989290489097, -0.8576298838011189))
a, k, b = popt_exponential
a, k, b 

(2.348866204624933e-05, 0.21908245786135838, -2.9729230653750025)

In [29]:
estimated = exponential(x_array, a, k, b)

In [30]:
non_zero_country_df['estimated'] = estimated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
plt.figure()
ax = plt.gca()
non_zero_country_df.plot(ax=ax, x='date', y='estimated' ,marker='*')
non_zero_country_df.plot(ax=ax, x='date', y='count' ,marker='*')
plt.show()

<IPython.core.display.Javascript object>

In [32]:
non_zero_country_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count,estimated
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Israel,,31.0,35.0,2020-02-21,1,-1.300602
Israel,,31.0,35.0,2020-02-22,1,-0.890994
Israel,,31.0,35.0,2020-02-23,1,-0.381058
Israel,,31.0,35.0,2020-02-24,1,0.253777
Israel,,31.0,35.0,2020-02-25,1,1.044105
Israel,,31.0,35.0,2020-02-26,2,2.028012
Israel,,31.0,35.0,2020-02-27,3,3.252911
Israel,,31.0,35.0,2020-02-28,4,4.777829
Israel,,31.0,35.0,2020-02-29,7,6.676251
Israel,,31.0,35.0,2020-03-01,10,9.039661


In [35]:
last_date = non_zero_country_df['date'].iloc[-1]
last_date

Timestamp('2020-03-15 00:00:00')

In [58]:
extrapolation_dates_series = pd.Series([last_date + timedelta(days=k) for k in range(-2, 6)])
extrapolation_x_array = (extrapolation_dates_series - ref_date).dt.days.values.astype(float)

In [59]:
extrapolation_estimated = exponential(extrapolation_x_array, a, k, b)

In [60]:
extrapolation_df = pd.DataFrame({'date': extrapolation_dates_series, 'estimated': extrapolation_estimated})
extrapolation_df

Unnamed: 0,date,estimated
0,2020-03-13,163.518575
1,2020-03-14,204.297992
2,2020-03-15,255.065671
3,2020-03-16,318.268077
4,2020-03-17,396.950897
5,2020-03-18,494.905808
6,2020-03-19,616.853201
7,2020-03-20,768.669648


In [61]:
plt.figure()
ax = plt.gca()
non_zero_country_df.plot(ax=ax, x='date', y='estimated' ,marker='*')
non_zero_country_df.plot(ax=ax, x='date', y='count' ,marker='*')
extrapolation_df.plot(ax=ax, x='date', y='estimated' ,marker='o')
plt.show()

<IPython.core.display.Javascript object>