In [21]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy.optimize
import matplotlib.dates as mdates
from datetime import datetime, timedelta
pd.set_option('display.max_rows', 500)

In [22]:
confirmed_df = pd.read_csv(r'../csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
confirmed_df.rename(columns={'Province/State': 'Province_State', 'Country/Region': 'Country_Region'}, inplace=True)
confirmed_df.head()

Unnamed: 0,Province_State,Country_Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,74,84,94,110,110,120,170,174,237,273
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,123,146,174,186,197,212,223,243,259,277
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,264,302,367,409,454,511,584,716,847,986
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,164,188,224,267,308,334,370,376,390,428
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,3,3,4,4,5,7,7,7,8,8


In [23]:
def create_df(a_series):
    dates = a_series.index[4:].values
    counts = a_series.values[4:]
    df = pd.DataFrame({'date': dates, 'count': counts})
    df['Province_State'] = a_series['Province_State'] 
    df['Country_Region'] = a_series['Country_Region'] 
    df['Lat'] = a_series['Lat'] 
    df['Long'] = a_series['Long'] 
    return df

In [24]:
confirmed_df_2 = pd.concat(confirmed_df.apply(create_df, axis=1).values).set_index(['Country_Region', 'Province_State', 'Lat', 'Long'])
confirmed_df_2['date'] = pd.to_datetime(confirmed_df_2['date'])
confirmed_df_2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,,33.0,65.0,2020-01-22,0
Afghanistan,,33.0,65.0,2020-01-23,0
Afghanistan,,33.0,65.0,2020-01-24,0
Afghanistan,,33.0,65.0,2020-01-25,0
Afghanistan,,33.0,65.0,2020-01-26,0


In [25]:
a = confirmed_df_2.reset_index()
b = a.groupby(['Country_Region', 'date']).agg({'Lat': 'mean', 'Long': 'mean', 'count': 'sum'})
countries_df = b.reset_index().set_index(['Country_Region', 'Lat', 'Long'])
countries_df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,count
Country_Region,Lat,Long,Unnamed: 3_level_1,Unnamed: 4_level_1
Zimbabwe,-20.0,30.0,2020-03-29,7
Zimbabwe,-20.0,30.0,2020-03-30,7
Zimbabwe,-20.0,30.0,2020-03-31,8
Zimbabwe,-20.0,30.0,2020-04-01,8
Zimbabwe,-20.0,30.0,2020-04-02,9


In [26]:
country_df = confirmed_df_2.query('Country_Region == "Israel"')
# country_df = countries_df.query('Country_Region == "US"')
# country_df = confirmed_df_2.query('Country_Region == "Spain"')
# country_df = confirmed_df_2.query('Country_Region == "Italy"')
country_df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1
Israel,,31.0,35.0,2020-03-29,4247
Israel,,31.0,35.0,2020-03-30,4695
Israel,,31.0,35.0,2020-03-31,5358
Israel,,31.0,35.0,2020-04-01,6092
Israel,,31.0,35.0,2020-04-02,6857


In [27]:
last_date = country_df['date'].iloc[-1]
last_date

Timestamp('2020-04-02 00:00:00')

In [28]:
values = list(country_df.index[0]) + [last_date + timedelta(days=1), 945]
columns = list(country_df.index.names) + country_df.columns.tolist()
values, columns
row_df = pd.DataFrame([values], columns=columns).set_index(country_df.index.names)
row_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1
Israel,,31.0,35.0,2020-04-03,945


In [29]:
# country_updated_df = country_df.append(row_df)
# country_updated_df.tail()
country_updated_df = country_df

In [30]:
%matplotlib notebook
country_updated_df.plot(x='date', y='count', marker='*')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f775d1a6c10>

In [31]:
def exponential(x, a, k, b):
    return a*np.exp(x*k) + b

In [32]:
non_zero_country_df = country_updated_df[country_updated_df['count'] > 0]
y_array_exp = non_zero_country_df['count'].values
ref_date = datetime(2020, 1, 1)
x_array = (non_zero_country_df['date'] - ref_date).dt.days.values.astype(float)

In [33]:
last_model_date = country_updated_df['date'].iloc[-1]
# last_model_date = datetime(2020, 3, 18)
last_model_index = np.argmin((non_zero_country_df['date'] - last_model_date).abs().values)
last_model_index

41

In [34]:
popt_exponential, pcov_exponential = scipy.optimize.curve_fit(exponential, x_array[:last_model_index], y_array_exp[:last_model_index], p0=
(0.0021819439144763727, 0.2196989290489097, -0.8576298838011189))
a, k, b = popt_exponential
a, k, b, np.exp(k), np.exp(k*7)

(0.001555723556360772,
 0.16766058668129652,
 -92.3134611327451,
 1.182535174337244,
 3.2336906685108877)

In [35]:
estimated = exponential(x_array, a, k, b)
non_zero_country_df['estimated'] = estimated
estimated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array([-8.42698720e+01, -8.28016341e+01, -8.10653911e+01, -7.90122227e+01,
       -7.65842788e+01, -7.37131497e+01, -7.03179387e+01, -6.63029821e+01,
       -6.15551548e+01, -5.59406820e+01, -4.93013704e+01, -4.14501509e+01,
       -3.21658077e+01, -2.11867454e+01, -8.20361789e+00,  7.14938701e+00,
        2.53048553e+01,  4.67743352e+01,  7.21627504e+01,  1.02185444e+02,
        1.37688336e+02,  1.79671754e+02,  2.29318623e+02,  2.88027791e+02,
        3.57453448e+02,  4.39551729e+02,  5.36635835e+02,  6.51441204e+02,
        7.87202591e+02,  9.47745207e+02,  1.13759250e+03,  1.36209360e+03,
        1.62757404e+03,  1.94151401e+03,  2.31275906e+03,  2.75176939e+03,
        3.27091455e+03,  3.88482196e+03,  4.61078907e+03,  5.46927071e+03,
        6.48445544e+03,  7.68494710e+03])

In [36]:
plt.figure()
ax = plt.gca()
non_zero_country_df.plot(ax=ax, x='date', y='estimated' ,marker='*')
non_zero_country_df.plot(ax=ax, x='date', y='count' ,marker='*')
plt.show()

<IPython.core.display.Javascript object>

In [37]:
non_zero_country_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,count,estimated
Country_Region,Province_State,Lat,Long,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Israel,,31.0,35.0,2020-02-21,1,-84.269872
Israel,,31.0,35.0,2020-02-22,1,-82.801634
Israel,,31.0,35.0,2020-02-23,1,-81.065391
Israel,,31.0,35.0,2020-02-24,1,-79.012223
Israel,,31.0,35.0,2020-02-25,1,-76.584279
Israel,,31.0,35.0,2020-02-26,2,-73.71315
Israel,,31.0,35.0,2020-02-27,3,-70.317939
Israel,,31.0,35.0,2020-02-28,4,-66.302982
Israel,,31.0,35.0,2020-02-29,7,-61.555155
Israel,,31.0,35.0,2020-03-01,10,-55.940682


In [38]:
extrapolation_dates_series = pd.Series([last_model_date + timedelta(days=k) for k in range(1, 15)])
extrapolation_x_array = (extrapolation_dates_series - ref_date).dt.days.values.astype(float)
extrapolation_estimated = exponential(extrapolation_x_array, a, k, b)
extrapolation_df = pd.DataFrame({'date': extrapolation_dates_series, 'estimated': extrapolation_estimated})
extrapolation_df

Unnamed: 0,date,estimated
0,2020-04-03,9104.570717
1,2020-04-04,10783.325573
2,2020-04-05,12768.512241
3,2020-04-06,15116.065302
4,2020-04-07,17892.129371
5,2020-04-08,21174.922779
6,2020-04-09,25056.941454
7,2020-04-10,29647.565084
8,2020-04-11,35076.138999
9,2020-04-12,41495.6186


In [39]:
plt.figure()
ax = plt.gca()
non_zero_country_df.plot(ax=ax, x='date', y='estimated' ,marker='*')
non_zero_country_df.plot(ax=ax, x='date', y='count' ,marker='*')
extrapolation_df.plot(ax=ax, x='date', y='estimated' ,marker='o')
plt.show()

<IPython.core.display.Javascript object>

In [40]:
# %matplotlib notebook
# plt.figure(figsize=(9, 8))
# for iteration in range(0, 8):
#     plt.subplot(331 + iteration)
#     n_days = int(iteration * 7)
#     x_long_term = range(len(y_array_exp) + n_days)
#     y_forecast = exponential(x_long_term, a, k, b)
#     max_val = y_forecast.max() 
#     plt.plot(estimated)
#     plt.plot(y_array_exp, '--*')
#     plt.plot(y_forecast, '--b')
#     plt.title(f'weeks from now: {iteration}\ncases:{int(max_val):,}', y=.5)

# countries growth per week

In [41]:
def get_growth_rate(df):
    try:
        non_zero_country_df = df[df['count'] > 0]
        y_array_exp = non_zero_country_df['count'].values
        ref_date = datetime(2020, 1, 1)
        x_array = (non_zero_country_df['date'] - ref_date).dt.days.values.astype(float)
        popt_exponential, pcov_exponential = scipy.optimize.curve_fit(exponential, x_array[:last_model_index], y_array_exp[:last_model_index], p0=
        (0.0021819439144763727, 0.2196989290489097, -0.8576298838011189))
        a, k, b = popt_exponential
        estimated = exponential(x_array, a, k, b)
        max_error_in_percentage = np.abs(estimated - y_array_exp).max() / y_array_exp.max() * 100
        return_values = [np.exp(k), np.exp(k*7), max_error_in_percentage, y_array_exp.max()]        
    except:
        return_values = [None, None, None, None]
    names = ['day_rate', 'week_rate', 'max_error_in_percentage', 'confirmed_cases']
    return pd.Series(return_values, index=names)

In [42]:
countries_stat_df = countries_df.groupby('Country_Region').apply(get_growth_rate)
countries_stat_df



Unnamed: 0_level_0,day_rate,week_rate,max_error_in_percentage,confirmed_cases
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,1.181714,3.218004,6.862914,273.0
Albania,1.068022,1.58512,6.834851,277.0
Algeria,1.164378,2.901728,3.879769,986.0
Andorra,1.109564,2.070464,10.55894,428.0
Angola,1.034161,1.265081,14.0267,8.0
Antigua and Barbuda,1.069645,1.602057,27.42642,9.0
Argentina,1.134465,2.41845,9.773749,1133.0
Armenia,1.11047,2.08232,5.034682,663.0
Australia,1.276651,5.527192,652.4627,5116.0
Austria,1.101997,1.973622,12.33197,11129.0


In [43]:
countries_stat_sorted_df = countries_stat_df.dropna().round(4).sort_values(by='confirmed_cases', ascending=False)
countries_stat_sorted_df

Unnamed: 0_level_0,day_rate,week_rate,max_error_in_percentage,confirmed_cases
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
US,1.176,3.1106,94.3708,243453.0
Italy,1.2189,3.9982,772.0554,115242.0
Spain,1.3299,7.3577,818.5457,112065.0
Germany,1.4043,10.7727,6627.6538,84794.0
China,0.0,0.0,53.5244,82432.0
France,1.37,9.0583,4354.283,59929.0
Iran,1.0649,1.5533,2.7591,50468.0
United Kingdom,1.2624,5.1084,137.3826,34173.0
Switzerland,1.0877,1.8009,9.2752,18827.0
Turkey,1.2145,3.8985,5.5045,18135.0


In [None]:
# countries_stat_sorted_df.to_excel('countries_stat.xlsx')