In [2]:
import pandas as pd
import os
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# The bare 'seaborn' style sheet was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed, so pick whichever name this install has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Figures are written to plots/*.png; make sure the directory exists so
# savefig does not fail on a fresh checkout.
os.makedirs('plots', exist_ok=True)

---

### data import

In [3]:
# Location of the Excel snapshot (file name carries the date 2020-03-10).
source_file = os.path.join(
    'data',
    'COVID-19-geographic-disbtribution-worldwide-2020-03-10.xls',
)
df = pd.read_excel(source_file)

# (rows, columns) of the freshly loaded frame
df.shape

(4652, 6)

### data cleaning

In [None]:
# Tidy the frame in a single chained expression that builds a new object.
# This keeps the cell safe to re-run (renaming an already-renamed column is a
# no-op) and avoids assigning columns on a filtered slice, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
df = (
    df
    # clean up column names
    .rename(columns={'DateRep': 'Date', 'CountryExp': 'Country'})
    # we filter out China to get rid of the extreme values
    .loc[lambda frame: frame['Country'].ne('China')]
    .assign(
        # convert Date column to datetime type; `infer_datetime_format` is
        # deprecated in pandas 2.x, where format inference is the default
        Date=lambda frame: pd.to_datetime(frame['Date']),
        # relabel the "international conveyance" entries as Japan;
        # regex=False makes this a literal substring replacement on every
        # pandas version
        Country=lambda frame: frame['Country'].str.replace(
            'Cases on an international conveyance Japan', 'Japan', regex=False
        ),
    )
)
df.head()

### visualizations

In [None]:
# Total of NewConfCases per Date over all rows currently in df.
cases_sum = df.groupby('Date')['NewConfCases'].sum()

# Draw on an explicitly created figure/axes pair, then save and show it.
fig, ax = plt.subplots(figsize=(15, 9))
cases_sum.plot(ax=ax)
ax.set_title('Amount of confirmed cases over time', size=13)
fig.savefig('plots/plt1.png', dpi=100)
plt.show()

In [None]:
# Running total of the per-date case counts.
cases_cumsum = cases_sum.cumsum()

# Draw on an explicitly created figure/axes pair, then save and show it.
fig, ax = plt.subplots(figsize=(15, 9))
cases_cumsum.plot(ax=ax)
ax.set_title('Cumulative sum of confirmed cases over time', size=13)
fig.savefig('plots/plt2.png')
plt.show()

In [None]:
# Natural log of the cumulative case count, restricted to dates where the
# running total is greater than 1, as a two-column frame (Date, NewConfCases).
# NOTE: the 'NewConfCases' column holds log(cumulative cases) from here on;
# the column name is simply inherited from the source series.
log_cumsum = np.log(cases_cumsum[cases_cumsum.gt(1)]).to_frame().reset_index()

# Replace each date by its day-of-year so it can be used as a numeric
# regressor. Day-of-year restarts at 1 every January, so this axis is only
# monotonic while the data stays inside one calendar year.
log_cumsum['Date'] = log_cumsum['Date'].dt.dayofyear

fig, ax = plt.subplots(figsize=(15, 9))
ax.set_title("Regression line in logarithmic scale")
sns.regplot(x='Date', y='NewConfCases', data=log_cumsum, ax=ax)
ax.set_xlabel('Day of year')
# The plotted quantity is the log of the cumulative total, not of the daily
# new cases, so label it accordingly.
ax.set_ylabel('Cumulative confirmed cases on logarithmic scale')
plt.show()

In [None]:
# Fit the same relationship the regression plot shows:
#     log(cumulative cases) = const + slope * day_of_year
# The regressor is the day of year and the response is the log-cumulative
# count (previously the two were swapped). sm.OLS does not add an intercept
# on its own, so a constant column is added explicitly; without it the line
# is forced through the origin.
X = sm.add_constant(log_cumsum['Date'])
y = log_cumsum['NewConfCases']

model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # fitted log-cumulative values for each day

# Print out the statistics
model.summary()

In [None]:
# Same cumulative series as before, drawn with a logarithmic y-axis.
fig, ax = plt.subplots(figsize=(15, 9))
cases_cumsum.plot(ax=ax, logy=True)
ax.set_title('Cumulative sum of confirmed cases over time', size=13)
fig.savefig('plots/plt3.png')
plt.show()

In [None]:
# Replace zero totals by 1 so the ratio below never divides by zero.
cases_cumsum = cases_cumsum.replace(0, 1)

# Day-over-day growth factor: today's cumulative total divided by the
# previous day's total (equivalently 1 + diff / previous). The earlier
# version divided the daily difference by the *current* day's total, which
# yields 2 - previous/current rather than growth relative to the previous
# day. The first date has no predecessor and is set to 1 (no growth).
growth_factor = cases_cumsum.div(cases_cumsum.shift()).fillna(1)

fig, ax = plt.subplots(figsize=(15, 9))
growth_factor.plot(ax=ax)
ax.set_title('Growth proportion considering previous day', size=13)
ax.set_ylabel('Growth proportion')
fig.savefig('plots/plt4.png')
plt.show()

In [None]:
# Largest single-row NewConfCases value per country; show the top 15.
peak_new_cases = df.groupby('Country')['NewConfCases'].max()
peak_new_cases.sort_values(ascending=False).head(15)

In [None]:
# Sum of NewConfCases per country; show the top 15.
total_new_cases = df.groupby('Country')['NewConfCases'].sum()
total_new_cases.sort_values(ascending=False).head(15)

---