In [128]:
import os

import numpy as np 
import pandas as pd

# Per-observation dataset. Only 'Last Update' is parsed as a datetime here; the
# observation date (renamed to 'Date' below) keeps whatever dtype read_csv infers.
df = pd.read_csv('data/novel-corona-virus-2019-dataset/covid_19_data.csv', parse_dates=['Last Update'])
df = df.rename(columns={'ObservationDate': 'Date', 'Country/Region': 'Country'})

# Time-series CSVs for confirmed, recovered and death counts. The country column
# is renamed so all frames share the same 'Country' key.
df_confirmed = pd.read_csv(
    "data/novel-corona-virus-2019-dataset/time_series_covid_19_confirmed.csv"
).rename(columns={'Country/Region': 'Country'})
df_recovered = pd.read_csv(
    "data/novel-corona-virus-2019-dataset/time_series_covid_19_recovered.csv"
).rename(columns={'Country/Region': 'Country'})
df_deaths = pd.read_csv(
    "data/novel-corona-virus-2019-dataset/time_series_covid_19_deaths.csv"
).rename(columns={'Country/Region': 'Country'})

# Select the column BEFORE aggregating. Summing the whole frame also tries to sum
# the datetime 'Last Update' column and the string columns, which raises on
# pandas >= 2.0 and is wasted work on older versions.
deaths = df.groupby('Date')['Deaths'].sum().reset_index()
recovered = df.groupby('Date')['Recovered'].sum().reset_index()

# 'Day' is the 0-based position of each date in the grouped result.
deaths["Day"] = deaths.index
recovered["Day"] = recovered.index

In [None]:
# Preview the first rows of the confirmed-cases time-series frame.
df_confirmed.head()

In [None]:
# Preview the last rows of the confirmed-cases time-series frame.
df_confirmed.tail()

In [129]:
import datetime 

# Total confirmed cases per observation date. The column is selected before
# summing so the aggregation never touches the datetime/string columns of `df`
# (summing those raises on pandas >= 2.0).
confirmed = df.groupby('Date')['Confirmed'].sum().reset_index()
confirmed["Datetime"] = confirmed["Date"].astype('datetime64[ns]')

# groupby orders rows by the raw 'Date' values (strings here), which is not
# guaranteed to be chronological. Re-order by the parsed timestamp so that
# 'Day' really counts days in time order. This is a no-op when already ordered.
confirmed = confirmed.sort_values("Datetime", kind="stable").reset_index(drop=True)
confirmed["Day"] = confirmed.index

# Everything up to and including the cutoff is "past"; later rows are "future".
SPLIT_DATE = datetime.datetime(2020, 4, 1)
past = confirmed.loc[confirmed['Datetime'] <= SPLIT_DATE]
future = confirmed.loc[confirmed['Datetime'] > SPLIT_DATE]


In [None]:
import seaborn as sbs
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

sbs.set_style(
    {
        "axes.facecolor": "white", 
        "axes.grid": True,
        "xtick.bottom": True,
        "axes.spines.right": False,
        'axes.spines.top': False
    })

# Draw both periods on the same axes; the second call is pointed at the axes
# returned by the first instead of relying on the implicit "current axes".
total = sbs.scatterplot(x=past["Day"], y=past["Confirmed"])
sbs.scatterplot(x=future["Day"], y=future["Confirmed"], ax=total)

total.set(xlabel="Days", ylabel="Confirmed Infections")

# Thousands-separated y tick labels. A formatter is used instead of
# set_yticklabels(): fixed label strings are not tied to tick positions, so they
# can end up on the wrong ticks if the axis is rescaled, and matplotlib warns.
total.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '{:,}'.format(int(value)))
)

plt.show()

In [130]:
import plotly.graph_objects as go

# Grouped bar chart of the daily totals for confirmed, deaths and recovered.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=confirmed['Date'],
    y=confirmed['Confirmed'],
    name='Confirmed',
    marker_color='blue',
))
fig.add_trace(go.Bar(
    x=deaths['Date'],
    y=deaths['Deaths'],
    name='Deaths',
    marker_color='Red',
))
fig.add_trace(go.Bar(
    x=recovered['Date'],
    y=recovered['Recovered'],
    name='Recovered',
    marker_color='Green',
))

fig.update_layout(
    title="COVID19 World Data",
    title_x=0.5,
    xaxis_title="Date",
    xaxis={
        'tickformat': '%b',
    },
    yaxis_title="Number of Cases",
    font={
        "family": "Courier New, monospace",
    },
    legend={
        "x": 0,
        "y": 1
    },
    shapes=[
        # Shade the period from the 04/01/2020 cutoff to the latest observation.
        dict(
            type="rect",
            # x-reference is assigned to the x-values
            xref="x",
            # y-reference is assigned to the plot paper [0,1]
            yref="paper",
            x0="04/01/2020",
            y0=0,
            # Take the chronologically latest date via the parsed 'Datetime'
            # column; max() on the raw 'Date' strings compares lexicographically.
            x1=confirmed.loc[confirmed["Datetime"].idxmax(), "Date"],
            y1=1,
            fillcolor="LightSalmon",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    ],
)

# write_image does not create missing folders, so make sure the target exists.
os.makedirs("images", exist_ok=True)
fig.write_image("images/world_all.png")
fig.show()

In [None]:
# Share of all observed days that fall in the "past" (on/before cutoff) window.
len(past) / sum(map(len, (past, future)))

In [131]:
from sklearn.model_selection import train_test_split

# Random 70/30 split of the pre-cutoff days. Features and targets are both
# reshaped to column vectors; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    past["Day"].values.reshape(-1, 1),
    past["Confirmed"].values.reshape(-1, 1),
    test_size=0.30,
    random_state=42,
)

In [132]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the day index into polynomial terms up to degree 4 (bias column included).
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(X_train)

# Ordinary least squares on the polynomial features.
lr = LinearRegression()
lr.fit(x_poly, y_train)

# Fitted intercept.
print(lr.intercept_)
# Fitted coefficients, one per polynomial term. The bias-column coefficient is 0
# because the intercept is estimated separately.
print(lr.coef_)

# Reuse the transformer fitted on the training data: only training data should
# ever go through fit(), so evaluation and prediction inputs use transform().
y_pred_test = lr.predict(poly_reg.transform(X_test))
y_pred = lr.predict(poly_reg.transform(confirmed["Day"].values.reshape(-1, 1)))


# df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()}).head()
# df

[18481.71870124]
[[ 0.00000000e+00 -8.39060248e+03  9.94034585e+02 -3.02098274e+01
   2.91646218e-01]]


In [133]:
# Actual confirmed totals (line) against the polynomial model's predictions
# for every day (markers).
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=confirmed["Date"],
        y=confirmed["Confirmed"],
        mode="lines",
        name="Actual",
    )
)

fig.add_trace(
    go.Scatter(
        x=confirmed["Date"],
        y=y_pred.flatten(),  # predictions come back as a column vector
        mode="markers",
        name="Polynomial Model",
        opacity=0.5,
    )
)

fig.update_layout(
    # Title typo fixed ("Peformance" -> "Performance").
    title="Polynomial Model Performance",
    title_x=0.5,
    xaxis_title="Date",
    xaxis={
        'tickformat': '%b',
        # One tick at the start of each month in the covered range.
        'tickvals': pd.date_range('2020-1', '2020-4', freq='MS')
    },
    yaxis_title="Number of Infected",
)

fig.show()

In [None]:
# Refit the same degree-4 polynomial model, this time on a random 70/30 split of
# ALL observed days (not just those up to the cutoff).
# NOTE: this rebinds X_train / X_test / y_train / y_test / poly_reg / x_poly.
X_train, X_test, y_train, y_test = train_test_split(
    confirmed["Day"].values.reshape(-1, 1),
    confirmed["Confirmed"].values.reshape(-1, 1),
    test_size=0.30,
    random_state=42,
)

poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(X_train)

lr_full = LinearRegression()
lr_full.fit(x_poly, y_train)

# Fitted intercept.
print(lr_full.intercept_)
# Fitted coefficients, one per polynomial term.
print(lr_full.coef_)

# Reuse the transformer fitted on the training data; prediction inputs go
# through transform(), never fit_transform().
y_pred_full_test = lr_full.predict(poly_reg.transform(X_test))
y_pred_full = lr_full.predict(poly_reg.transform(confirmed["Day"].values.reshape(-1, 1)))

In [134]:
# Compare the actual confirmed totals with both fitted models: the one trained
# on data up to the cutoff and the one trained on all observed days.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=confirmed["Date"],
        y=confirmed["Confirmed"],
        mode="lines",
        name="Actual",
    )
)

fig.add_trace(
    go.Scatter(
        x=confirmed["Date"],
        y=y_pred.flatten(),
        mode="markers",
        name="Model (March 31st)",
        opacity=0.5,
    )
)

fig.add_trace(
    go.Scatter(
        x=confirmed["Date"],
        y=y_pred_full.flatten(),
        mode="markers",
        name="Model (Present)",
        opacity=0.5,
    )
)

fig.update_layout(
    # Title typo fixed ("Peformance" -> "Performance").
    title="Polynomial Model Performance",
    title_x=0.5,
    xaxis_title="Date",
    xaxis={
        'tickformat': '%b',
        # One tick at the start of each month in the covered range.
        'tickvals': pd.date_range('2020-1', '2020-4', freq='MS')
    },
    yaxis_title="Number of Cases",
    font={
        "family": "Courier New, monospace",
    },
    # Legend sits just inside the top-left corner of the plot area.
    legend={
        "x": 0.05,
        "y": 1-.05
    }
)

# write_image does not create missing folders, so make sure the target exists.
os.makedirs("images", exist_ok=True)
fig.write_image("images/polynomial_full.png")
fig.show()