In [1]:
import pandas as pd
import numpy as np 

df = pd.read_csv('../data/novel-corona-virus-2019-dataset/covid_19_data.csv',parse_dates=['Last Update'])
df.rename(columns={'ObservationDate':'Date', 'Country/Region':'Country'}, inplace=True)

df["Eradicated"] = df["Deaths"] + df["Recovered"]
df["Active"] = df["Confirmed"] - df["Eradicated"]

df["Country"].replace(["Mainland China"], ["China"], inplace=True)
df["Country"].replace(["US"], ["United States"], inplace=True)
df["Country"].replace(["UK"], ["United Kingdom"], inplace=True)

# df = df.groupby('Date').sum()['Deaths'].reset_index()
# df["Days"] = df.index

past = df.loc[(df['Date'] == "04/01/2020")]
future = df.loc[(df['Date'] > "04/01/2020")]

In [2]:
import pycountry_convert as pc

def get_continent(row):
    continents = {
        'NA': 'North America',
        'SA': 'South America', 
        'AS': 'Asia',
        'OC': 'Australia',
        'AF': 'Africa',
        'EU': 'European Union'
    }

    country = row["Country"]
    try: 
        country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
        return continents[pc.country_alpha2_to_continent_code(country_code)]
    except:
        return None         

df["Continent"] = df.apply (lambda row: get_continent(row), axis=1)

# NA = ['US', 'Canada', 'Mexico']
# Asia = ['Iran', 'South Korea', 'Japan']
# Africa = ["Morocco", "Egypt", "Algeria"]
# Europe = ["Spain", "Italy", "Germany"]
# SA = ["Brazil", "Peru", "Ecuador"]


In [3]:
continents = df.groupby("Continent").sum().sort_values(by=['Confirmed'], ascending=False).reset_index()
continents

Unnamed: 0,Continent,SNo,Confirmed,Deaths,Recovered,Eradicated,Active
0,European Union,23870688,14081179.0,1053364.0,2648989.0,3702353.0,10378826.0
1,Asia,29540445,8789343.0,320626.0,4396963.0,4717589.0,4071754.0
2,North America,34376605,7352522.0,231780.0,434879.0,666659.0,6685863.0
3,South America,4269544,589768.0,21067.0,57760.0,78827.0,510941.0
4,Africa,15038283,193531.0,8745.0,24893.0,33638.0,159893.0
5,Australia,4295643,137727.0,845.0,24061.0,24906.0,112821.0


In [4]:
df[df["Continent"] == "Australia"].groupby("Country").sum().sort_values(by=['Confirmed'], ascending=False).reset_index().head()


Unnamed: 0,Country,SNo,Confirmed,Deaths,Recovered,Eradicated,Active
0,Australia,3297311,118101.0,809.0,19703.0,20512.0,97589.0
1,New Zealand,380113,19356.0,36.0,4358.0,4394.0,14962.0
2,Fiji,291193,229.0,0.0,0.0,0.0,229.0
3,Papua New Guinea,285890,35.0,0.0,0.0,0.0,35.0
4,Guam,41136,6.0,0.0,0.0,0.0,6.0


In [5]:
import plotly.graph_objects as go

def graph(data, name):
    fig = go.Figure()

    fig.add_trace(
        go.Bar(
            x=data['Date'],     
            y=data["Confirmed"],
            name="Confirmed",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=data['Date'],     
            y=data['Exponential'],
            name="Exponential",
        )
    )


    fig.add_trace(
        go.Scatter(
            x=data['Date'],     
            y=data['Polynomial'],
            name="Polynomial",
        )
    )

    fig.update_layout(
        title="COVID19 in {}".format(name),
        title_x = 0.5,
        xaxis_title="Date",
        # xaxis= {
        #     'tickformat': '%b',
        #     # 'tickvals': pd.date_range('2020-1', '2020-4', freq='MS')
        # },
        yaxis_title="Number of Confirmed Cases",
        font={
            "family":"Courier New, monospace",
            # size=18,
            # color="#7f7f7f"
        },
        legend={
            "x": 0,
            "y": 1
        },
        shapes=[
            # 1st highlight during Feb 4 - Feb 6
            dict(
                type="rect",
                # x-reference is assigned to the x-values
                xref="x",
                # y-reference is assigned to the plot paper [0,1]
                yref="paper",
                x0="04/01/2020",
                y0=0,
                x1=max(data["Date"]),
                y1=1,
                fillcolor="LightSalmon",
                opacity=0.5,
                layer="below",
                line_width=0,
            )
        ],
    )

    fig.show()

In [6]:

# # country = check
# # country = country.loc[(country['Confirmed'] > 1000)]

# x = country.loc[(country['Date'] < "04/01/2020")]["Days"].values
# y = country.loc[(country['Date'] < "04/01/2020")]["Confirmed"].values



# fig = go.Figure()

# fig.add_trace(
#     go.Bar(
#         x=country['Date'],     
#         y=country["Confirmed"],
#         name="Confirmed",
#     )
# )

# fig.add_trace(
#     go.Scatter(
#         x=country['Date'],     
#         y=y_pred,
#         name="Polynomial Model",
#     )
# )

# fig.update_layout(
#     title="COVID19 in Mainland China",
#     title_x = 0.5,
#     xaxis_title="Date",
#     # xaxis= {
#     #     'tickformat': '%b',
#     #     # 'tickvals': pd.date_range('2020-1', '2020-4', freq='MS')
#     # },
#     yaxis_title="Number of Confirmed Cases",
#     font={
#         "family":"Courier New, monospace",
#         # size=18,
#         # color="#7f7f7f"
#     },
#     legend={
#         "x": 0,
#         "y": 1
#     },
# )

# fig.show()

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from scipy.optimize import curve_fit

def exponential_model(country):
    def exponential_model(x,a,b,c):
        return a*np.exp(b*(x-c))

    x = country.loc[(country['Date'] < "04/01/2020")]["Days"].values
    y = country.loc[(country['Date'] < "04/01/2020")]["Confirmed"].values

    x2 = country.loc[(country['Date'] >= "04/01/2020")]["Days"]
    y2 = country.loc[(country['Date'] >= "04/01/2020")]["Confirmed"].values


    exp_fit = curve_fit(exponential_model,x,y,p0=[0,0,0])

    y_test_pred = exponential_model(x2, *exp_fit[0])

    country["Exponential"] = exponential_model(country['Days'].values, *exp_fit[0])
    score = r2_score(y2, y_test_pred)

    return country, score

def polynomial(x_train, x_test, y_train, y_test, deg):
    poly_reg = PolynomialFeatures(degree=deg)
    x_poly = poly_reg.fit_transform(x_train)

    lr = LinearRegression()
    lr.fit(x_poly, y_train)

    x_poly_test = poly_reg.fit_transform(x_test)
    y_pred = lr.predict(x_poly_test)
    poly_mse = mean_squared_error(y_test, y_pred)
    poly_rmse = np.sqrt(poly_mse)

    return lr, poly_mse, poly_rmse

def optimize_lr(x_train, x_test, y_train, y_test): 
    rmses = []
    degrees = np.arange(1, 10)
    min_rmse, min_deg = 1e10, 0

    for deg in degrees: 
        lr, poly_mse, poly_rmse = polynomial(x_train, x_test, y_train, y_test, deg)

        # Cross-validation of degree
        if min_rmse > poly_rmse:
            min_rmse = poly_rmse
            min_deg = deg
    
    return lr, min_deg, min_rmse

def polynomial_model(country):

    x = country.loc[(country['Date'] < "04/01/2020")]["Days"].values
    y = country.loc[(country['Date'] < "04/01/2020")]["Confirmed"].values

    x2 = country.loc[(country['Date'] >= "04/01/2020")]["Days"].values.reshape(-1,1)
    y2 = country.loc[(country['Date'] >= "04/01/2020")]["Confirmed"].values.reshape(-1,1)

    x_train, x_test, y_train, y_test = train_test_split(x.reshape(-1, 1), y.reshape(-1,1), test_size=0.30, random_state=42)

    lr, min_deg, min_rmse = optimize_lr(x_train, x_test, y_train, y_test)

    poly_reg = PolynomialFeatures(degree=min_deg)
    # x_poly = poly_reg.fit_transform(x.reshape(-1, 1))

    # lr = LinearRegression()
    # lr.fit(x_poly, y.reshape(-1,1))

    
    #To retrieve the intercept:
    # print(lr.intercept_)
    #For retrieving the slope:
    # print(lr.coef_)
    print(type(lr))
    # print(x2)

    y_test_pred = lr.predict(poly_reg.fit_transform(x2))


    country["Polynomial"] = lr.predict(poly_reg.fit_transform(country["Days"].values.reshape(-1,1))).flatten()


    # y_pred_future = lr.predict(poly_reg.fit_transform(check["Days"].values.reshape(-1,1))).flatten()

    # lr.score(y_pred)
    score = r2_score(y2, y_test_pred)

    return country, score



In [8]:
# name = "Australia"

# country = df[df['Continent'] == name].groupby("Date")[["Date", "Confirmed"]].sum().reset_index()
# country["Days"] = country.index
# country = country.loc[(country['Confirmed'] > 0.01*country['Confirmed'].max())]

# country, score1 = polynomial_model(country)
# country, score2 = exponential_model(country)

# graph(country, name)

<class 'sklearn.linear_model._base.LinearRegression'>


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 10 is different from 6)