In [None]:
import pandas as pd
from util import DbHelper
from util.defaults import db2covid as db_options

# Create the DbHelper instance used to run the database queries in this
# notebook; it is configured with the `db2covid` defaults imported above.
db = DbHelper(db_options)

In [None]:
# First and last dates present in COVID19_OPEN_DATA, kept as strings.
# Each aggregate query result is flattened to nested lists and its single
# cell (row 0, column 0) is taken.
min_date_rows = db.execute("SELECT MIN(date) FROM COVID19_OPEN_DATA").values.tolist()
min_date = str(min_date_rows[0][0])

max_date_rows = db.execute("SELECT MAX(date) FROM COVID19_OPEN_DATA").values.tolist()
max_date = str(max_date_rows[0][0])

In [None]:
# Options
sr1_code = 'JI'  # value matched against SUBREGION1_CODE in the queries below
ndays = 7        # number of days passed to the lead-cumulative-confirmed query
# Column label given to the lead series, e.g. 'LEAD_7DAYS' for ndays = 7.
lcc_colname = 'LEAD_%dDAYS' % ndays

In [None]:
# Daily DatetimeIndex from min_date to max_date (both inclusive); it is
# assigned as the row index of the query results in the cells below.
date_index = pd.date_range(start=min_date, end=max_date)

In [None]:
# Cumulative confirmed cases for the selected subregion.
# NOTE(review): the query is built by string interpolation. That is fine for
# the hard-coded sr1_code option, but use bound parameters if the value ever
# comes from user input.
sql = "SELECT DATE, CUMULATIVE_CONFIRMED FROM COVID19_CUMULATIVE_DATA WHERE SUBREGION1_CODE='%s'" % (sr1_code)

cum_confirmed = db.execute(sql)
# Rows are aligned to date_index purely by position.
# NOTE(review): assumes exactly one row per day of date_index, returned in
# date order (the query has no ORDER BY) -- confirm. A length mismatch raises
# ValueError here.
cum_confirmed.index = date_index

In [None]:
# Lead series fetched through DbHelper for the full date range, the selected
# subregion and ndays.
# NOTE(review): presumably the cumulative confirmed count ndays ahead of each
# date -- confirm against DbHelper.lead_cum_confirmed_date_range.
lead_cum_confirmed = db.lead_cum_confirmed_date_range(min_date, max_date, sr1_code, ndays)
# Positional alignment to date_index; requires one row per day in the range.
lead_cum_confirmed.index = date_index
# Relabel the result's column; this assignment requires exactly one column.
lead_cum_confirmed.columns = [lcc_colname]

In [None]:
# Place the current and lead series side by side on their shared date index.
confirmed = pd.concat([cum_confirmed, lead_cum_confirmed], axis=1)

# Quick visual check of the combined frame.
confirmed.plot()

In [None]:
# Query for the retail-and-recreation mobility column of the selected
# subregion. Adjacent string literals are joined by the parser, so the final
# text is one single-line SELECT statement with sr1_code substituted in.
sql = (
    "SELECT "
    "MOBILITY_RETAIL_AND_RECREATION "
    "FROM COVID19_DAILY_MOBILITY_CHANGES "
    "WHERE SUBREGION1_CODE='%s'"
) % sr1_code

In [None]:
# Run the mobility query built in the previous cell.
mobility = db.execute(sql)
# Positional alignment to date_index.
# NOTE(review): assumes one row per day in date order (no ORDER BY in the
# query) -- confirm.
mobility.index = date_index

In [None]:
# Combine the confirmed-case columns with the mobility column on the date index.
confirmed_vs_mobility = pd.concat([confirmed, mobility], axis=1)

# Scatter of mobility (x) against the lead confirmed series (y) to eyeball
# the relationship the regression below tries to fit.
confirmed_vs_mobility.plot.scatter(x='MOBILITY_RETAIL_AND_RECREATION', y=lcc_colname)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Feature: retail-and-recreation mobility. Target: the lead confirmed series.
covid_X = confirmed_vs_mobility.MOBILITY_RETAIL_AND_RECREATION
covid_y = confirmed_vs_mobility[lcc_colname]

# Convert to NumPy arrays; the feature is reshaped to a single-column 2-D
# array, the layout scikit-learn estimators take for X.
covid_X = np.array(covid_X.values.tolist()).reshape(-1,1)
covid_y = np.array(covid_y.values.tolist())

# Feature values of the last 14 rows, set aside before they are dropped below.
covid_X_unknown = covid_X[-14:]

# Drop the last 14 rows from both arrays.
# NOTE(review): 14 is hard-coded while ndays is 7 -- confirm the intended
# relationship between the two.
covid_X = covid_X[:-14] # There are null values
covid_y = covid_y[:-14] # There are null values

# Earlier manual split (last 50 samples as the test set), left disabled:
#covid_X_train = covid_X
#covid_X_test = covid_X[-50:]

#covid_y_train = covid_y
#covid_y_test = covid_y[-50:]

# Random split with 40% of the samples held out for testing.
# NOTE(review): no random_state is passed, so the split (and every metric
# derived from it) changes on each run.
covid_X_train, covid_X_test, covid_y_train, covid_y_test = train_test_split(covid_X, covid_y, test_size=0.4)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
regr = LinearRegression()

In [None]:
# Fit on the training split only; the test split is kept for evaluation.
regr.fit(covid_X_train, covid_y_train)

In [None]:
# Fitted coefficient(s); with the single mobility feature this is one slope.
regr.coef_

In [None]:
# Predictions for the held-out test features, scored in the following cells.
covid_y_pred = regr.predict(covid_X_test)

In [None]:
# Report the fitted coefficients first, then the two test-set metrics.
print('Coefficients: \n', regr.coef_)

# Mean squared error between the test targets and the predictions.
test_mse = mean_squared_error(covid_y_test, covid_y_pred)
print('Mean squared error: %.2f' % test_mse)

# Coefficient of determination (R^2); 1 corresponds to a perfect prediction.
test_r2 = r2_score(covid_y_test, covid_y_pred)
print('Coefficient of determination: %.2f' % test_r2)

In [None]:
# Plot outputs: the actual test points in black and the model's predictions
# for the same x values as a blue line.
plt.scatter(covid_X_test, covid_y_test,  color='black')
plt.plot(covid_X_test, covid_y_pred, color='blue', linewidth=3)

plt.show()

In [None]:
# Spot check: prediction for a single sample whose mobility feature equals 2.
regr.predict(np.array([[2]]))

In [None]:
from joblib import dump
import math
# Persist the fitted model. The file name encodes the subregion code, ndays
# and the test R^2 expressed as a percentage rounded up to an integer.
r2_percent = math.ceil((r2_score(covid_y_test, covid_y_pred)) * 100)
model_path = 'model/regr-model_%s-%d_%d.joblib' % (sr1_code, ndays, r2_percent)
dump(regr, model_path)

In [None]:
def get_model_above_threshold(X, y, r2_threshold, test_size=None, max_attempts=None):
    """Refit a linear regression on fresh random splits until its test R^2 is high enough.

    Each attempt draws a new random train/test split of ``X`` and ``y``, fits
    a new ``LinearRegression`` on the training part and scores that same model
    on the test part. Coefficients, mean squared error and R^2 of every
    attempt are printed.

    Note that accepting a split because its test score is high makes the
    reported R^2 an optimistic estimate of real-world performance.

    Args:
        X: Feature matrix accepted by ``train_test_split`` / ``LinearRegression``.
        y: Target values aligned with ``X``.
        r2_threshold: An attempt whose test R^2 is below this value is
            rejected and another split is tried.
        test_size: Forwarded unchanged to ``train_test_split``.
        max_attempts: Optional upper bound on the number of splits to try.
            ``None`` (the default) retries without limit, which never
            terminates if the threshold cannot be reached.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, model)`` for the first
        accepted attempt, where ``model`` is the regression fitted on
        ``X_train`` / ``y_train``.

    Raises:
        RuntimeError: If ``max_attempts`` is given and no attempt is accepted
            within that many splits.
    """
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1

        covid_X_train, covid_X_test, covid_y_train, covid_y_test = train_test_split(
            X, y, test_size=test_size)

        _regr = LinearRegression()
        _regr.fit(covid_X_train, covid_y_train)

        # Evaluate the model fitted in THIS attempt. Using the notebook-level
        # `regr` here would score a different model than the one returned.
        covid_y_pred = _regr.predict(covid_X_test)

        _r2_score = r2_score(covid_y_test, covid_y_pred)

        # The coefficients
        print('Coefficients: \n', _regr.coef_)
        # The mean squared error
        print('Mean squared error: %.2f'
              % mean_squared_error(covid_y_test, covid_y_pred))
        # The coefficient of determination: 1 is perfect prediction
        print('Coefficient of determination: %.2f'
              % _r2_score)

        if _r2_score < r2_threshold:
            continue

        return covid_X_train, covid_X_test, covid_y_train, covid_y_test, _regr

    raise RuntimeError(
        'No split reached an R^2 of %s within %d attempts' % (r2_threshold, max_attempts))

In [None]:
# Keep only the fitted model; the four train/test arrays are discarded.
# 0.63 is the minimum accepted test R^2 and 0.4 the test fraction.
_,_,_,_,r = get_model_above_threshold(covid_X, covid_y, 0.63, 0.4)