#### Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
sns.set()

#### Preprocessing

In [None]:
# Build the modelling frame for Finland: one row per reported date with the
# columns Stringency_index, New_cases and New_deaths.

# Covid stringency index: keep only Finland's rows and the date/value columns.
stringency = pd.read_csv('covid-stringency-index.csv')
fin_strin = stringency[stringency['Entity'] == 'Finland']
fin_strin = fin_strin.drop(['Entity', 'Code'], axis=1)
fin_strin['Day'] = pd.to_datetime(fin_strin['Day'], format='%Y-%m-%d')
# min()/max() instead of first/last row, so the covered date range does not
# depend on the CSV being sorted by date.
start = fin_strin['Day'].min()
end = fin_strin['Day'].max()
print(end)

# WHO covid data: keep only Finland's rows and drop identifier/cumulative
# columns.
global_covid = pd.read_csv('WHO-COVID-19-global-data.csv')
fin_covid = global_covid[global_covid['Country'] == 'Finland']
fin_covid = fin_covid.drop(
    ['Country_code', 'Country', 'WHO_region', 'Cumulative_cases', 'Cumulative_deaths'], axis=1)
fin_covid['Date_reported'] = pd.to_datetime(
    fin_covid['Date_reported'], format='%Y-%m-%d')

# Restrict the WHO rows to the date range covered by the stringency data.
mask = (fin_covid['Date_reported'] >= start) & (
    fin_covid['Date_reported'] <= end)
fin_covid = fin_covid.loc[mask]

# Attach the stringency value to each WHO row by matching on the date rather
# than by row position: positional insertion requires both sources to have
# exactly the same number of rows in the same order, and otherwise either
# raises or silently pairs values with the wrong day.
# (A duplicated 'Day' makes the lookup index non-unique and .map() raises.)
strin_by_day = fin_strin.set_index('Day')['stringency_index']
strin_values = fin_covid['Date_reported'].map(strin_by_day)
# Stringency_index becomes the first column of the frame.
fin_covid.insert(0, 'Stringency_index', strin_values)
# Dates without a stringency value cannot be used as data points.
fin_covid = fin_covid.dropna(subset=['Stringency_index'])

# Final data points: the date was only needed for alignment.
fin_covid = fin_covid.drop(['Date_reported'], axis=1)

print("\n", fin_covid.describe())
fin_covid.tail()


### Plotting data

In [None]:

# Scatter plot (figure number 2): New_deaths against New_cases.
fig1 = plt.figure(2, figsize=(10, 10))
cases_ax = fig1.gca()
cases_ax.set_xlabel('New_cases')
cases_ax.set_ylabel('New_deaths')
cases_ax.plot(
    fin_covid['New_cases'], fin_covid['New_deaths'], 'o', color='green')
plt.show()

# Scatter plot (figure number 3): New_deaths against Stringency_index.
fig3 = plt.figure(3, figsize=(10, 10))
strin_ax = fig3.gca()
strin_ax.set_xlabel('Stringency_index')
strin_ax.set_ylabel('New_deaths')
strin_ax.plot(
    fin_covid['Stringency_index'], fin_covid['New_deaths'], 'o', color='purple')
plt.show()


In [None]:
# Pairwise relationships between every column of the combined frame.
sns.pairplot(data=fin_covid)


### Model and model validation

In [None]:
# Feature matrix (X) and label column (y) taken from the combined frame.
features = ['Stringency_index', 'New_cases']
label = ['New_deaths']
# Selecting with a list of columns already yields a 2-D array of shape
# (n_rows, len(features)), so no explicit reshape is required.
X = fin_covid[features].to_numpy()
y = fin_covid[label].to_numpy()

# Hold out 33% of the rows for validation; the seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=42)

# FIXME: plain linear model, will be removed later.
Ln = LinearRegression().fit(X_train, y_train)

# Predictions on both subsets, then the mean squared error of each.
y_pred_train = Ln.predict(X_train)
y_pred_val = Ln.predict(X_val)
tr_error = mean_squared_error(y_train, y_pred_train)
val_error = mean_squared_error(y_val, y_pred_val)
print(f'Training error: {tr_error} Validation error: {val_error}')


In [None]:
# Fit a polynomial regression for every degree from 1 to 10 and record the
# training and validation mean squared error of each fit.
degrees = list(range(1, 11))
tr_errors = []
val_errors = []
for degree in degrees:
    # The intercept is switched off on the linear model; presumably because
    # PolynomialFeatures already adds a constant (bias) column by default.
    Ln = LinearRegression(fit_intercept=False)
    poly = PolynomialFeatures(degree=degree)

    # Learn the feature expansion on the training data and fit the model.
    X_train_poly = poly.fit_transform(X_train)
    Ln.fit(X_train_poly, y_train)
    y_pred_train_poly = Ln.predict(X_train_poly)
    tr_error = mean_squared_error(y_train, y_pred_train_poly)
    tr_errors.append(tr_error)

    # Re-use the already fitted expansion for the validation data.
    X_val_poly = poly.transform(X_val)
    y_pred_val_poly = Ln.predict(X_val_poly)
    val_error_poly = mean_squared_error(y_val, y_pred_val_poly)
    val_errors.append(val_error_poly)

    print(f'Degree: {degree}\n\ttr_error: {tr_error}, val_error: {val_error_poly}')

# Error curves against the polynomial degree; the x-axis is labelled and
# ticked at each integer degree so the curves can be read without guessing.
fig3, axs = plt.subplots()
axs.plot(degrees, tr_errors,  color='red', label='training error')
axs.plot(degrees, val_errors, color='blue', label='validation error')
axs.set_xlabel('Polynomial degree')
axs.set_xticks(degrees)
axs.set_ylabel('Mean Squared Error (MSE)')
axs.legend()