In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
np.random.seed(1)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

### Statement of the problem:

CoverMyMeds Challenge Problem
At CoverMyMeds, time series analysis forms the backbone of the financial projections we make at the beginning of each fiscal year and helps us set realistic but challenging company goals.  Many of our products have multiple customers whose behaviors might (or might not) influence each other.  We are presenting daily volume (can be thought of as purchases) of one product for three customers. Questions that you should answer in this project are the following: 
* What trends are in the data at the daily, monthly, and yearly levels?
* What variables do and do not affect daily volume?
* How does the behavior of one customer affect the behavior of the other customers?
* How accurately can you predict what will happen in 2019 at the monthly and yearly levels?
* How accurate are your projections 1 month, 3 months, and 6 months after the  day they were forecasted from?


### Explanatory analysis

In [13]:
# needs to install pyarrow
df = pd.read_parquet('data/cmm_erdos_bootcamp_2020_timeseries.pq', engine='auto')
df.shape

(1095, 11)

Very small data size

In [None]:
df.head()

Check if there's missing value

In [None]:
df.isna().sum()

In [None]:
df[['volume_A','volume_B', 'volume_C']].describe()

Convert 1sf column to datetime type

In [14]:
df['date_val'] = pd.to_datetime(df['date_val'], yearfirst=True)
df.set_index(['date_val'], inplace = True)

In [None]:
dayly_effect = df.groupby('day_of_week')[['volume_A', 'volume_B', 'volume_C']].sum()

In [None]:
plt.plot(dayly_effect)
plt.show()

In [None]:
weekday_effect

In [None]:
weekday_effect = df.groupby('is_weekday')[['volume_A', 'volume_B', 'volume_C']].sum()
weekday_effect

In [None]:
df.groupby('is_holiday')[['volume_A', 'volume_B', 'volume_C']].sum()

In [None]:
df[['volume_A','volume_B', 'volume_C']].describe()

In [15]:
df_new = df.drop(columns = ['calendar_year', 'calendar_month', 'calendar_day'])

Inspect correlation

In [None]:
df_new.corr()

In [None]:
# scatter matrix makes a matrix of scatter plots
from pandas.plotting import scatter_matrix

scatter_matrix(df_new, figsize = (14,14), alpha = 1)

plt.show()

In [None]:
df_new['volume_C'].plot(figsize=(10, 5))
plt.show()

In [None]:
test_a = df_new['volume_A'].to_numpy()

In [None]:
# Import the SimpleExpSmoothing object
from statsmodels.tsa.api import SimpleExpSmoothing
ses = SimpleExpSmoothing(df_new['volume_A'])

In [None]:
fit = ses.fit(smoothing_level=0.1, optimized=False)

In [None]:
df_new.index

In [None]:
plt.figure(figsize=(8,6))

# plot the training data
plt.plot(df_new.index, df_new['volume_A'],'b',
            label = "Training Data")

# plot the fit
plt.plot(df_new.index, fit.fittedvalues,'r-',
            label = "Fitted Values")

### Modeling using tensorflow probability

In [50]:
from models.plot import plot_forecast
from models.models import STS_model
import matplotlib.dates as mdates

In [51]:
# needs to install pyarrow
df = pd.read_parquet('data/cmm_erdos_bootcamp_2020_timeseries.pq', engine='auto')
df['date_val'] = pd.to_datetime(df['date_val'], yearfirst=True)
df.set_index(['date_val'], inplace = True)
df_new = df.drop(columns = ['calendar_year', 'calendar_month', 'calendar_day'])
train_A, test_A = df_new['volume_A'].loc[:'2018-12-31'], df_new['volume_A'].loc['2019-1-1':]
train_A, test_A = train_A.to_numpy(dtype='float32'), test_A.to_numpy(dtype='float32')

In [52]:
model_A = STS_model(train_A)

In [53]:
model_A.build_model()

In [54]:
model_A.train()

TypeError: in user code:

    /Users/emp/Downloads/CoverMyMeds_TimeSeries/models/models.py:49 train  *
        self.elbo_loss_curve = tfp.vi.fit_surrogate_posterior(

    TypeError: 'NoneType' object is not callable


In [47]:
model_A.optimizer(learning_rate = 0.1)

TypeError: 'Adam' object is not callable