# Time series predictor

In [1]:
import numpy as np
import time
from datetime import date
import datetime
from datetime import timedelta  
import csv
import holidays # for importing the public holidays
import re
import torch

from src.utils import *
from src.data_miner import DataMiner
from src.models import MLP
from src.dataset import GoodNightDataset

## Parameters

In [2]:
num_features = 5
min_hour = 21 # Minimum hour for sleep detection
max_hour = 5 # Maximum hour for sleep detection
train_window = 3 # Sequence length of past days
local_holidays = holidays.Italy(prov='BO') # Get the holidays in Bologna, Italy :)
EPOCHS = 500
batch_size = 16
# Directories
data_dir = "data"
data_file = "data/LastSeenDataset.csv"

- Feature extraction: we first extract the features given the time series data of Telegram accesses.
- Supposition: last Telegram access in very similar to the time the person goes to sleep

## Open Data File

In [3]:
with open(data_file, newline='') as csvfile:
    date_list = list(csv.reader(csvfile))

date_list = convert_to_dates(date_list)

'''Test data: search calendar for local holidays'''
print("First day is holiday: ", date_list[0][0] in local_holidays)

First day is holiday:  False


## Feature engineering
Possible features to extract: 
1. Last seen time (arguably the most important)
2. Wake up time
3. Number of Telegram accesses during the previous day
4. Day of the week
5. Public holiday presence in the following day (using the holidays library)
6. (time spent on Telegram)


In [4]:
data_tensor =  DataMiner(date_list).to_tensor(verbose=False)
# print(data_tensor)
n_features = num_features # this is number of parallel inputs
n_timesteps = train_window # this is number of timesteps


## Model
Since we want to predict simple time series data, we can employ:
- MPL: Multi Layer Perceptron, simple deep neural network with hidden layer
- RNN: Recurrent Neural Network, more suitable for time series
- LSTM: Long Short Term Memory, an advancement of RNN
- Transformer: currently (2020) state of the art, but complex and possibly overpower
- ... Other

In [5]:
model = MLP(n_features*n_timesteps, 1)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## Build the Dataset

In [6]:
dataset = GoodNightDataset(data_file, n_timesteps)

## Data Augmentation

Given that the training data is not much, we can insert some noise to augment it; this will also make the model less prone to overfitting

In [7]:
dataset.noisy() # apply gaussian noise

## Training the Model

In [8]:
trainloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size, shuffle=True,
                                         num_workers=0)               
model.train()

losses = []

# Training loop
for t in range(EPOCHS):
    X, y = next(iter(trainloader))
    optimizer.zero_grad()
    prediction = model.forward(X.reshape(batch_size, n_features*n_timesteps))
    loss = criterion(prediction, y)  
    loss.backward()
    optimizer.step()        
    losses.append(loss.item())
    if t%10 == 0 and t >= 10:
        print(('Epoch: {:4}  | Total mean loss: {:.6f} ').format(t, mean(losses[t-10:t])))

Epoch:   10  | Total mean loss: 0.246021 
Epoch:   20  | Total mean loss: 0.067236 
Epoch:   30  | Total mean loss: 0.046674 
Epoch:   40  | Total mean loss: 0.042729 
Epoch:   50  | Total mean loss: 0.030918 
Epoch:   60  | Total mean loss: 0.031948 
Epoch:   70  | Total mean loss: 0.031738 
Epoch:   80  | Total mean loss: 0.022974 
Epoch:   90  | Total mean loss: 0.029274 
Epoch:  100  | Total mean loss: 0.026619 
Epoch:  110  | Total mean loss: 0.022413 
Epoch:  120  | Total mean loss: 0.021658 
Epoch:  130  | Total mean loss: 0.023719 
Epoch:  140  | Total mean loss: 0.022066 
Epoch:  150  | Total mean loss: 0.021249 
Epoch:  160  | Total mean loss: 0.021422 
Epoch:  170  | Total mean loss: 0.019611 
Epoch:  180  | Total mean loss: 0.019892 
Epoch:  190  | Total mean loss: 0.018941 
Epoch:  200  | Total mean loss: 0.017903 
Epoch:  210  | Total mean loss: 0.021144 
Epoch:  220  | Total mean loss: 0.017884 
Epoch:  230  | Total mean loss: 0.017367 
Epoch:  240  | Total mean loss: 0.

### Model evaluation on training data
We use the trained model to predict the same data as before, this time with no noise.
Notice that we are going to overfit if we train for too long
Potential fixes:
- Use validation loss
- Use higher noise value
- Use different noise generator
- Find another way to augment the data
- Collect more data

In [9]:
with torch.no_grad():
    for i in range(len(dataset)):
        X, y = dataset[i]
        prediction = model.forward(X.reshape(1,15)).item()
        real = y.T.item()
        print('Predicted: {:.4f} | Real: {:.4f}'.format(prediction, real))

Predicted: 0.5508 | Real: 0.5033
Predicted: 0.5828 | Real: 0.4888
Predicted: 0.5032 | Real: 0.5380
Predicted: 0.4785 | Real: 0.5200
Predicted: 0.5570 | Real: 0.6680
Predicted: 0.4035 | Real: 0.1981
Predicted: 0.5543 | Real: 0.5418
Predicted: 0.5749 | Real: 0.5891
Predicted: 0.5290 | Real: 0.5230
Predicted: 0.5929 | Real: 0.5878
Predicted: 0.3872 | Real: 0.2870
Predicted: 0.2527 | Real: 0.1545
Predicted: 0.3441 | Real: 0.3483
Predicted: 0.1001 | Real: 0.1007
Predicted: 0.6871 | Real: 0.6694
Predicted: 0.5279 | Real: 0.5091
Predicted: 0.5201 | Real: 0.4906
Predicted: 0.6158 | Real: 0.6093
Predicted: 0.6287 | Real: 0.6412
Predicted: 0.6608 | Real: 0.8530
Predicted: 0.3939 | Real: 0.3883
Predicted: 0.5692 | Real: 0.5664
Predicted: 0.7738 | Real: 0.7656


## Saving the time

We save the predicted time to send the message in a file, so that the Daemon can handle it

In [11]:
now = datetime.datetime.now()
seq_length = 3
with open(data_file, newline='') as csvfile:
    date_list = list(csv.reader(csvfile))
date_list = convert_to_dates(date_list)
data_tensor =  DataMiner(date_list).to_tensor(verbose=False)
X, y = create_sequences(data_tensor, seq_length)
x = get_latest_sequence(data_tensor, seq_length)

with torch.no_grad():
    p = model.forward(x.reshape(1,15)).item()
print(p)
p_sec = int(p*(max_hour+24-min_hour)*3600)
prediction = now.replace(hour=min_hour, minute=0, second=0) + timedelta(seconds=p_sec)
print('Expected time to go to sleep: ', prediction.strftime("%Y-%m-%d %H:%M:%S"))


'''Write the value on a text file to be read by the Daemon'''
with open ('prediction.txt','w') as z:
    z.write(prediction.strftime("%Y-%m-%d %H:%M:%S\n"))
z.close()

with open ('data/prediction_list.txt','a') as z:
    z.write(prediction.strftime("%Y-%m-%d %H:%M:%S\n"))
z.close()

0.5492951273918152
Expected time to go to sleep:  2020-12-28 01:23:39
