# Time series predictor

In [1]:
import numpy as np
import time
from datetime import date
import datetime
from datetime import timedelta  
import csv
import holidays # for importing the public holidays
import re
import torch
from src.utils import *
from src.data_miner import DataMiner

## Parameters

In [2]:
num_features = 5
min_hour = 21 # Minimum hour for sleep detection
max_hour = 5 # Maximum hour for sleep detection
train_window = 3 # Sequence length of past days
local_holidays = holidays.Italy(prov='BO') # Get the holidays in Bologna, Italy :)
EPOCHS = 500
batch_size = 16
# Directories
data_dir = "data"
dataset = "data/LastSeenDataset.csv"

- Feature extraction: we first extract the features given the time series data of Telegram accesses.
- Supposition: last Telegram access in very similar to the time the person goes to sleep

## Open Data File

In [3]:
with open(dataset, newline='') as csvfile:
    date_list = list(csv.reader(csvfile))

date_list = convert_to_dates(date_list)

'''Test data: search calendar for local holidays'''
print("First day is holiday: ", date_list[0][0] in local_holidays)

First day is holiday:  False


## Feature engineering
Possible features to extract: 
1. Last seen time (arguably the most important)
2. Wake up time
3. Number of Telegram accesses during the previous day
4. Day of the week
5. Public holiday presence in the following day (using the holidays library)
6. (time spent on Telegram)


In [4]:
data_tensor =  DataMiner(date_list).to_tensor(verbose=False)
# print(data_tensor)
n_features = num_features # this is number of parallel inputs
n_timesteps = train_window # this is number of timesteps


## Model
Since we want to predict simple time series data, we can employ:
- MPL: Multi Layer Perceptron, simple deep neural network with hidden layer
- RNN: Recurrent Neural Network, more suitable for time series
- LSTM: Long Short Term Memory, an advancement of RNN
- Transformer: currently (2020) state of the art, but complex and possibly overpower
- ... Other

In [5]:
from src.models import MLP

model = MLP(n_features*n_timesteps, 1)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [6]:
from torch.utils.data import Dataset

# Variables
data_dir = "data"
data_root = "data/LastSeenDataset.csv"

class GoodNightDataset(Dataset):
    def __init__(self, data_root, seq_length):
        self.seq_length = seq_length
        with open(data_root, newline='') as csvfile:
            date_list = list(csv.reader(csvfile))
        date_list = convert_to_dates(date_list)
        self.data =  DataMiner(date_list).to_tensor(verbose=False)
        self.n_features = self.data.shape[0]
        # the sequence on which we have a prediction is the last train_window days
        self.X, self.y = self.create_sequences()
        
    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def _apply_noise(self, x, scale, noise):
        """Crop noisy datum to be in [0,1]"""
        noisy = x + scale*noise(1)
        if min(noisy.item(), 1) == 1:
            noisy = torch.tensor(1)
        if max(noisy.item(), 0 ) == 0:
            noisy = torch.tensor(0)
        return noisy

    def create_sequences(self, data_type=torch.float32):
        '''We create a list of training data divided in inputs X and outputs y'''
        X = []
        y = []
        L = self.data.shape[1]
        tw = self.seq_length
        for i in range(L-tw):
            train_seq = torch.zeros(self.n_features, tw)
            for j in range(n_features):
                train_seq[j]= self.data[j][i:i+tw]
            train_label = self.data[0][i+tw:i+tw+1] 
            X.append(train_seq)
            y.append(train_label)
        return torch.transpose(torch.stack(X), 2, 1).type(data_type), torch.stack(y).type(data_type)

    def get_latest_sequence(self, data_type=torch.float32):
        '''Get latest sequence for making prediction'''
        X = []
        tw = self.seq_length
        idx = self.data.shape[1] # index of the last element
        seq = torch.zeros(self.n_features, tw)
        for j in range(self.n_features):
            seq[j]= self.data[j][idx-tw:idx]
        X.append(seq)
        return torch.transpose(
            torch.stack(X), 2, 1).type(data_type)


    def noisy(self, scale=0.05, noise=torch.randn):
        """Build batch with noise"""
        for i in range(self.X.shape[0]):
            #y[i] = self._apply_noise(y[i], scale, noise)
            for j in range(self.X.shape[1]):
                for k in [0, 1, 4]: # don't put noise on day of the week and festive day presence
                    self.X[i, j, k] = self._apply_noise(self.X[i, j, k], scale, noise)         
        return


## Build the Dataset

In [7]:
dataset = GoodNightDataset(data_root, n_timesteps)

## Data Augmentation

Given that the training data is not much, we can insert some noise to augment it; this will also make the model less prone to overfitting

In [8]:
dataset.noisy() # apply gaussian noise

## Training the Model

In [9]:
trainloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size, shuffle=True,
                                         num_workers=0)               
model.train()

losses = []

# Training loop
for t in range(EPOCHS):
    X, y = next(iter(trainloader))
    optimizer.zero_grad()
    prediction = model.forward(X.reshape(batch_size, n_features*n_timesteps))
    loss = criterion(prediction, y)  
    loss.backward()
    optimizer.step()        
    losses.append(loss.item())
    if t%10 == 0 and t >= 10:
        print(('Epoch: {:4}  | Total mean loss: {:.6f} ').format(t, mean(losses[t-10:t])))

Epoch:   10  | Total mean loss: 0.145319 
Epoch:   20  | Total mean loss: 0.045858 
Epoch:   30  | Total mean loss: 0.053011 
Epoch:   40  | Total mean loss: 0.035244 
Epoch:   50  | Total mean loss: 0.028595 
Epoch:   60  | Total mean loss: 0.028810 
Epoch:   70  | Total mean loss: 0.025607 
Epoch:   80  | Total mean loss: 0.021317 
Epoch:   90  | Total mean loss: 0.017327 
Epoch:  100  | Total mean loss: 0.020425 
Epoch:  110  | Total mean loss: 0.018569 
Epoch:  120  | Total mean loss: 0.019713 
Epoch:  130  | Total mean loss: 0.014891 
Epoch:  140  | Total mean loss: 0.015498 
Epoch:  150  | Total mean loss: 0.016044 
Epoch:  160  | Total mean loss: 0.015179 
Epoch:  170  | Total mean loss: 0.013356 
Epoch:  180  | Total mean loss: 0.013897 
Epoch:  190  | Total mean loss: 0.012356 
Epoch:  200  | Total mean loss: 0.010361 
Epoch:  210  | Total mean loss: 0.010669 
Epoch:  220  | Total mean loss: 0.009967 
Epoch:  230  | Total mean loss: 0.007742 
Epoch:  240  | Total mean loss: 0.

### Model evaluation on training data
We use the trained model to predict the same data as before, this time with no noise.
Notice that we are going to overfit if we train for too long
Potential fixes:
- Use validation loss
- Use higher noise value
- Use different noise generator
- Find another way to augment the data
- Collect more data

In [10]:
with torch.no_grad():
    for i in range(len(dataset)):
        X, y = dataset[i]
        prediction = model.forward(X.reshape(1,15)).item()
        real = y.T.item()
        print('Predicted: {:.4f} | Real: {:.4f}'.format(prediction, real))

Predicted: 0.5093 | Real: 0.5033
Predicted: 0.4884 | Real: 0.4888
Predicted: 0.5462 | Real: 0.5380
Predicted: 0.5114 | Real: 0.5200
Predicted: 0.6595 | Real: 0.6680
Predicted: 0.2033 | Real: 0.1981
Predicted: 0.5413 | Real: 0.5418
Predicted: 0.5869 | Real: 0.5891
Predicted: 0.5179 | Real: 0.5230
Predicted: 0.5877 | Real: 0.5878
Predicted: 0.2969 | Real: 0.2870
Predicted: 0.1509 | Real: 0.1545
Predicted: 0.3465 | Real: 0.3483
Predicted: 0.1009 | Real: 0.1007
Predicted: 0.6687 | Real: 0.6694
Predicted: 0.5124 | Real: 0.5091
Predicted: 0.4955 | Real: 0.4906
Predicted: 0.6114 | Real: 0.6093
Predicted: 0.6302 | Real: 0.6412
Predicted: 0.8566 | Real: 0.8530
Predicted: 0.3897 | Real: 0.3883
Predicted: 0.5587 | Real: 0.5664
Predicted: 0.7652 | Real: 0.7656


## Saving the time

We save the predicted time to send the message in a file, so that the Daemon can handle it

In [11]:
now = datetime.datetime.now()
seq_length = 3
with open(data_root, newline='') as csvfile:
    date_list = list(csv.reader(csvfile))
date_list = convert_to_dates(date_list)
data_tensor =  DataMiner(date_list).to_tensor(verbose=False)
X, y = create_sequences(data_tensor, seq_length)
x = get_latest_sequence(data_tensor, seq_length)

with torch.no_grad():
    p = model.forward(x.reshape(1,15)).item()
print(p)
p_sec = int(p*(max_hour+24-min_hour)*3600)
prediction = now.replace(hour=min_hour, minute=0, second=0) + timedelta(seconds=p_sec)
print('Expected time to go to sleep: ', prediction.strftime("%Y-%m-%d %H:%M:%S"))


'''Write the value on a text file to be read by the Daemon'''
with open ('prediction.txt','w') as z:
    z.write(prediction.strftime("%Y-%m-%d %H:%M:%S\n"))
z.close()

with open ('data/prediction_list.txt','a') as z:
    z.write(prediction.strftime("%Y-%m-%d %H:%M:%S\n"))
z.close()

0.6732186675071716
Expected time to go to sleep:  2020-12-28 02:23:08
