# Prediction of the flow (24 h)

### Decision tree regression
Predicting next 24 hours on the previous predicted value

### Importing necessary libraries

In [204]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeRegressor
import datetime as dt
import numpy as np

### Loading the data set

In [205]:
flow_set = pd.read_csv('../dataset/flow1.csv', parse_dates={'hour':['time']},index_col='hour')
dayfirst=True
flow_set = flow_set.resample('60T').mean()

In [206]:
flow_set = flow_set.reset_index()

In [207]:
flow_set['time'] = flow_set['hour']

In [208]:
flow_set['hour'] = (flow_set['hour'].dt.hour)

In [209]:
flow_set.head()

Unnamed: 0,hour,flow,time
0,14,113.754443,2015-06-01 14:00:00
1,15,113.013333,2015-06-01 15:00:00
2,16,114.081666,2015-06-01 16:00:00
3,17,116.459167,2015-06-01 17:00:00
4,18,123.825,2015-06-01 18:00:00


### Adding the feature of previous hour's flow value

In [210]:
flow_set['previous-hour'] = flow_set['flow'].shift(1)

In [211]:
flow_set.head()

Unnamed: 0,hour,flow,time,previous-hour
0,14,113.754443,2015-06-01 14:00:00,
1,15,113.013333,2015-06-01 15:00:00,113.754443
2,16,114.081666,2015-06-01 16:00:00,113.013333
3,17,116.459167,2015-06-01 17:00:00,114.081666
4,18,123.825,2015-06-01 18:00:00,116.459167


### Dealing with missing values by filling with the mean

In [212]:
flow_set = flow_set.fillna(flow_set.mean())

### Splitting the data set into train and test sets

In [213]:
split_date = pd.datetime(2016,12,31,23,59)

flow_training = flow_set.loc[flow_set['time'] <= split_date]
flow_test = flow_set.loc[flow_set['time'] > split_date]

### Creating training and test variables

In [214]:
X_flow_training = flow_training[['hour','previous-hour']]
X_flow_test = flow_test[['hour','previous-hour','time']]
Y_flow_training = flow_training['flow']
Y_flow_test = flow_test['flow']

### Creating a multiple regression model with decision tree

In [215]:
regressor = DecisionTreeRegressor()
regressor.fit(X_flow_training, Y_flow_training)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### Predicting next 24 hours on a previous predicted value

In [216]:
X_flow_test['actual-value'] = Y_flow_test

In [217]:
X_flow_test['24-hour'] = X_flow_test['actual-value']

In [218]:
X_flow_test.head()

Unnamed: 0,hour,previous-hour,time,actual-value,24-hour
13906,0,99.808334,2017-01-01 00:00:00,94.330833,94.330833
13907,1,94.330833,2017-01-01 01:00:00,91.927501,91.927501
13908,2,91.927501,2017-01-01 02:00:00,90.722501,90.722501
13909,3,90.722501,2017-01-01 03:00:00,87.402501,87.402501
13910,4,87.402501,2017-01-01 04:00:00,81.485833,81.485833


### Leaving the 24-hour column with value only for hour 0

In [237]:
X_flow_test['24-hour'].loc[X_flow_test['hour'] != 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### Creating the check column with True where hour is 0

In [226]:
check = (X_flow_test['hour'] == 0)
check.head()
X_flow_test['check'] = check
X_flow_test.head()

Unnamed: 0,hour,previous-hour,time,actual-value,24-hour,check
13906,0,99.808334,2017-01-01 00:00:00,94.330833,94.330833,True
13907,1,94.330833,2017-01-01 01:00:00,91.927501,0.0,False
13908,2,91.927501,2017-01-01 02:00:00,90.722501,0.0,False
13909,3,90.722501,2017-01-01 03:00:00,87.402501,0.0,False
13910,4,87.402501,2017-01-01 04:00:00,81.485833,0.0,False


### Creating the proper index starting from 0

In [239]:
X_flow_test['index'] = range(0, len(X_flow_test)-1 + 1)

In [231]:
X_flow_test = X_flow_test.set_index('index')

### Creating the formula for next 24 hour prediction
Starting from known value for hour 0

In [234]:
for i in range(len(X_flow_test)-1):
    row = X_flow_test.iloc[[i]]
    next_row = X_flow_test.iloc[[i+1]]
    if row['check'].iloc[0] == True:
        test = row[['hour','actual-value']]
        X_flow_test.loc[i,('24-hour')] = X_flow_test['actual-value'].iloc[0]
        X_flow_test.loc[i+1,('previous-hour')] = X_flow_test.at[i,'24-hour']
    else:
        test = X_flow_test[['hour','previous-hour']].iloc[[i]]
        X_flow_test.loc[i,('24-hour')] = regressor.predict(test)
        X_flow_test.at[i+1,'previous-hour'] = X_flow_test.at[i,'24-hour']

### Showing the results 

In [240]:
X_flow_test.head()

Unnamed: 0_level_0,hour,previous-hour,time,actual-value,24-hour,check,index
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,99.808334,2017-01-01 00:00:00,94.330833,94.330833,True,0
1,1,94.330833,2017-01-01 01:00:00,91.927501,0.0,False,1
2,2,85.107501,2017-01-01 02:00:00,90.722501,0.0,False,2
3,3,77.250835,2017-01-01 03:00:00,87.402501,0.0,False,3
4,4,73.8,2017-01-01 04:00:00,81.485833,0.0,False,4


### Calculating the feature importance and error

j=1
i=0
for i in enumerate(X_flow_test):
    test = X_flow_test[['hour','previous-hour']][i:j]
    X_flow_test['24-hour'] = regressor.predict(test)
    X_flow_test['previous-hour'] = X_flow_test['24-hour'].shift(1),
    j=j+1