# Prediction of the flow
Extracting features from the rainfall

### Importing necessary libraries

In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeRegressor
import datetime as dt
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

### Loading the data sets

#### Rainfall data set

In [2]:
rainfall_set = pd.read_csv('../dataset/rainfall1.csv', parse_dates={'hour':['time']},index_col='hour')
dayfirst=True
rainfall_set = rainfall_set.resample('60T').mean()

In [3]:
rainfall_set = rainfall_set.reset_index()

#### Giving the range of the data '2015-06-01' - '2017-11-10'

In [4]:
rainfall_set = rainfall_set[(rainfall_set['hour'] > '2015-06-01') & (rainfall_set['hour'] <= '2017-11-10')]
rainfall_set.columns = ['time','rainfall']

#### Creating flow data set

In [5]:
flow_set = pd.read_csv('../dataset/flow1.csv', parse_dates={'hour':['time']},index_col='hour')
dayfirst=True
flow_set = flow_set.resample('60T').mean()

In [6]:
flow_set = flow_set.reset_index()

In [7]:
flow_set['time'] = flow_set['hour']

#### Leaving only a full hour as a feature

In [8]:
flow_set['hour'] = (flow_set['hour'].dt.hour)

In [9]:
flow_set.head()

Unnamed: 0,hour,flow,time
0,14,113.754443,2015-06-01 14:00:00
1,15,113.013333,2015-06-01 15:00:00
2,16,114.081666,2015-06-01 16:00:00
3,17,116.459167,2015-06-01 17:00:00
4,18,123.825,2015-06-01 18:00:00


### Merging two data sets into one 
Merging is based on time column

In [10]:
flow_set = pd.merge(flow_set, rainfall_set, how='left', on='time')

In [11]:
flow_set.head()

Unnamed: 0,hour,flow,time,rainfall
0,14,113.754443,2015-06-01 14:00:00,0.0
1,15,113.013333,2015-06-01 15:00:00,0.0
2,16,114.081666,2015-06-01 16:00:00,0.0
3,17,116.459167,2015-06-01 17:00:00,0.016667
4,18,123.825,2015-06-01 18:00:00,0.0


### Adding the feature of previous hour's flow value

In [12]:
flow_set['previous-hour'] = flow_set['flow'].shift(1)
flow_set.head()

Unnamed: 0,hour,flow,time,rainfall,previous-hour
0,14,113.754443,2015-06-01 14:00:00,0.0,
1,15,113.013333,2015-06-01 15:00:00,0.0,113.754443
2,16,114.081666,2015-06-01 16:00:00,0.0,113.013333
3,17,116.459167,2015-06-01 17:00:00,0.016667,114.081666
4,18,123.825,2015-06-01 18:00:00,0.0,116.459167


### Dealing with the missing values

In [13]:
flow_set = flow_set.fillna(flow_set.mean())

### Creating the features rain sum of past four hours 

In [26]:
flow_set['rain_sum_4'] = flow_set['rainfall'].rolling(min_periods=4, window=4).sum()

In [29]:
flow_set = flow_set.fillna(flow_set.mean())

In [30]:
flow_set.head()

Unnamed: 0,hour,flow,time,rainfall,previous-hour,rain_sum_4
0,14,113.754443,2015-06-01 14:00:00,0.0,109.165265,0.034343
1,15,113.013333,2015-06-01 15:00:00,0.0,113.754443,0.034343
2,16,114.081666,2015-06-01 16:00:00,0.0,113.013333,0.034343
3,17,116.459167,2015-06-01 17:00:00,0.016667,114.081666,0.016667
4,18,123.825,2015-06-01 18:00:00,0.0,116.459167,0.016667


In [31]:
split_date = pd.datetime(2016,12,31,23,59)

flow_training = flow_set.loc[flow_set['time'] <= split_date]
flow_test = flow_set.loc[flow_set['time'] > split_date]

In [32]:
X_flow_training = flow_training[['hour','previous-hour','rain_sum_4']]
X_flow_test = flow_test[['hour','previous-hour','rain_sum_4']]
Y_flow_training = flow_training['flow']
Y_flow_test = flow_test['flow']

In [33]:
regressor = DecisionTreeRegressor()
regressor.fit(X_flow_training, Y_flow_training)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [34]:
predict = regressor.predict(X_flow_test)

In [35]:
feature_importances = pd.DataFrame(regressor.feature_importances_,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
0,0.667801
1,0.325192
2,0.007007


In [36]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [37]:
mape_tree = mean_absolute_percentage_error(Y_flow_test, predict)
mape_tree

3.7934672469658475