# **シンプルなブレンディングを試してみる**

### **必要な関数・ライブラリ**

In [1]:
# データ加工・処理・分析モジュール
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os
import random

%matplotlib inline

import time

In [2]:
def set_time(dataframe, col_name):
    '''
    Pre-process a timestamp column so that it can be handed to to_datetime.

    Every value of ``dataframe[col_name]`` is replaced by the result of
    ``transform_time``. The frame is modified in place and also returned.
    '''
    converted = dataframe[col_name].map(transform_time)
    dataframe[col_name] = converted
    return dataframe

In [3]:
def transform_time(x):
    '''
    Helper used by set_time.

    Turns a ``YYYYMMDDhhmm`` value (int or str) into the string
    ``'YYYY-MM-DD hh:mm'``.

    to_datetime does not accept hour 24, so ``24:mm`` is rewritten as
    ``00:mm``. Hour 24 of a day is hour 0 of the *following* day, so the date
    is advanced by one day as well; keeping the same date would move the
    timestamp back by a full day.

    Raises ValueError if the hour is 24 and the date part is not a valid
    calendar date.
    '''
    # Local import: only this helper needs calendar arithmetic.
    from datetime import date, timedelta

    stamp = str(x)
    year, month, day = stamp[0:4], stamp[4:6], stamp[6:8]
    hour, minute = stamp[8:10], stamp[10:12]

    if hour == '24':
        # Roll over month/year boundaries correctly (e.g. 12/31 -> 01/01).
        next_day = date(int(year), int(month), int(day)) + timedelta(days=1)
        return next_day.isoformat() + ' 00:' + minute

    return year + '-' + month + '-' + day + ' ' + hour + ':' + minute

In [4]:
def drop_nan(X, Y):
    '''
    Drop every (input, target) pair whose target value is NaN.

    Returns the filtered X and Y, in that order.
    '''
    keep = ~np.isnan(Y)  # True where the target is present
    return X[keep], Y[keep]

In [5]:
# Hyperparameters passed to every RandomForestRegressor in this notebook.
max_features = "sqrt"  # features considered at each split
max_depth = 4          # maximum depth of each tree
n_estimators = 20      # number of trees in the forest

In [20]:
# Generation (power output) data; the per-plant 'SOLA0<n>' columns are
# selected from this frame further down.
all_output_30 = pd.read_csv('data/processed_data/out_put.tsv', sep='\t')
all_output_30['datetime'] = all_output_30['datetime'].map(pd.to_datetime)

### **浮島発電所について**

In [21]:
# Plant number for this section: it selects the 'SOLA0<n>' generation column
# and the matching prediction files loaded below.
target_place = 1

In [22]:
# Generation data: keep only the timestamp and this plant's column.
output_30 = all_output_30[['datetime', 'SOLA0{}'.format(target_place)]]

In [23]:
# Predictions of the various base models (for training the blender).
train_prediction = pd.read_csv(
    "data/predicted_data/predict_train_SOLA0{}.tsv".format(target_place),
    sep="\t",
)
train_prediction['datetime'] = train_prediction['datetime'].map(pd.to_datetime)

In [24]:
# Predictions of the various base models (the real run, i.e. the rows the
# blender has to predict).
test_prediction = pd.read_csv(
    "data/predicted_data/predict_SOLA0{}.tsv".format(target_place),
    sep="\t",
)
test_prediction['datetime'] = test_prediction['datetime'].map(pd.to_datetime)

In [25]:
# Show the column names: 'datetime' followed by one column per base model.
test_prediction.columns

Index(['datetime', 'targetplace_1_kwh_yokohama_one_layer_1000',
       'targetplace_1_kwh_yokohama_nagoya_one_layer_1000',
       'targetplace_1_kwh_yokohama_nagoya_one_layer_2000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_one_layer_2000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_one_layer_3000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_one_layer_4000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_one_layer_5000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_one_layer_2000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_one_layer_3000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_one_layer_4000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_hannou_one_layer_2000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_hannou_one_layer_3000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_hannou_one_layer_4000',
       'targetplace_1_kwh_yokohama_nagoya_hamamatsu_osaka_hannou_one_la

In [26]:
# First and last timestamps covered by the training predictions
# (row labels 0 and len-1 of the frame as read from the TSV).
t_s_datetime = train_prediction['datetime'].loc[0]
t_e_datetime = train_prediction['datetime'].loc[len(train_prediction) - 1]

# Rows of the generation data carrying those two timestamps.
train_y_s_idx = output_30.loc[output_30['datetime'] == t_s_datetime].index[0]
train_y_e_idx = output_30.loc[output_30['datetime'] == t_e_datetime].index[0]

# Features: every base-model prediction column. Target: the observed
# generation between the two rows found above (end row included).
X = np.array(train_prediction.drop("datetime", axis=1))
Y = output_30["SOLA0" + str(target_place)].iloc[train_y_s_idx:train_y_e_idx + 1]
X_test = np.array(test_prediction.drop("datetime", axis=1))

In [27]:
# Remove, from both X and Y, the rows whose observed generation (Y) is NaN.
X, Y = drop_nan(X, Y)

In [28]:
# Hold out 30% of the rows (rounded down) for validation.
# random_state is pinned so that the split, and therefore the MAE values
# reported below, are the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=int(X.shape[0] * 0.3), random_state=0)

In [29]:
# Random forest regressor, fitted on the base-model predictions.
# random_state is pinned so the fitted forest (and the MAE printed below) can
# be reproduced on a re-run with the same train/validation split.
rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth,
                           max_features=max_features, random_state=0)
rf.fit(X_train, Y_train)
Y_train_pred = rf.predict(X_train)
Y_val_pred = rf.predict(X_val)
# Mean absolute error on the training rows and on the held-out rows.
print("train_mae : ", np.abs(Y_train_pred - Y_train).mean())
print("val_mae : ", np.abs(Y_val_pred - Y_val).mean())

train_mae :  95.9022445594
val_mae :  97.2384136823


In [30]:
# Blended prediction for every row of test_prediction.
Y_test_pred = rf.predict(X_test)

In [31]:
# Rows of test_prediction that bound the period to be exported
# (2016/01/01 00:00 through 2017/3/31 23:30, both included).
s_idx = test_prediction.loc[test_prediction['datetime'] == pd.to_datetime('2016/01/01 00:00')].index[0]
e_idx = test_prediction.loc[test_prediction['datetime'] == pd.to_datetime('2017/3/31 23:30')].index[0]

# Output frame: starts with the timestamps only, renumbered from 0; one
# 'blending_<n>' column per plant is added to it later.
predict_data = pd.DataFrame({"datetime": test_prediction['datetime'].iloc[s_idx:e_idx + 1]})
predict_data.index = np.arange(len(predict_data))

In [32]:
# Store this plant's blended predictions for 2016/01/01 00:00 - 2017/3/31 23:30
# (the file itself is written by the to_csv cell at the end of the notebook).
# Y_test_pred holds one value per row of test_prediction, whereas predict_data
# only holds rows s_idx..e_idx, so take the matching slice. This is a no-op
# when test_prediction covers exactly that period and avoids a length-mismatch
# error when it contains extra rows.
predict_data["blending_"+str(target_place)] = Y_test_pred[s_idx:e_idx+1]

### **扇島発電所について**

In [33]:
# Plant number for this section: it selects the 'SOLA0<n>' generation column
# and the matching prediction files loaded below.
target_place = 2

In [34]:
# Generation data: keep only the timestamp and this plant's column.
output_30 = all_output_30[['datetime', 'SOLA0{}'.format(target_place)]]

In [35]:
# Predictions of the various base models (for training the blender).
train_prediction = pd.read_csv(
    "data/predicted_data/predict_train_SOLA0{}.tsv".format(target_place),
    sep="\t",
)
train_prediction['datetime'] = train_prediction['datetime'].map(pd.to_datetime)

In [36]:
# Predictions of the various base models (the real run, i.e. the rows the
# blender has to predict).
test_prediction = pd.read_csv(
    "data/predicted_data/predict_SOLA0{}.tsv".format(target_place),
    sep="\t",
)
test_prediction['datetime'] = test_prediction['datetime'].map(pd.to_datetime)

In [38]:
# Show the column names: 'datetime' followed by one column per base model.
test_prediction.columns

Index(['datetime', 'targetplace_2_kwh_yokohama_one_layer_1000',
       'targetplace_2_kwh_yokohama_nagoya_one_layer_1000',
       'targetplace_2_kwh_yokohama_nagoya_one_layer_2000',
       'targetplace_2_kwh_yokohama_nagoya_one_layer_3000',
       'targetplace_2_kwh_yokohama_nagoya_one_layer_4000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_one_layer_2000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_one_layer_3000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_one_layer_4000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_one_layer_5000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_osaka_one_layer_2000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_osaka_one_layer_3000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_osaka_one_layer_4000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_osaka_hannou_one_layer_2000',
       'targetplace_2_kwh_yokohama_nagoya_hamamatsu_osaka_hannou_one_layer_3000',
       'targetplace_2_kwh_yokohama_

In [37]:
# First and last timestamps covered by the training predictions
# (row labels 0 and len-1 of the frame as read from the TSV).
t_s_datetime = train_prediction['datetime'].loc[0]
t_e_datetime = train_prediction['datetime'].loc[len(train_prediction) - 1]

# Rows of the generation data carrying those two timestamps.
train_y_s_idx = output_30.loc[output_30['datetime'] == t_s_datetime].index[0]
train_y_e_idx = output_30.loc[output_30['datetime'] == t_e_datetime].index[0]

# Features: every base-model prediction column. Target: the observed
# generation between the two rows found above (end row included).
X = np.array(train_prediction.drop("datetime", axis=1))
Y = output_30["SOLA0" + str(target_place)].iloc[train_y_s_idx:train_y_e_idx + 1]
X_test = np.array(test_prediction.drop("datetime", axis=1))

In [22]:
# Remove, from both X and Y, the rows whose observed generation (Y) is NaN.
X, Y = drop_nan(X, Y)

In [23]:
# Hold out 30% of the rows (rounded down) for validation.
# random_state is pinned so that the split, and therefore the MAE values
# reported below, are the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=int(X.shape[0] * 0.3), random_state=0)

In [24]:
# Random forest regressor, fitted on the base-model predictions.
# random_state is pinned so the fitted forest (and the MAE printed below) can
# be reproduced on a re-run with the same train/validation split.
rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth,
                           max_features=max_features, random_state=0)
rf.fit(X_train, Y_train)
Y_train_pred = rf.predict(X_train)
Y_val_pred = rf.predict(X_val)
# Mean absolute error on the training rows and on the held-out rows.
print("train_mae : ", np.abs(Y_train_pred - Y_train).mean())
print("val_mae : ", np.abs(Y_val_pred - Y_val).mean())

train_mae :  164.199206582
val_mae :  168.723488278


In [25]:
# Blended prediction for every row of test_prediction.
Y_test_pred = rf.predict(X_test)

In [26]:
# Store this plant's blended predictions for 2016/01/01 00:00 - 2017/3/31 23:30
# (the file itself is written by the to_csv cell at the end of the notebook).
# predict_data was built from the first plant's test file, so rows are matched
# on their timestamp instead of on position: extra or differently ordered rows
# in this plant's file no longer break or shift the column. The result is
# identical when both files list the same timestamps in the same order.
aligned_pred = pd.Series(
    Y_test_pred, index=test_prediction['datetime'].values
).reindex(predict_data['datetime'].values)
if aligned_pred.isnull().any():
    # A gap would otherwise be written out silently as NaN.
    raise ValueError("test_prediction is missing timestamps required by predict_data")
predict_data["blending_"+str(target_place)] = aligned_pred.values

### **米倉山発電所について**

In [27]:
# Plant number for this section: it selects the 'SOLA0<n>' generation column
# and the matching prediction files loaded below.
target_place = 3

In [28]:
# Generation data: keep only the timestamp and this plant's column.
# Reuses all_output_30, which was read from the same TSV and already has its
# 'datetime' column parsed, instead of reading and re-parsing the file again;
# this matches how the other two plant sections build output_30.
output_30 = all_output_30[['datetime', 'SOLA0'+str(target_place)]]

In [29]:
# Predictions of the various base models (for training the blender).
train_prediction = pd.read_csv(
    "data/predicted_data/predict_train_SOLA0{}.tsv".format(target_place),
    sep="\t",
)
train_prediction['datetime'] = train_prediction['datetime'].map(pd.to_datetime)

In [30]:
# Predictions of the various base models (the real run, i.e. the rows the
# blender has to predict).
test_prediction = pd.read_csv(
    "data/predicted_data/predict_SOLA0{}.tsv".format(target_place),
    sep="\t",
)
test_prediction['datetime'] = test_prediction['datetime'].map(pd.to_datetime)

In [31]:
# First and last timestamps covered by the training predictions
# (row labels 0 and len-1 of the frame as read from the TSV).
t_s_datetime = train_prediction['datetime'].loc[0]
t_e_datetime = train_prediction['datetime'].loc[len(train_prediction) - 1]

# Rows of the generation data carrying those two timestamps.
train_y_s_idx = output_30.loc[output_30['datetime'] == t_s_datetime].index[0]
train_y_e_idx = output_30.loc[output_30['datetime'] == t_e_datetime].index[0]

# Features: every base-model prediction column. Target: the observed
# generation between the two rows found above (end row included).
X = np.array(train_prediction.drop("datetime", axis=1))
Y = output_30["SOLA0" + str(target_place)].iloc[train_y_s_idx:train_y_e_idx + 1]
X_test = np.array(test_prediction.drop("datetime", axis=1))

In [32]:
# Remove, from both X and Y, the rows whose observed generation (Y) is NaN.
X, Y = drop_nan(X, Y)

In [33]:
# Hold out 30% of the rows (rounded down) for validation.
# random_state is pinned so that the split, and therefore the MAE values
# reported below, are the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=int(X.shape[0] * 0.3), random_state=0)

In [34]:
# Random forest regressor, fitted on the base-model predictions.
# random_state is pinned so the fitted forest (and the MAE printed below) can
# be reproduced on a re-run with the same train/validation split.
rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth,
                           max_features=max_features, random_state=0)
rf.fit(X_train, Y_train)
Y_train_pred = rf.predict(X_train)
Y_val_pred = rf.predict(X_val)
# Mean absolute error on the training rows and on the held-out rows.
print("train_mae : ", np.abs(Y_train_pred - Y_train).mean())
print("val_mae : ", np.abs(Y_val_pred - Y_val).mean())

train_mae :  123.034612653
val_mae :  122.594891208


In [35]:
# Blended prediction for every row of test_prediction.
Y_test_pred = rf.predict(X_test)

In [36]:
# Store this plant's blended predictions for 2016/01/01 00:00 - 2017/3/31 23:30
# (the file itself is written by the to_csv cell at the end of the notebook).
# predict_data was built from the first plant's test file, so rows are matched
# on their timestamp instead of on position: extra or differently ordered rows
# in this plant's file no longer break or shift the column. The result is
# identical when both files list the same timestamps in the same order.
aligned_pred = pd.Series(
    Y_test_pred, index=test_prediction['datetime'].values
).reindex(predict_data['datetime'].values)
if aligned_pred.isnull().any():
    # A gap would otherwise be written out silently as NaN.
    raise ValueError("test_prediction is missing timestamps required by predict_data")
predict_data["blending_"+str(target_place)] = aligned_pred.values

### **書き出す**

In [37]:
# Write the timestamps and the 'blending_<n>' columns to a tab-separated
# file, without the row index.
predict_data.to_csv('data/predicted_data/simple_blending_prediction.tsv', sep = '\t', index=False)