In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import my_function
import time_dependent_tree
import random
from sklearn.model_selection import train_test_split
from statsmodels.tsa.seasonal import STL

In [2]:
np.random.seed(314) # 乱数シードを314に設定

# 合成波の作成
syn_t = my_function.make_wave(0.5, 1.0, 3.0, 100)[0]
syn_y = my_function.make_wave(0.5, 1.0, 3.0, 100)[1] + my_function.make_wave(0.75, 1.7, 3.0, 100)[1] + 0.05 * np.random.randn(len(syn_t))

# STL分解
stl=STL(syn_y, period=12, robust=True)
stl_series = stl.fit()

# データの整形
trend_x, trend_y = my_function.sliding_window(stl_series.trend)
seasonal_x, seasonal_y = my_function.sliding_window(stl_series.seasonal)
resid_x, resid_y = my_function.sliding_window(stl_series.resid)

# データの分割
trend_x_train, trend_x_test, trend_y_train, trend_y_test = train_test_split(trend_x, trend_y, test_size=0.1, shuffle=False)
seasonal_x_train, seasonal_x_test, seasonal_y_train, seasonal_y_test = train_test_split(seasonal_x, seasonal_y, test_size=0.1, shuffle=False)
resid_x_train, resid_x_test, resid_y_train, resid_y_test = train_test_split(resid_x, resid_y, test_size=0.1, shuffle=False)

## 学習部

In [3]:
max_depth = 5

tree_df = pd.DataFrame(columns=['n',  'mean',  'depth', 'eval', 'feature_index', 'threshold', 'leaf'], 
                       index=range(my_function.count_node(max_depth)[0]))
tree_ls = [[np.array([0]) for _ in range(2)] for _ in range(my_function.count_node(max_depth)[0])]

In [4]:
def best_split_fixed_depth(ls, node_index, eval_type, time, min_samples_leaf):
    
    best_evaluation = 10**8
    best_feature_index = -1
    best_threshold = None
    is_leaf = False
    num_df_row = ls[node_index][0].shape[0]

    thresholds, values = zip(*sorted(zip(ls[node_index][0][:, time], ls[node_index][1])))

    # 予測対象数だけループ
    for i in range(1, num_df_row):
        tentative_thresholds = thresholds[i - 1]
        left_node = values[0:i]
        right_node = values[i:]
        left_pred = np.full(len(left_node), np.mean(left_node))
        right_pred = np.full(len(right_node), np.mean(right_node))

        if(eval_type == "MSE"):
            evaluation = my_function.mean_squared_error(left_pred, left_node) + my_function.mean_squared_error(right_pred, right_node)

            if best_evaluation > evaluation and left_pred.shape[0] > min_samples_leaf and right_pred.shape[0] > min_samples_leaf:
                best_evaluation = evaluation
                best_feature_index = time
                best_threshold = tentative_thresholds

        elif(eval_type == "KLD_sum"):
            evaluation = calc_kld(left_pred, left_node) + calc_kld(right_pred, right_node)

            if best_evaluation > evaluation and left_pred.shape[0] > min_samples_leaf and right_pred.shape[0] > min_samples_leaf:
                best_evaluation = evaluation
                best_feature_index = time
                best_threshold = tentative_thresholds

        elif(eval_type == "KLD_def"):
            evaluation = abs(calc_kld(left_pred, left_node) - calc_kld(right_pred, right_node))

            if best_evaluation > evaluation and left_pred.shape[0] > min_samples_leaf and right_pred.shape[0] > min_samples_leaf:
                best_evaluation = evaluation
                best_feature_index = time
                best_threshold = tentative_thresholds

        else:
            break
            
    if best_evaluation == 10**8:
        is_leaf = True
    
    return [best_evaluation, best_feature_index, best_threshold, is_leaf]

In [5]:
def record_df(df, ls, node_index, time, split_array):
    df.loc[node_index, 'n'] = len(ls[node_index][1])
    df.loc[node_index, 'mean'] = np.mean(ls[node_index][1])
    df.loc[node_index, 'depth'] = time
    df.loc[node_index, 'eval'] = split_array[0]
    df.loc[node_index, 'feature_index'] = split_array[1]
    df.loc[node_index, 'threshold'] = split_array[2]
    df.loc[node_index, 'leaf'] = split_array[3]
    
    return df

In [6]:
def record_next_ls(df, ls, node_index, time, split_array):
    
    LEFT_NODE_INDEX = node_index*2 + 1
    RIGHT_NODE_INDEX = node_index*2 + 2
    
    concat_xy = np.hstack([ls[node_index][0], ls[node_index][1].reshape(len(ls[node_index][1]), 1)])
    left_node = concat_xy[concat_xy[:, df.loc[node_index, 'feature_index']] < df.loc[node_index, 'threshold']]
    right_node = concat_xy[concat_xy[:, df.loc[node_index, 'feature_index']] >= df.loc[node_index, 'threshold']]
    
    ls[LEFT_NODE_INDEX][0] = left_node[:, :-1]
    ls[LEFT_NODE_INDEX][1] = left_node[:, -1]
    ls[RIGHT_NODE_INDEX][0] = right_node[:, :-1]
    ls[RIGHT_NODE_INDEX][1] = right_node[:, -1]
    
    return ls

In [7]:
# 前処理
tree_ls[0][0] = seasonal_x_train
tree_ls[0][1] = seasonal_y_train
tree_df.loc[:, 'leaf'] = False
for time in range(max_depth):
    for node_index in my_function.count_node(time+1)[1]:
        tree_df.loc[node_index, 'depth'] = time

In [8]:
for time in range(max_depth):
    for node_index in my_function.count_node(time+1)[1]:
        
        left_node_index = node_index*2 + 1
        right_node_index = node_index*2 + 2
        
        # 存在するノードなら
        if tree_df.loc[node_index, 'n'] != -1:
            # 最下層でなければ
            if  tree_df.loc[node_index, 'depth'] < max_depth-1:
                # 分割点を探す
                split_point_array = best_split_fixed_depth(ls=tree_ls, 
                                                           node_index=node_index,
                                                           eval_type="MSE",
                                                           time=time, 
                                                           min_samples_leaf=5)

            # ノードの情報を記録する
            tree_df = record_df(df=tree_df, 
                                ls=tree_ls, 
                                node_index=node_index, 
                                time=time, 
                                split_array=split_point_array)
            
            # 直前の実行内容で葉に切り替わったら
            if tree_df.loc[node_index, 'leaf']:
                # その先のノードは存在しない扱いにする
                tree_df.loc[left_node_index, 'n'] = -1
                tree_df.loc[right_node_index, 'n'] = -1
            # まだ節であれば
            else:
                # その先のノードにデータを付与する
                tree_ls = record_next_ls(df=tree_df, 
                                        ls=tree_ls, 
                                        node_index=node_index, 
                                        time=time,
                                        split_array=split_point_array)
       
        # 存在しないノードならば
        else:
            tree_df.loc[left_node_index, 'n'] = -1
            tree_df.loc[right_node_index, 'n'] = -1

In [9]:
# 後処理
tree_df = tree_df[:my_function.count_node(max_depth)[0]]

In [10]:
tree_df

Unnamed: 0,n,mean,depth,eval,feature_index,threshold,leaf
0,265,0.000361,0,0.001117,0.0,-0.055642,False
1,5,0.026912,1,100000000.0,-1.0,,True
2,260,-0.00015,1,0.001208,1.0,-0.020279,False
3,-1,,2,,,,False
4,-1,,2,,,,False
5,51,0.001001,2,0.000555,2.0,0.036626,False
6,209,-0.000431,2,0.001014,2.0,0.057249,False
7,-1,,3,,,,False
8,-1,,3,,,,False
9,-1,,3,,,,False
