In [None]:
import os
import pickle
import sys
import warnings
from glob import glob
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt
import datetime as dt
import statistics
import random
import eli5
from pyti.ichimoku_cloud import tenkansen, kijunsen, chiku_span, senkou_a, senkou_b
from pyti.bollinger_bands import upper_bollinger_band, middle_bollinger_band, lower_bollinger_band, bandwidth, percent_bandwidth, range
from pyti.relative_strength_index import relative_strength_index
from pyti.exponential_moving_average import exponential_moving_average
from pyti.weighted_moving_average import weighted_moving_average
from pyti.volume_adjusted_moving_average import volume_adjusted_moving_average
from pyti.moving_average_convergence_divergence import moving_average_convergence_divergence
from pyti.stochastic import percent_d, percent_k
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm
warnings.simplefilter('ignore')

%matplotlib inline
# Widen pandas' display limits so wide/long feature tables are not truncated
# when a cell shows a DataFrame.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

## Load Data

In [None]:
# Directory holding the gzipped CSV inputs, relative to the notebook.
dataset_dir="../input"
# Logical dataset name -> file path; the keys are reused as the keys of `dfs`.
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

In [None]:
# Read every input file into a DataFrame, keyed by its logical dataset name.
dfs = {name: pd.read_csv(path) for name, path in inputs.items()}

In [None]:
# Date boundaries of the hold-out split used by get_features_and_label:
# train: up to TRAIN_END, validation: VAL_START..VAL_END, test: TEST_START onwards.
TRAIN_END = "2018-12-31"
VAL_START = "2019-01-01"
VAL_END = "2019-12-31"
TEST_START = "2020-01-01"

## Feature Engineering

In [None]:
# Working copy of the stock master table (one row per listed stock).
stock_list = dfs["stock_list"].copy()

In [None]:
# Sector names and issued-share count per stock.
# NOTE: get_features_for_predict builds its own copy of this table from `dfs`,
# so this module-level variable is only for interactive inspection.
fin_list = stock_list[["Local Code", "33 Sector(name)", "17 Sector(name)", "IssuedShareEquityQuote IssuedShare"]].copy()

In [None]:
def get_features_for_predict(dfs, code):
    """Build the feature table for one stock from its financial statements and prices.

    Financial-statement ratios and price/volume indicators are computed
    separately, restricted to the dates that appear in both tables, joined
    column-wise, and then extended with valuation features that need both
    sources (market capitalization, PER, PBR).

    Args:
        dfs (dict): must contain the DataFrames "stock_fin", "stock_price"
            and "stock_list".
        code (int): "Local Code" of the stock to process.

    Returns:
        pd.DataFrame: indexed by "datetime", one row per date present in both
        the financial and the price data, with a trailing "code" column.
        Missing values are zero-filled before the join and +/-inf produced by
        the ratio features is replaced by 0.
    """
    net_sales = "Result_FinancialStatement NetSales"
    operating = "Result_FinancialStatement OperatingIncome"
    ordinary = "Result_FinancialStatement OrdinaryIncome"
    net_income = "Result_FinancialStatement NetIncome"
    net_assets = "Result_FinancialStatement NetAssets"
    total_assets = "Result_FinancialStatement TotalAssets"
    shares = "IssuedShareEquityQuote IssuedShare"
    close = "EndOfDayQuote ExchangeOfficialClose"
    volume = "EndOfDayQuote Volume"

    ################## stock_fin_feat ##################
    # Keep only this stock's statements. Filtering first and copying the small
    # result avoids duplicating the whole table on every call.
    stock_fin = dfs["stock_fin"]
    fin_data = stock_fin[stock_fin["Local Code"] == code].copy()

    # Attach sector names and the issued-share count from the stock master.
    stock_list = dfs["stock_list"]
    fin_list = stock_list[["Local Code", "33 Sector(name)", "17 Sector(name)", shares]]
    fin_data = pd.merge(fin_data, fin_list, how="left", on="Local Code")

    # Index by statement date.
    fin_data["datetime"] = pd.to_datetime(fin_data["base_date"])
    fin_data = fin_data.set_index("datetime")

    # Keep float64 columns only; this also drops the string sector columns
    # merged above.
    # NOTE(review): the issued-share column must be float64 to survive this
    # filter, otherwise EPS/BPS below raise KeyError - confirm against the data.
    fin_data = fin_data.select_dtypes(include=["float64"])

    # --- Profitability (percent of net sales) ---
    fin_data["Rate_Operating_Margin"] = fin_data[operating] / fin_data[net_sales] * 100
    fin_data["Rate_Ordinary_Margin"] = fin_data[ordinary] / fin_data[net_sales] * 100
    # Fix: the net-income margin used to be written to "Rate_Ordinary_Margin",
    # silently replacing the ordinary margin. It now has its own column.
    fin_data["Rate_NetIncome_Margin"] = fin_data[net_income] / fin_data[net_sales] * 100

    # --- Growth versus four rows earlier ---
    # presumably four statements back is the same quarter of the previous
    # year; this relies on the rows being in chronological order - TODO confirm.
    fin_data["Rate_Grow_NetSales"] = fin_data[net_sales].pct_change(4)
    fin_data["Rate_Grow_Operating"] = fin_data[operating].pct_change(4)
    fin_data["Rate_Grow_Ordinary"] = fin_data[ordinary].pct_change(4)
    fin_data["Rate_Grow_NetIncome"] = fin_data[net_income].pct_change(4)

    # --- Return and per-share ratios ---
    fin_data["ROE"] = fin_data[net_income] / fin_data[net_assets] * 100
    fin_data["ROA"] = fin_data[net_income] / fin_data[total_assets] * 100
    # NOTE(review): the "* 100" scale on EPS is kept from the original; it
    # looks like a unit adjustment but nothing here establishes that.
    fin_data["EPS"] = fin_data[net_income] / fin_data[shares] * 100
    # Fix: book value per share is net assets (equity) per share; the original
    # divided total assets by the share count, which also skewed PBR below.
    fin_data["BPS"] = fin_data[net_assets] / fin_data[shares]
    fin_data["Rate_Grow_ROE"] = fin_data["ROE"].pct_change(4)
    fin_data["Rate_Grow_ROA"] = fin_data["ROA"].pct_change(4)

    # --- Leverage: liabilities = total assets - net assets ---
    fin_data["Result_FinancialStatement Liability"] = fin_data[total_assets] - fin_data[net_assets]
    fin_data["Rate Liability"] = fin_data["Result_FinancialStatement Liability"] / fin_data[net_assets] * 100

    # --- Four-row change of each forecast, as a percent of the current result ---
    for item in ("NetSales", "OperatingIncome", "OrdinaryIncome", "NetIncome"):
        forecast_change = fin_data["Forecast_FinancialStatement " + item].diff(4)
        fin_data["Diff Forecast Result " + item] = (
            forecast_change / fin_data["Result_FinancialStatement " + item] * 100
        )

    # Zero-fill missing values.
    fin_feats = fin_data.fillna(0)

    ################## stock_price_feat ##################
    # Keep only this stock's prices (filter first, copy the small result).
    price = dfs["stock_price"]
    price_data = price[price["Local Code"] == code].copy()

    # Index by quote date.
    price_data["datetime"] = pd.to_datetime(price_data["EndOfDayQuote Date"])
    price_data = price_data.set_index("datetime")

    # Start from close and volume.
    feats = price_data[[close, volume]].copy()

    # 20-business-day return of the close.
    feats["return_1month"] = feats[close].pct_change(20)
    # Close relative to its 20-day simple moving average.
    feats["MA_gap_1month"] = feats[close] / feats[close].rolling(20).mean()
    # Close relative to its 20-day exponential moving average.
    feats["EMA_gap_1month"] = feats[close] / feats[close].ewm(span=20).mean()
    # 20-day mean of the traded volume.
    feats["Volume_mean_1month"] = feats[volume].rolling(window=20).mean()
    # 20-period RSI of the close.
    feats["rsi"] = relative_strength_index(feats[close], period=20)

    # Zero-fill missing values (indicator warm-up rows).
    feats = feats.fillna(0)

    # Keep only the dates that exist in both tables, then join column-wise.
    feats = feats.loc[feats.index.isin(fin_feats.index)]
    fin_feats = fin_feats.loc[fin_feats.index.isin(feats.index)]
    feats = pd.concat([feats, fin_feats], axis=1).dropna()

    ################## stock_fin&price_feat ##################
    # Market capitalization = issued shares * close.
    feats["Market Capitalization"] = feats[shares] * feats[close]
    # Price / earnings per share.
    feats["PER"] = feats[close] / feats["EPS"]
    # Price / book value per share.
    feats["PBR"] = feats[close] / feats["BPS"]

    # The raw close is not used as a feature itself.
    feats = feats.drop([close], axis=1)

    # Divisions by zero above yield +/-inf; map them to 0.
    feats = feats.replace([np.inf, -np.inf], 0)

    # Tag the rows with the stock code so tables can be concatenated.
    feats["code"] = code

    return feats

In [None]:
def get_features_and_label(dfs, codes, feature, label):
    """Align features with one target label and split both by date (hold-out).

    Args:
        dfs (dict): must contain the "stock_labels" DataFrame.
        codes (iterable): "Local Code" values to process.
        feature (pd.DataFrame): datetime-indexed features with a "code"
            column, i.e. get_features_for_predict output concatenated over codes.
        label (str): name of the target column in stock_labels.

    Returns:
        tuple: (train_X, train_y, val_X, val_y, test_X, test_y). The split
        boundaries are the module-level TRAIN_END, VAL_START, VAL_END and
        TEST_START constants.

    Raises:
        ValueError: if no code has both features and non-missing labels.
    """
    # Prepare the label table once: the original copied the full table and
    # re-parsed its dates on every loop iteration, and then assigned into a
    # filtered slice (the pattern behind SettingWithCopyWarning).
    stock_labels = dfs["stock_labels"].copy()
    stock_labels["datetime"] = pd.to_datetime(stock_labels["base_date"])
    stock_labels = stock_labels.set_index("datetime")

    # Per-code pieces, concatenated at the end.
    trains_X, vals_X, tests_X = [], [], []
    trains_y, vals_y, tests_y = [], [], []

    for code in tqdm(codes):
        feats = feature[feature["code"] == code]
        # This code's target values with missing labels removed.
        labels = stock_labels.loc[stock_labels["Local Code"] == code, label].dropna()

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # Keep only the dates present on both sides.
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]
            # Both sides now hold the same dates; reuse the feature index.
            # Raises ValueError if the lengths differ (e.g. duplicated dates).
            labels.index = feats.index

            # Label-based date slices; both ends are inclusive.
            trains_X.append(feats.loc[:TRAIN_END].copy())
            vals_X.append(feats.loc[VAL_START:VAL_END].copy())
            tests_X.append(feats.loc[TEST_START:].copy())

            trains_y.append(labels.loc[:TRAIN_END].copy())
            vals_y.append(labels.loc[VAL_START:VAL_END].copy())
            tests_y.append(labels.loc[TEST_START:].copy())

    if not trains_X:
        # pd.concat([]) would raise a ValueError too, but without context.
        raise ValueError(
            "no code in `codes` has both features and labels for {!r}".format(label)
        )

    # Stack the per-code feature tables.
    train_X = pd.concat(trains_X)
    val_X = pd.concat(vals_X)
    test_X = pd.concat(tests_X)

    # Stack the per-code targets.
    train_y = pd.concat(trains_y)
    val_y = pd.concat(vals_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, val_X, val_y, test_X, test_y

In [None]:
def get_codes(dfs, limit=150):
    """Return the first `limit` stock codes flagged as prediction targets.

    Args:
        dfs (dict): must contain the "stock_list" DataFrame with the columns
            "prediction_target" and "Local Code".
        limit (int or None): maximum number of codes to return, in table
            order. ``None`` returns every target code. Defaults to 150.

    Returns:
        numpy.ndarray: at most `limit` local codes.
    """
    stock_list = dfs["stock_list"]
    # Codes of the stocks that are prediction targets.
    codes = stock_list[stock_list["prediction_target"] == True]["Local Code"].values
    # A plain slice replaces the hand-built index list 0..149:
    #  * it no longer raises IndexError when fewer than `limit` codes exist;
    #  * it does not call `range`, a name the notebook's import cell rebinds
    #    to pyti.bollinger_bands.range, shadowing the builtin.
    return codes[:limit]

In [None]:
def create_model(dfs, codes, label):
    """Train a LightGBM regressor on three walk-forward periods and score each on the test split.

    For each start date in the period list, a model is fitted on the rows
    from that date up to about nine months later, early-stopped on the
    following ~2.5 months, and evaluated (Spearman correlation and RMSE) on
    the test split produced by get_features_and_label.

    Args:
        dfs (dict)  : dict of pd.DataFrame include stock_fin, stock_price
        codes (list[int]): local codes of the listed companies to use
        label (str): prediction target label
    Returns:
        tuple: (model, train_X, model_feature) where
            model is the LGBMRegressor fitted on the LAST period only,
            train_X is the train+validation feature table after dropping
                unused columns,
            model_feature is a list with one feature_importances_ array per
                period.
    """
    # Build the features for every code.
    feature = pd.concat([get_features_for_predict(dfs, code) for code in codes])

    # Align features with the target and split by date.
    train_X, train_y, val_X, val_y, test_X, test_y = get_features_and_label(
            dfs, codes, feature, label
    )

    # The hold-out validation split is folded back into the training pool;
    # the walk-forward windows below carve their own validation ranges.
    train_X = pd.concat([train_X, val_X])
    train_y = pd.concat([train_y, val_y])

    # Drop columns that are not used as features.
    drop_col = ["code", "Result_FinancialStatement FiscalYear",
                "Result_Dividend FiscalYear",
                "Result_FinancialStatement TotalAssets",
                "Result_FinancialStatement NetAssets",
                "Result_FinancialStatement Liability",
                "Forecast_Dividend FiscalYear",
                "Result_FinancialStatement CashFlowsFromFinancingActivities",
                "Result_FinancialStatement CashFlowsFromInvestingActivities",
                "Result_FinancialStatement CashFlowsFromOperatingActivities",
                "Forecast_FinancialStatement FiscalYear"]
    train_X = train_X.drop(drop_col, axis=1)
    test_X = test_X.drop(drop_col, axis=1)

    # Per-period scores and feature importances.
    spear = []
    rmse = []
    model_feature = []

    period_list = ["2016-07-01","2017-07-01", "2018-07-01"]
    for period in period_list:
        # Window boundaries relative to the period start. DateOffset replaces
        # the manual dt.date(year+1, month-4, day+24) arithmetic, which raised
        # ValueError for start dates whose shifted month/day left the calendar
        # range; for the dates above the results are identical
        # (e.g. 2016-07-01 -> 2017-03-25 / 2017-03-26 / 2017-06-11).
        period_start = pd.Timestamp(period)
        period_tr_end = period_start + pd.DateOffset(years=1, months=-4, days=24)
        period_va_start = period_start + pd.DateOffset(years=1, months=-4, days=25)
        period_va_end = period_start + pd.DateOffset(years=1, months=-1, days=10)

        # Strict inequalities: the boundary dates themselves are excluded.
        is_tr = (train_X.index > period_start) & (train_X.index < period_tr_end)
        is_va = (train_X.index > period_va_start) & (train_X.index < period_va_end)

        model = lgb.LGBMRegressor(random_state=0, n_estimators=300)
        # NOTE(review): early_stopping_rounds/verbose as fit() keywords are
        # kept as in the original; check them against the installed LightGBM
        # version before upgrading.
        model.fit(train_X[is_tr], train_y[is_tr],
                  eval_set=[(train_X[is_va], train_y[is_va])],
                  eval_metric='rmse',
                  # One name per eval_set entry (the original passed a tuple
                  # of two names for a single evaluation set).
                  eval_names=['eval_set'],
                  early_stopping_rounds=20,
                  verbose=300)

        pred_y = model.predict(test_X)

        spear.append(spearmanr(test_y, pred_y)[0])
        rmse.append(np.sqrt(mean_squared_error(test_y, pred_y)))
        model_feature.append(model.feature_importances_)

    print("============= final result =============")
    print("spear : " + str(spear))
    print("rmse : " + str(rmse))
    print("============= final mean result =============")
    print("spear_means : " + str(statistics.mean(spear)))
    print("rmse_means : " + str(statistics.mean(rmse)))

    return model,train_X, model_feature

In [None]:
# Subset of prediction-target codes used for this experiment.
codes = get_codes(dfs)

In [None]:
# Train on the "label_high_20" target. Expects the create_model defined above,
# which returns three values (model, training features, per-period importances).
model,train_X, model_feature = create_model(dfs, codes, label="label_high_20")

In [None]:
# Average the per-period importances and list the features from most to
# least important.
mean_importance = np.average(model_feature, axis=0)
importance_table = pd.DataFrame(
    mean_importance,
    index=train_X.columns,
    columns=['importance'],
)
importance_table.sort_values('importance', ascending=False)

In [None]:
# SHAP values
# Explains `model` (the last period's model returned by create_model) on the
# full training feature table and plots the per-feature summary as a bar chart
# covering every column.
shap.initjs()
explainer = shap.TreeExplainer(model=model, feature_perturbation='tree_path_dependent', model_output='margin')
shap_values = explainer.shap_values(X=train_X)
shap.summary_plot(shap_values, train_X, plot_type="bar", max_display=train_X.shape[1])

In [None]:
# eli5
# Gain-based feature weights of `model`, showing up to 100 features.
eli5.explain_weights(model, top=100, importance_type='gain')

In [None]:
# Scratch cell: rebuild the walk-forward train/validation masks outside of
# create_model so they can be inspected. After the loop, the variables hold
# the values for the last period only.
period_list = ["2016-07-01","2017-07-01", "2018-07-01"]
for idx, period in enumerate(period_list):
    # Split the "YYYY-MM-DD" string into its numeric parts.
    start_year = int(period[0:4])
    start_month = int(period[5:7])
    start_day = int(period[8:10])
    next_year = start_year + 1

    # Training window ends, and validation window starts/ends, in the next year.
    period_tr_end = dt.date(next_year, start_month - 4, start_day + 24)
    period_va_start = dt.date(next_year, start_month - 4, start_day + 25)
    period_va_end = dt.date(next_year, start_month - 1, start_day + 10)

    tr_end_text = period_tr_end.strftime('%Y-%m-%d')
    va_start_text = period_va_start.strftime('%Y-%m-%d')
    va_end_text = period_va_end.strftime('%Y-%m-%d')

    # Strict inequalities: boundary dates are excluded from both masks.
    after_period_start = train_X.index > period
    is_tr = after_period_start & (train_X.index < tr_end_text)
    is_va = (train_X.index > va_start_text) & (train_X.index < va_end_text)

In [None]:
# Single stock code used by the exploratory cells below.
code = 1301

In [None]:
# Sector names and issued-share count for the selected stock only.
# Rebinds the module-level `stock_list` / `fin_list` defined earlier.
stock_list = dfs["stock_list"].copy()
fin_list = stock_list[stock_list["Local Code"] == code][["Local Code", "33 Sector(name)", "17 Sector(name)", "IssuedShareEquityQuote IssuedShare"]].copy()

In [None]:
# Financial statements of the selected stock, with its sector columns attached.
stock_fin = dfs["stock_fin"].copy()
fin_data = stock_fin[stock_fin["Local Code"] == code].copy()
fin_data = pd.merge(fin_data, fin_list, how="left", on="Local Code")

In [None]:
# Daily prices of the selected stock, with its sector columns attached.
stock_price = dfs["stock_price"].copy()
price_data = stock_price[stock_price["Local Code"] == code].copy()
price_data = pd.merge(price_data, fin_list, how="left", on="Local Code")

In [None]:
# Mean of the headline result figures per (33-sector, report type).
# The value columns are selected BEFORE aggregating: calling .mean() on the
# whole frame also tries to average the non-numeric columns, which raises
# TypeError on pandas >= 2.0.
# NOTE: `fin_data` holds a single stock here, so each "sector mean" is the
# mean over that stock's own reports.
result_cols_33 = [
    "Result_FinancialStatement NetSales",
    "Result_FinancialStatement OperatingIncome",
    "Result_FinancialStatement OrdinaryIncome",
    "Result_FinancialStatement NetIncome",
]
fin_sector33_mean = (
    fin_data
    .groupby(["33 Sector(name)", "Result_FinancialStatement ReportType"])[result_cols_33]
    .mean()
    .add_suffix("_33mean")  # only the value columns are renamed; group keys keep their names
    .reset_index()
)

In [None]:
# Mean of the headline result figures per (17-sector, report type).
# The value columns are selected BEFORE aggregating: calling .mean() on the
# whole frame also tries to average the non-numeric columns, which raises
# TypeError on pandas >= 2.0.
# NOTE: `fin_data` holds a single stock here, so each "sector mean" is the
# mean over that stock's own reports.
result_cols_17 = [
    "Result_FinancialStatement NetSales",
    "Result_FinancialStatement OperatingIncome",
    "Result_FinancialStatement OrdinaryIncome",
    "Result_FinancialStatement NetIncome",
]
fin_sector17_mean = (
    fin_data
    .groupby(["17 Sector(name)", "Result_FinancialStatement ReportType"])[result_cols_17]
    .mean()
    .add_suffix("_17mean")  # only the value columns are renamed; group keys keep their names
    .reset_index()
)

In [None]:
# Attach the 17-sector means to every statement row.
# NOTE: this rebinds `fin_data`; re-running the cell merges the same columns
# again, so run it once per fresh `fin_data`.
fin_data = pd.merge(fin_data, fin_sector17_mean, how="left", on=["17 Sector(name)", "Result_FinancialStatement ReportType"])

In [None]:
# Attach the 33-sector means to every statement row.
# NOTE: this rebinds `fin_data`; re-running the cell merges the same columns
# again, so run it once per fresh `fin_data`.
fin_data = pd.merge(fin_data, fin_sector33_mean, how="left", on=["33 Sector(name)", "Result_FinancialStatement ReportType"])

## Backup: earlier version of `create_model`

In [None]:
def create_model(dfs, codes, label):
    """Backup variant: fit one LightGBM regressor on train+validation and score it on the test split.

    NOTE: this cell redefines `create_model` and returns TWO values. Once it
    has been executed, the earlier cell that unpacks three values
    (`model, train_X, model_feature = create_model(...)`) fails with
    ValueError until the first definition is run again.

    Args:
        dfs (dict)  : dict of pd.DataFrame include stock_fin, stock_price
        codes (list[int]): local codes of the listed companies to use
        label (str): prediction target label
    Returns:
        tuple: (model, train_X) - the fitted LGBMRegressor and the
        train+validation feature table after dropping unused columns.
    """
    # Build the features for every code.
    feature = pd.concat([get_features_for_predict(dfs, code) for code in codes])

    # Align features with the target and split by date.
    train_X, train_y, val_X, val_y, test_X, test_y = get_features_and_label(
            dfs, codes, feature, label
    )

    # Train on the train and validation splits together.
    train_X = pd.concat([train_X, val_X])
    train_y = pd.concat([train_y, val_y])

    # Columns that must not be used as features.
    drop_col = ["code", "Result_FinancialStatement FiscalYear",
                "Result_Dividend FiscalYear",
                "Result_FinancialStatement TotalAssets",
                "Result_FinancialStatement NetAssets",
                "Result_FinancialStatement Liability",
                "Result_FinancialStatement NetSales_17mean",
                "Result_FinancialStatement OperatingIncome_17mean",
                "Result_FinancialStatement OrdinaryIncome_17mean",
                "Result_FinancialStatement NetIncome_17mean",
                "Result_FinancialStatement NetSales_33mean",
                "Result_FinancialStatement OperatingIncome_33mean",
                "Result_FinancialStatement OrdinaryIncome_33mean",
                "Result_FinancialStatement NetIncome_33mean",
                "Forecast_FinancialStatement FiscalYear"]
    # errors="ignore": the *_17mean / *_33mean columns are not produced by
    # get_features_for_predict in this notebook, so a strict drop raised
    # KeyError before any model was trained.
    train_X = train_X.drop(drop_col, axis=1, errors="ignore")
    test_X = test_X.drop(drop_col, axis=1, errors="ignore")

    # Scores per candidate model (currently a single LightGBM regressor).
    spear = []
    rmse = []
    r2 = []
    # TRAIN_END = "2017-12-31" VAL_START = "2018-02-01" VAL_END = "2018-12-01" TEST_START = "2019-01-01"
    for mdl in [lgb.LGBMRegressor(random_state=0)]:
        model = mdl
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)

        spear.append(spearmanr(test_y, pred_y)[0])
        rmse.append(np.sqrt(mean_squared_error(test_y, pred_y)))
        r2.append(r2_score(test_y, pred_y))

    # print(spear)
    print("spear_means : " + str(statistics.mean(spear)))
    print("rmse_means : " + str(statistics.mean(rmse)))
    print("r2_means : " + str(statistics.mean(r2)))

    return model,train_X

## Prediction Execution

In [1]:
from predictor import ScoringService

In [2]:
# Load the dataset through ScoringService (imported from `predictor`, not shown here).
# NOTE: rebinds `inputs` and `dfs` from the exploration section above.
DATASET_DIR= "../input"
inputs = ScoringService.get_inputs(DATASET_DIR)
dfs = ScoringService.get_dataset(inputs)

In [3]:
# Get the target stock codes (rebinds `codes` from the exploration section).
codes = ScoringService.get_codes(dfs)

In [5]:
# Train one model per target label and save each under ../model/.
for label in ["label_high_20", "label_low_20"]:
    model = ScoringService.create_model(dfs=dfs, codes=codes, label=label)
    ScoringService.save_model(model=model, label=label, model_path="../model/")

HBox(children=(FloatProgress(value=0.0, max=3523.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3523.0), HTML(value='')))




In [6]:
# Define the target labels and load the saved models for them.
# `labels` is a set, so its iteration order is not guaranteed.
# The recorded run of this cell returned True.
labels = {"label_high_20", "label_low_20"}
ScoringService.get_model(model_path="../model/", labels=labels)

True

In [None]:
# Predict for the selected codes and labels; the result is shown as cell output.
ScoringService.predict(inputs=inputs, labels=labels, codes=codes)

In [None]:
# Called without arguments, so this relies on ScoringService's own defaults
# for the model path and labels - TODO confirm against predictor.py.
ScoringService.get_model()  # fetch the model

In [None]:
# Predict without passing `labels`; presumably falls back to ScoringService's
# default label set - TODO confirm against predictor.py.
ScoringService.predict(inputs=inputs, codes=codes) 

In [None]:
# Display the target codes used for the predictions above.
codes