# フロー
1. 変数の定義
1. 過去のCSVファイルからデータをロードする
1. 前処理モジュールの定義（当日の上昇率、過去からの上昇率、下降率）
1. CSVロードして、並び替え、必要な日付からにする
1. 前処理部分(現在の値と、過去のからの上昇率、下降率を記したDFを作成)
1. 前処理データの保存

In [20]:
import pandas as pd
import datetime
import pickle

class Make_data():
    
    pram_predict_day = 3  # 1日後の予測をする
    pd.set_option('display.max_columns', 100)
    
    def load(self):
        df = pd.read_pickle('data/stock.pkl')
        
        # MultiIndex であればフラット化
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.get_level_values(0)
        # 列がタプルで構成されている場合、その最初の要素を採用してフラット化
        elif all(isinstance(col, tuple) for col in df.columns):
            df.columns = [col[0] for col in df.columns]
        
        # 重複列を削除
        df = df.loc[:, ~df.columns.duplicated()]
        
        # 列名を出力して確認
        print("Columns after load:", df.columns.tolist())
        
        return df
    
    def Deviation_value(self, df_tmp, x):
        std = df_tmp.std()['Open']
        mean = df_tmp.mean()['Open']
        deviation = (mean - x) * 10 / std + 50
        return deviation

    def main(self, start_date):
        if start_date == "":
            start_date = '2000-01-01'
        else:
            self.start_date = start_date

        df = self.load()

        # 再度重複列を削除（念のため）
        df = df.loc[:, ~df.columns.duplicated()]

        # 必要な列が存在するか確認
        required_columns = ['Date', 'Close', 'Open', 'dow_compare']
        for col in required_columns:
            if col not in df.columns:
                raise KeyError(f"必要な列 '{col}' がデータに存在しません。列名を確認してください。")

        df = df[df['Date'] >= start_date]
        df = df.reset_index(drop=True)
        df_len = len(df)

        index_date  = df.columns.get_loc('Date')
        index_close = df.columns.get_loc('Close')
        index_open  = df.columns.get_loc('Open')
        index_dow   = df.columns.get_loc('dow_compare')

        # 日毎の90日前からの下落率を90日間計算
        columns = ["day+" + str(x+1) for x in range(90)]
        for i, col_name in enumerate(columns):
            df[col_name] = df['Open'].pct_change(periods=(i+1)) * 100

        # 偏差値の計算
        for i in range(df_len):
            day0_open = df.iloc[i, index_open]
            if i > 90:
                df_tail = df.iloc[i-90:i, :]
                Deviation30 = self.Deviation_value(df_tail.tail(30), day0_open)
                Deviation60 = self.Deviation_value(df_tail.tail(60), day0_open)
                Deviation90 = self.Deviation_value(df_tail.tail(90), day0_open)
            if i % 100 == 0:
                print("進捗: {:.1%}".format(i/df_len), end=" ")

        # 結果の計算
        df['result_1day'] = df['Close'].pct_change(periods=-1*self.pram_predict_day) * 100 * (-1)
        df['result_1day'] = df['result_1day'].apply(lambda x: 1 if x < 0 else 0)

        df['result_3day'] = df['Close'].pct_change(periods=-3*self.pram_predict_day) * 100 * (-1)
        df['result_3day'] = df['result_3day'].apply(lambda x: 1 if x < 0 else 0)

        df['result_5day'] = df['Close'].pct_change(periods=-5*self.pram_predict_day) * 100 * (-1)
        df['result_5day'] = df['result_5day'].apply(lambda x: 1 if x < 0 else 0)

        df['result_10day'] = df['Close'].pct_change(periods=-10*self.pram_predict_day) * 100 * (-1)
        df['result_10day'] = df['result_10day'].apply(lambda x: 1 if x < 0 else 0)

        df['result_20day'] = df['Close'].pct_change(periods=-20*self.pram_predict_day) * 100 * (-1)
        df['result_20day'] = df['result_20day'].apply(lambda x: 1 if x < 0 else 0)

        df['result_30day'] = df['Close'].pct_change(periods=-30*self.pram_predict_day) * 100 * (-1)
        df['result_30day'] = df['result_30day'].apply(lambda x: 1 if x < 0 else 0)

        df['result_1day_open'] = df['Open'].pct_change(periods=-1*self.pram_predict_day) * 100 * (-1)
        df['result_1day_open'] = df['result_1day_open'].apply(lambda x: 1 if x < 0 else 0)

        df['result_3day_open'] = df['Open'].pct_change(periods=-3*self.pram_predict_day) * 100 * (-1)
        df['result_3day_open'] = df['result_3day_open'].apply(lambda x: 1 if x < 0 else 0)

        df['result_5day_open'] = df['Open'].pct_change(periods=-5*self.pram_predict_day) * 100 * (-1)
        df['result_5day_open'] = df['result_5day_open'].apply(lambda x: 1 if x < 0 else 0)

        df['result_10day_open'] = df['Open'].pct_change(periods=-10*self.pram_predict_day) * 100 * (-1)
        df['result_10day_open'] = df['result_10day_open'].apply(lambda x: 1 if x < 0 else 0)

        df['result_20day_open'] = df['Open'].pct_change(periods=-20*self.pram_predict_day) * 100 * (-1)
        df['result_20day_open'] = df['result_20day_open'].apply(lambda x: 1 if x < 0 else 0)

        df['result_30day_open'] = df['Open'].pct_change(periods=-30*self.pram_predict_day) * 100 * (-1)
        df['result_30day_open'] = df['result_30day_open'].apply(lambda x: 1 if x < 0 else 0)

        df = df.dropna()

        if start_date <= '2000-01-01': 
            df.to_pickle('data/stock_preprocessing_20201017.pkl')
            print("【完了】データ保存")


In [21]:
df_test = pd.DataFrame({'price':  range(120, 110,1)})
pram_predict_day=1                   # 1日後の予測をする
df_test['result']=df_test['price'].pct_change(periods=-1*pram_predict_day)*100*(-1)#デフォルトは1で1行前
df_test.tail(3)

Unnamed: 0,price,result


In [22]:
%%time
start_date="2020-09-01"
m=Make_data()
m.main("")

Columns after load: ['Date', 'Open', 'Close', 'dow_compare']
進捗: 0.0% 進捗: 1.7% 進捗: 3.4% 進捗: 5.1% 進捗: 6.8% 進捗: 8.4% 進捗: 10.1% 進捗: 11.8% 進捗: 13.5% 進捗: 15.2% 進捗: 16.9% 進捗: 18.6% 進捗: 20.3% 進捗: 22.0% 進捗: 23.6% 進捗: 25.3% 進捗: 27.0% 進捗: 28.7% 進捗: 30.4% 進捗: 32.1% 進捗: 33.8% 進捗: 35.5% 進捗: 37.2% 進捗: 38.9% 進捗: 40.5% 進捗: 42.2% 進捗: 43.9% 進捗: 45.6% 進捗: 47.3% 進捗: 49.0% 進捗: 50.7% 進捗: 52.4% 進捗: 54.1% 進捗: 55.7% 進捗: 57.4% 進捗: 59.1% 進捗: 60.8% 進捗: 62.5% 進捗: 64.2% 進捗: 65.9% 進捗: 67.6% 進捗: 69.3% 進捗: 70.9% 進捗: 72.6% 進捗: 74.3% 進捗: 76.0% 進捗: 77.7% 進捗: 79.4% 進捗: 81.1% 進捗: 82.8% 進捗: 84.5% 進捗: 86.1% 進捗: 87.8% 進捗: 89.5% 進捗: 91.2% 進捗: 92.9% 進捗: 94.6% 進捗: 96.3% 進捗: 98.0% 進捗: 99.7% 

  df['result_5day_open'] = df['Open'].pct_change(periods=-5*self.pram_predict_day) * 100 * (-1)
  df['result_10day_open'] = df['Open'].pct_change(periods=-10*self.pram_predict_day) * 100 * (-1)
  df['result_20day_open'] = df['Open'].pct_change(periods=-20*self.pram_predict_day) * 100 * (-1)
  df['result_30day_open'] = df['Open'].pct_change(periods=-30*self.pram_predict_day) * 100 * (-1)


【完了】データ保存
CPU times: user 1min 6s, sys: 1.37 s, total: 1min 7s
Wall time: 1min 12s
