In [2]:
import pandas as pd
import numpy as np
from datetime import date
import warnings
from tqdm.auto import tqdm
import joblib

In [2]:
def build_weekly_group(df):
    # index情報から、(year, week)の情報を得る。
    return pd.Series(list(zip(df.index.isocalendar().year, df.index.isocalendar().week)), index=df.index)

In [3]:
def calc_weekly_return(x):
    CLOSE = "EndOfDayQuote ExchangeOfficialClose"
    OPEN = "EndOfDayQuote Open"
    wr = (x[CLOSE].iloc[-1] - x[OPEN].iloc[0]) / x[OPEN].iloc[0]
    try:
        dt = x.loc[x.index.dayofweek==4].reset_index().at[0, "datetime"]
    except KeyError:
        return

    return pd.DataFrame({"weekly_return": [wr]}, index=[dt])

In [4]:
def extract_fwd_return(df):
    weekly_group = build_weekly_group(df)
    weekly_return = df.groupby(weekly_group).apply(calc_weekly_return)
    weekly_fwd_return = weekly_return.shift(-1).dropna()
    return weekly_fwd_return.reset_index(0)["weekly_return"]

In [11]:
def build_objective(df, code):
    ob_df = df.fillna(0)
    
    weekly_fwd_return = extract_fwd_return(df)
    
    ob_df = ob_df.resample("B").ffill()
    ob_df = ob_df.loc[ob_df.index.dayofweek == 4]  # 4 is friday
    
    ob_df["objective"] = weekly_fwd_return
    ob_df.dropna(inplace=True)
    ob_df = ob_df.replace([np.inf, -np.inf], 0)
    
    ob_df["code"] = code
    return ob_df

In [6]:
price_df = pd.read_csv("../data/stock_price.csv.gz")
list_df = pd.read_csv("../data/stock_list.csv.gz")

In [7]:
codes = list_df[list_df["universe_comp2"] == True]["Local Code"].values

In [8]:
# stock.py Stock.preprocessで行っている置換
price_df.loc[:, "datetime"] = pd.to_datetime(price_df.loc[:, "EndOfDayQuote Date"])
price_df.set_index("datetime", inplace=True)

In [13]:
warnings.simplefilter("ignore")

In [14]:
results = joblib.Parallel(n_jobs=-1)(
    joblib.delayed(build_objective)(df, code) for df, code in zip(
        (price_df[price_df["Local Code"] == c].copy() for c in codes),
        (c for c in codes)
    )
)

    
objective_df = pd.concat(results)
objective_df.to_csv("stock_weekly_fwd_return.csv")

In [30]:
objective_df.loc[:, ["objective", "code"]].to_csv("../data/stock_weekly_fwd_return_slim.csv")

In [27]:
df = pd.read_csv("stock_weekly_fwd_return_slim.csv", index_col="datetime", parse_dates=True)

In [4]:
objective_df = pd.read_csv("stock_weekly_fwd_return.csv")
objective_df[objective_df["code"] == 4365]

Unnamed: 0,datetime,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP,objective,code
132248,2016-01-08,4365,2016/01/08,0.0,0.0,0.0,0.0,9900.0,0.0,1.0,9900.0,2016/01/07,9900.0,2016/01/07,0.0,0.000,0.000,0.000000,4365
132249,2016-01-15,4365,2016/01/15,9410.0,9700.0,9410.0,9700.0,9700.0,500.0,1.0,9700.0,2016/01/12,9700.0,2016/01/12,0.0,0.000,9476.000,0.067888,4365
132250,2016-01-22,4365,2016/01/22,0.0,0.0,0.0,0.0,9910.0,0.0,1.0,9910.0,2016/01/21,9910.0,2016/01/21,0.0,0.000,0.000,0.010526,4365
132251,2016-01-29,4365,2016/01/29,0.0,0.0,0.0,0.0,9600.0,0.0,1.0,9600.0,2016/01/27,9600.0,2016/01/27,0.0,0.000,0.000,0.000000,4365
132252,2016-02-05,4365,2016/02/05,0.0,0.0,0.0,0.0,9750.0,0.0,1.0,9750.0,2016/02/04,9750.0,2016/02/04,0.0,0.000,0.000,-0.076923,4365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132491,2020-11-20,4365,2020/11/20,9690.0,9690.0,9650.0,9650.0,9650.0,2000.0,1.0,9690.0,2020/11/19,9690.0,2020/11/19,-40.0,-0.413,9655.000,0.005176,4365
132492,2020-11-27,4365,2020/11/27,9800.0,9800.0,9710.0,9710.0,9710.0,300.0,1.0,9700.0,2020/11/26,9700.0,2020/11/26,10.0,0.103,9770.000,0.010288,4365
132493,2020-12-04,4365,2020/12/04,10020.0,10020.0,9820.0,9820.0,9820.0,300.0,1.0,9890.0,2020/12/03,9890.0,2020/12/03,-70.0,-0.708,9886.667,0.000000,4365
132494,2020-12-11,4365,2020/12/11,0.0,0.0,0.0,0.0,9840.0,0.0,1.0,9840.0,2020/12/09,9840.0,2020/12/09,0.0,0.000,0.000,0.012195,4365


In [29]:
df.index

DatetimeIndex(['2016-01-08', '2016-01-15', '2016-01-22', '2016-01-29',
               '2016-02-05', '2016-02-12', '2016-02-19', '2016-02-26',
               '2016-03-04', '2016-03-11',
               ...
               '2020-10-16', '2020-10-23', '2020-10-30', '2020-11-06',
               '2020-11-13', '2020-11-20', '2020-11-27', '2020-12-04',
               '2020-12-11', '2020-12-18'],
              dtype='datetime64[ns]', name='datetime', length=419771, freq=None)