In [2]:
import pandas as pd
import numpy as np

In [5]:
# Load the preprocessed stock-price training table; the relative path is resolved
# against the kernel's current working directory.
dfp = pd.read_csv('../../data/preprocessed/stock_prices_train.csv')

In [8]:
# Display the ma_60days column of the loaded table
# (presumably a 60-day moving average produced during preprocessing — TODO confirm).
dfp["ma_60days"]

0            0.000000
1            0.000000
2            0.000000
3            0.000000
4            0.000000
              ...    
2332526    807.783333
2332527    805.033333
2332528    802.300000
2332529    799.333333
2332530    796.516667
Name: ma_60days, Length: 2332531, dtype: float64

In [15]:
from decimal import *
def generate_adjusted_close(df):
    """
    Compute CumulativeAdjustmentFactor and AdjustedClose for one security.

    Args:
        df (pd.DataFrame)  : stock_price rows for a single SecuritiesCode; the
            columns "Date", "AdjustmentFactor" and "Close" are read
    Returns:
        df (pd.DataFrame): a new frame sorted by ascending Date with the
            "CumulativeAdjustmentFactor" and "AdjustedClose" columns added
    """
    def _round_to_tenth(value):
        # Go through str() so Decimal sees the printed value, then round
        # half-up to one decimal place and hand back a plain float.
        tenths = Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        return float(tenths)

    # Newest row first, so the cumulative product at each row covers the
    # adjustment factors of that row and of every later date.
    newest_first = df.sort_values("Date", ascending=False)
    running_factor = newest_first["AdjustmentFactor"].cumprod()
    newest_first["CumulativeAdjustmentFactor"] = running_factor
    newest_first["AdjustedClose"] = (
        running_factor * newest_first["Close"]
    ).map(_round_to_tenth)

    # Back to chronological order.
    oldest_first = newest_first.sort_values("Date")

    # A zero AdjustedClose is treated as missing and then filled with the
    # most recent earlier value.
    adjusted = oldest_first["AdjustedClose"]
    oldest_first["AdjustedClose"] = adjusted.mask(adjusted == 0).ffill()
    return oldest_first

def adjust_price(price):
    """
    Add AdjustedClose to every security in a price table.

    Each SecuritiesCode is processed independently by generate_adjusted_close
    and the per-security results are stacked back together.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must contain
            "SecuritiesCode" and "Date" plus the columns that
            generate_adjusted_close reads
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose,
            ordered by SecuritiesCode then Date, with a fresh 0..n-1 index.
            The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's object is never touched.
    ordered = price.sort_values(["SecuritiesCode", "Date"])

    # Iterate over the groups explicitly instead of groupby(...).apply(...):
    # apply() operating on the grouping column is deprecated since pandas 2.2,
    # whereas each group yielded here always keeps its SecuritiesCode column.
    adjusted_groups = [
        generate_adjusted_close(group)
        for _, group in ordered.groupby("SecuritiesCode")
    ]

    if not adjusted_groups:
        # No groups (e.g. empty input): pd.concat would raise on an empty list.
        return ordered.reset_index(drop=True)

    return pd.concat(adjusted_groups).reset_index(drop=True)
# NOTE(review): `df` is not created in any cell visible here (only `dfp` is loaded) —
# confirm which cell defines it before relying on Restart & Run All.
df = adjust_price(df)

In [19]:
def add_return(df, col, period):
    """
    Add a per-security percentage-change column to df and return the frame.

    Args:
        df (pd.DataFrame): frame containing "SecuritiesCode" and `col`;
            it is modified in place
        col (str): name of the column the change is computed on
        period (int): number of rows to look back within each SecuritiesCode
    Returns:
        pd.DataFrame: the same `df` object, now holding the new
            f"return_{period}days" column
    """
    df.loc[:, f"return_{period}days"] = df.groupby("SecuritiesCode")[col].pct_change(period)
    # Return the frame so that `result = add_return(...)` is not bound to None.
    return df
# Compute the 40-row percentage change of AdjustedClose per SecuritiesCode;
# add_return writes the new return_40days column into df in place.
returns = add_return(df,"AdjustedClose",40)

In [20]:
# Display df to inspect its current columns.
df

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,return_5days,return_60days,CumulativeAdjustmentFactor,AdjustedClose,return_40days
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.000730,,,1.0,2742.0,
1,20170105_1301,2017-01-05,1301,2743.0,2747.0,2735.0,2738.0,17900,1.0,,False,0.002920,,,1.0,2738.0,
2,20170106_1301,2017-01-06,1301,2734.0,2744.0,2720.0,2740.0,19900,1.0,,False,-0.001092,,,1.0,2740.0,
3,20170110_1301,2017-01-10,1301,2745.0,2754.0,2735.0,2748.0,24200,1.0,,False,-0.005100,,,1.0,2748.0,
4,20170111_1301,2017-01-11,1301,2748.0,2752.0,2737.0,2745.0,9300,1.0,,False,-0.003295,,,1.0,2745.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211129_9997,2021-11-29,9997,678.0,679.0,665.0,668.0,320800,1.0,,False,0.026987,-0.067039,-0.187348,1.0,668.0,-0.217799
2332527,20211130_9997,2021-11-30,9997,670.0,689.0,667.0,667.0,296300,1.0,,False,-0.001460,-0.063202,-0.198317,1.0,667.0,-0.205006
2332528,20211201_9997,2021-12-01,9997,661.0,688.0,660.0,685.0,339100,1.0,,False,0.017544,-0.025605,-0.193168,1.0,685.0,-0.183552
2332529,20211202_9997,2021-12-02,9997,681.0,692.0,680.0,684.0,342900,1.0,,False,0.014368,-0.031161,-0.206497,1.0,684.0,-0.162791


In [13]:
# Lookup table that collapses the detailed sector labels (keys) into a smaller
# set of sector labels (values). Labels that are kept as-is map to themselves.
limit_sectors = {
    "Foods": "Retail trade",
    "-": "-",
    "Commercial & wholesale trade": "Retail trade",
    "Construction & materials": "Construction & materials",
    "Steel & nonferrous metals": "Construction & materials",
    "Energy resources": "Energy resources",
    "It & services, others": "It & services, others",
    "Machinery": "Machinery",
    "Pharmaceutical": "Pharmaceutical",
    "Real estate": "Real estate",
    "Transportation & logistics": "Transportation & logistics",
    # Note: this key uses full-width parentheses.
    "Financials （ex banks）": "Banks",
    "Retail trade": "Retail trade",
    "Raw materials & chemicals": "Construction & materials",
    "Electric appliances & precision instruments": "Machinery",
    "Automobiles & transportation equipment": "Transportation & logistics",
    "Banks": "Banks",
    "Electric power & gas": "Energy resources",
}

In [9]:
# Count how many rows carry each Sector label.
# NOTE(review): `external_features` is not defined in any cell visible here —
# confirm where it is built.
external_features["Sector"].value_counts()

It & services, others                          1188
-                                               500
Retail trade                                    359
Commercial & wholesale trade                    320
Construction & materials                        316
Electric appliances & precision instruments     305
Raw materials & chemicals                       294
Machinery                                       230
Real estate                                     143
Foods                                           137
Transportation & logistics                      119
Automobiles & transportation equipment          114
Financials （ex banks）                            97
Banks                                            93
Steel & nonferrous metals                        80
Pharmaceutical                                   77
Electric power & gas                             25
Energy resources                                 20
Name: Sector, dtype: int64

In [14]:
def _zscore(values):
    """Standardize a numeric Series: subtract its mean, then divide by its standard deviation."""
    return (values - values.mean()) / values.std()


stock_list = pd.read_csv('../../data/stock_list.csv')

# Normalize the sector labels: strip trailing whitespace and capitalize.
# str.capitalize already lower-cases everything after the first character, so
# no separate lower() pass is needed; the .str accessor also passes missing
# values through as NaN instead of raising like a per-element str method call.
stock_list["Sector"] = stock_list['17SectorName'].str.rstrip().str.capitalize()

# Standardized (z-score) versions of market capitalization and share count.
stock_list["Capitalization"] = _zscore(stock_list['MarketCapitalization'])
stock_list["Shares"] = _zscore(stock_list['NumberOfShares'])