## ProbSpace米国株式市場　将来株価予測 DT-SN LightGBM Base line(LB=0.03781)

In [None]:
# set configuration

import os

class Config:
    """Notebook-wide settings: data/output locations and the random seed."""

    # Seed handed to the random number generators for reproducibility.
    seed = 42

    # Base directory of the project data.
    # NOTE(review): the '/content/drive' prefix looks like a Colab Drive mount - confirm.
    root_path = '/content/drive/MyDrive/Probdata/stock'

    # Input CSV files are read directly from the base directory.
    input_path = root_path

    # Output sub-directories below the base directory.
    model_path = os.path.join(root_path, 'model')
    result_path = os.path.join(root_path, 'result')

In [None]:
# Create the output directories; an already existing directory is not an error.
# The loop variable is deliberately not called `dir`, which would shadow the
# builtin for the rest of the notebook session.
for out_dir in (Config.model_path, Config.result_path):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pylab as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

In [None]:
# Show up to 50 columns when a DataFrame is displayed.
# The fully qualified option name is required: the bare 'max_columns' pattern
# matches more than one option in recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 50)
# Apply the 'bmh' matplotlib style sheet to the plots drawn below.
plt.style.use('bmh')

In [None]:
def seed_everything(seed=2021):
    """Seed Python's and NumPy's global random generators with *seed*.

    The seed is also exported as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up, so
    setting it here presumably only affects child processes - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Fix all random generators to the seed configured for this notebook.
seed_everything(seed=Config.seed)

In [None]:
# Load the training data from the input directory and report (rows, columns).
train_csv = os.path.join(Config.input_path, 'train_data.csv')
train_df = pd.read_csv(train_csv)
print(train_df.shape)

In [None]:
# Render the raw training frame, then print its column dtypes / non-null counts.
display(train_df)
train_df.info()

In [None]:
# Report the total number of missing cells and the index labels of the rows
# that contain at least one missing value.
missing = train_df.isna()
print('NaNの合計:', missing.sum().sum())
print('NaNを含む行', train_df.index[missing.any(axis=1)].tolist())

In [None]:
# Parse the 'Date' column, drop every row that contains a missing value and
# use the parsed dates as the index.
train_df = (
    train_df
    .assign(Date=pd.to_datetime(train_df['Date']))
    .dropna()
    .set_index('Date')
)
display(train_df)
train_df.info()

#### 株価は、対数価格の増減が正規分布に従うそうですので、検証します。まずは、対数価格を見てみます。

In [None]:
# Replace every value with log1p(x) = log(1 + x), applied column by column.
# NOTE(review): train_df is overwritten, so re-running this cell applies the
# transform a second time.
train_df = train_df.apply(np.log1p)

In [None]:
# Plot a few sample tickers as line charts.

cols = ['VGSH', 'JEF', 'IVR']
train_df.loc[:, cols].plot.line(figsize=(15, 5))

In [None]:
# One 100-bin histogram per sample ticker, each in its own subplot.
train_df[cols].plot.hist(subplots=True, bins=100, figsize=(15, 10))

In [None]:
# Author's observation: the log prices themselves deviate from a normal distribution.

def qqplot(dist):
    """Show a normal probability (Q-Q) plot of the sample *dist*.

    A new 5x5 inch figure is opened, the sample quantiles are drawn against
    the quantiles of the normal distribution, and the figure is shown.
    """
    plt.figure(figsize=(5, 5))
    stats.probplot(x=dist, dist='norm', plot=plt)
    plt.show()

# Draw one Q-Q plot per sample ticker.
for _, series in train_df[cols].items():
    qqplot(series)

In [None]:
# Plot the change from the previous row (one week earlier, per the original
# note); the undefined first difference is replaced by 0.

weekly_change = train_df[cols].diff().fillna(0)
weekly_change.plot.line(figsize=(15, 5))

In [None]:
# 100-bin histograms of the row-to-row changes, one subplot per ticker.
weekly_change = train_df[cols].diff().fillna(0)
weekly_change.plot.hist(subplots=True, bins=100, figsize=(15, 10))

In [None]:
# Author's observation: the differences are closer to a normal distribution.

for _, change in train_df[cols].diff().fillna(0).items():
    qqplot(change)

In [None]:
# The variance differs per ticker, so plot the changes divided by each
# ticker's own standard deviation.

weekly_change = train_df[cols].diff().fillna(0)
scaled_change = weekly_change.div(weekly_change.std(), axis='columns')
scaled_change.plot.line(figsize=(15, 5))

In [None]:
# Histograms of the changes after dividing each ticker by its standard deviation.
weekly_change = train_df[cols].diff().fillna(0)
scaled_change = weekly_change.div(weekly_change.std(), axis='columns')
scaled_change.plot.hist(subplots=True, bins=100, figsize=(15, 10))

In [None]:
# Average value per calendar year: mean of the per-ticker means over the rows
# of that year (train_df holds log1p-transformed values once the transform
# cell has been run).

years = range(2012, 2019 + 1)
df = pd.Series(
    [train_df.loc[train_df.index.year == year].mean().mean() for year in years],
    index=years,
    dtype=np.float64,
)
df.plot(figsize=(15, 10))

In [None]:
# Average value per calendar month (1-12): mean of the per-ticker means over
# all rows falling in that month, regardless of year.

months = range(1, 12 + 1)
df = pd.Series(
    [train_df.loc[train_df.index.month == month].mean().mean() for month in months],
    index=months,
    dtype=np.float64,
)
df.plot(figsize=(15, 10))

In [None]:
# Average value per ISO week number: mean of the per-ticker means over all
# rows falling in that week, regardless of year.
# ISO week numbers run from 1 to 53, so week 53 is included; if no row falls
# in a given week its entry stays NaN.

# The ISO week of every row is loop-invariant, so compute it once.
iso_week = train_df.index.isocalendar().week
df = pd.Series(index=range(1, 53 + 1), dtype=np.float64)
for w in df.index:
    df[w] = train_df.loc[iso_week == w].mean().mean()
df.plot(figsize=(15, 10))

In [None]:
# Load company_list.csv and rename its 'Symbol' column to 'id'.

company_csv = os.path.join(Config.input_path, 'company_list.csv')
company_df = pd.read_csv(company_csv)
company_df = company_df.rename(columns={'Symbol': 'id'})
print(company_df.shape)

In [None]:
# Render the company table, then print its column dtypes / non-null counts.

display(company_df)
company_df.info()

In [None]:
# Tickers that are columns of train_df but have no row in company_df.

known_ids = set(company_df['id'])
not_exist = [ticker for ticker in train_df.columns if ticker not in known_ids]
print(not_exist)

In [None]:
# For now, add a placeholder row (only 'id' set, every other column NaN) for
# each ticker that is missing from company_df.
# DataFrame.append was removed in pandas 2.0, so all placeholder rows are
# added with a single concat; nothing is done when no ticker is missing, which
# leaves the existing index untouched as the original loop did.

if not_exist:
    company_df = pd.concat(
        [company_df, pd.DataFrame({'id': not_exist})],
        ignore_index=True,
    )

In [None]:
# company_df has more rows than needed: keep only tickers that are columns
# of train_df, then print the remaining row count.

in_train = company_df['id'].isin(train_df.columns)
company_df = company_df.loc[in_train]
print(company_df.shape[0])

In [None]:
# Show every row whose 'id' occurs more than once, sorted by 'id'.

is_duplicated = company_df.duplicated(subset='id', keep=False)
company_df.loc[is_duplicated].sort_values(by='id')

In [None]:
# Author's observation: an id is duplicated at most twice and the duplicates
# differ only in 'List', so 'List' is split into two columns: List1 takes the
# first 'List' value of each id and List2 the last one.
#
# The new columns are built with assign() on a fresh frame instead of being
# assigned into the boolean-filtered company_df, which would raise a
# SettingWithCopyWarning. iloc (not 'first'/'last') is used so that a missing
# value is kept as-is rather than skipped.

list_by_id = company_df.groupby('id')['List']
company_df = (
    company_df
    .assign(
        List1=list_by_id.transform(lambda s: s.iloc[0]),
        List2=list_by_id.transform(lambda s: s.iloc[-1]),
    )
    .drop('List', axis=1)
    .drop_duplicates(subset='id')  # keep one row per id
    .reset_index(drop=True)
)
display(company_df)
company_df.info()

In [None]:
# Number of tickers per Sector; a missing Sector is labelled 'nothing'.
# The filled column is assigned back explicitly: an in-place fillna on
# company_df['Sector'] is a chained assignment, which is deprecated and does
# not update company_df when pandas Copy-on-Write is active.

company_df['Sector'] = company_df['Sector'].fillna('nothing')
company_df['Sector'].value_counts().plot(kind='bar', figsize=(15,10))

In [None]:
# Per-date average over the tickers of each Sector, one line per Sector.
# Columns follow the order of value_counts() (most frequent Sector first).

sector_names = company_df['Sector'].value_counts().index
tmp_df = pd.DataFrame(
    {
        name: train_df[company_df.loc[company_df['Sector'] == name, 'id']].mean(axis=1)
        for name in sector_names
    },
    columns=sector_names,
)
tmp_df.plot(figsize=(15,10))

In [None]:
# Number of tickers per Industry (ten most frequent); a missing Industry is
# labelled 'nothing'.
# The filled column is assigned back explicitly: an in-place fillna on
# company_df['Industry'] is a chained assignment, which is deprecated and does
# not update company_df when pandas Copy-on-Write is active.

company_df['Industry'] = company_df['Industry'].fillna('nothing')
company_df['Industry'].value_counts().head(10).plot(kind='bar', figsize=(15,10))

In [None]:
# Per-date average over the tickers of each of the ten most frequent
# Industry values, one line per Industry.

industry_names = company_df['Industry'].value_counts().index[:10]
tmp_df = pd.DataFrame(
    {
        name: train_df[company_df.loc[company_df['Industry'] == name, 'id']].mean(axis=1)
        for name in industry_names
    },
    columns=industry_names,
)
tmp_df.plot(figsize=(15,10))

In [None]:
# Number of tickers per List1 value; a missing List1 is labelled 'nothing'.
# The filled column is assigned back explicitly: an in-place fillna on
# company_df['List1'] is a chained assignment, which is deprecated and does
# not update company_df when pandas Copy-on-Write is active.

company_df['List1'] = company_df['List1'].fillna('nothing')
company_df['List1'].value_counts().plot(kind='bar', figsize=(15,10))

In [None]:
# Per-date average over the tickers of each List1 value, one line per value.

list_names = company_df['List1'].value_counts().index
tmp_df = pd.DataFrame(
    {
        name: train_df[company_df.loc[company_df['List1'] == name, 'id']].mean(axis=1)
        for name in list_names
    },
    columns=list_names,
)
tmp_df.plot(figsize=(15,10))