<a href="https://colab.research.google.com/github/dsk-yshkw/DataDrivenFinance/blob/main/ch08_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ロジスティック回帰

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## yfinanceからデータを取得する場合

In [None]:
!pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting requests>=2.26
  Downloading requests-2.28.0-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 557 kB/s 
Collecting lxml>=4.5.1
  Downloading lxml-4.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 8.2 MB/s 
Installing collected packages: requests, lxml, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behav

In [None]:
from pandas_datareader import data as pdr

import yfinance as yf
# NOTE(review): presumably patches pandas_datareader so that the
# `pdr.get_data_yahoo` call used below is served by yfinance — confirm
# against the installed yfinance version.
yf.pdr_override() 

In [None]:
# Symbols to download: the ^N225 index and the individual ticker being modelled.
target = '9983.T'
symbols = ('^N225',target)

# Fetch the 'Adj Close' series of every symbol, then join them column-wise
# in a single concat instead of growing a DataFrame from an empty frame.
closes = []
for symbol in symbols:
    data = pdr.get_data_yahoo(symbol,
                              start="2016-03-30",
                              end="2021-03-30")['Adj Close']
    data.name = symbol
    closes.append(data)
df = pd.concat(closes, axis=1)

# Replace the timestamps with 'YYYY-MM-DD' strings by assigning a new index.
# An Index is meant to be immutable, so writing into `df.index.values`
# element by element is not a supported way to change it.
df.index = [str(ts.date()) for ts in df.index]
df.index = df.index.rename('Date')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


## CSVからのデータ取得

In [None]:
# Option 1: the zip downloaded from GitHub was extracted and the notebook is
# run locally, next to the CSV file.
df = pd.read_csv("ch08_stock_price.csv", index_col="Date")

# Option 2: Google Colab, with the CSV file placed in the sample_data folder.
#df = pd.read_csv("sample_data/ch08_stock_price.csv",index_col = 'Date')

# Option 3: Google Colab together with Google Drive, with the CSV file placed
# in a sub-folder under MyDrive such as the one below.
#df = pd.read_csv("drive/MyDrive/Colab Notebooks/Kyoritsu/ch08_stock_price.csv",index_col = 'Date')



## データの整理

In [None]:
# Discard every row (date) that has a missing value in any column,
# then display the remaining frame.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0_level_0,^N225,9983.T
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-03-30,16878.960938,34949.539062
2016-03-31,16758.669922,34489.929688
2016-04-01,16164.160156,32622.761719
2016-04-04,16123.269531,31990.794922
2016-04-05,15732.820312,30659.839844
...,...,...
2021-03-24,28405.519531,83740.000000
2021-03-25,28729.880859,84900.000000
2021-03-26,29176.699219,85650.000000
2021-03-29,29384.519531,86390.000000


In [None]:
# One-day percentage changes of both price series; the first row has no
# previous price to compare with and is removed by dropna().
df_rate = pd.DataFrame({
    '^N225 1 day return': df['^N225'].pct_change(),
    '9983 1 day return': df['9983.T'].pct_change(),
}).dropna()

In [None]:
# shift(-1) moves the NEXT row's return onto the current row, so `diff` is the
# following day's 9983 return minus the following day's ^N225 return.
next_9983 = df_rate['9983 1 day return'].shift(-1)
next_n225 = df_rate['^N225 1 day return'].shift(-1)
df_rate['diff'] = next_9983 - next_n225

# Label is 1 when that next-day difference is positive, 0 otherwise.
df_rate['target'] = df_rate['diff'].gt(0).astype(int)

# The final row has no following day (diff is NaN) and is dropped here.
df_rate = df_rate.dropna()
df_rate

Unnamed: 0_level_0,^N225 1 day return,9983 1 day return,diff,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-03-31,-0.007127,-0.013151,-0.018662,0
2016-04-01,-0.035475,-0.054137,-0.016842,0
2016-04-04,-0.002530,-0.019372,-0.017388,0
2016-04-05,-0.024217,-0.041604,-0.010758,0
2016-04-06,-0.001110,-0.011867,-0.038540,0
...,...,...,...,...
2021-03-23,-0.006109,-0.001036,-0.014892,0
2021-03-24,-0.020361,-0.035253,0.002433,1
2021-03-25,0.011419,0.013852,-0.006718,0
2021-03-26,0.015552,0.008834,0.001517,1


In [None]:
# Single feature (the stock's own one-day return) and the binary label.
X, y = df_rate['9983 1 day return'], df_rate['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fraction of rows held out for validation.
val_size = 0.2

# shuffle=False: the rows are split without shuffling.
split_parts = train_test_split(X, y, test_size=val_size, shuffle=False)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Sum of the positive labels divided by the number of training rows; with
# 0/1 labels this is the share of class 1 in the training set.
positive_labels = y_train[y_train > 0]
print(np.sum(positive_labels) / len(y_train))

0.4948770491803279


## ロジスティック回帰の実装


In [None]:
from sklearn import linear_model

In [None]:
# The single feature is a 1-D Series; reshape(-1, 1) turns it into a
# one-column 2-D array before it is passed to fit().
X_train_2d = X_train.values.reshape(-1, 1)

log_reg = linear_model.LogisticRegression(penalty='none')
log_reg.fit(X_train_2d, y_train.values)

LogisticRegression(penalty='none')

In [None]:
# Fitted slope(s) first, then the intercept, one per line.
print(log_reg.coef_, log_reg.intercept_, sep='\n')

[[-4.21447895]]
[-0.01861724]


In [None]:
# Class probabilities for one hypothetical sample whose feature value is 0.3.
sample = [[0.3]]
log_reg.predict_proba(sample)

array([[0.78295337, 0.21704663]])

In [None]:
# Predicted class label for the same hypothetical feature value of 0.3.
sample = [[0.3]]
log_reg.predict(sample)

array([0])

In [None]:
# Predicted class label for a hypothetical feature value of -0.5.
negative_sample_pred = log_reg.predict([[-0.5]])
print(negative_sample_pred)


[1]


In [None]:
# In-sample predictions; the 1-D feature is reshaped to one column, matching
# the shape that was used when the model was fitted.
X_train_2d = X_train.values.reshape(-1, 1)
y_train_pred = log_reg.predict(X_train_2d)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the in-sample predictions against the training labels.
train_accuracy = accuracy_score(y_train, y_train_pred)
print('accuracy score for train data', train_accuracy)

accuracy score for train data 0.5266393442622951


## 日経平均のリターンも特徴量に加える

In [None]:
# Every column except the label and the `diff` column it was derived from
# is used as a feature.
feature_cols = [col for col in df_rate.columns
                if col != 'diff' and col != 'target']
X = df_rate[feature_cols]
y = df_rate['target']

In [None]:
# Fraction of rows held out for validation.
val_size = 0.2

# shuffle=False: the rows are split without shuffling.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=val_size, shuffle=False)

# Refit the logistic regression on the wider feature set.
log_reg = linear_model.LogisticRegression(penalty='none')
log_reg.fit(X_train, y_train)
y_train_pred = log_reg.predict(X_train)

train_accuracy = accuracy_score(y_train, y_train_pred)
print('accuracy score for train data', train_accuracy)


accuracy score for train data 0.5204918032786885


## さまざまなインターバルのリターンを特徴量に加える

In [None]:
# Return horizons in trading days: every value from 2 to 20, then every
# 20th value from 40 to 240.
diffs = list(range(2, 21)) + list(range(40, 241, 20))

# Add the N-day percentage change of both price series for every horizon.
# The new columns are aligned to df_rate by its index.
for diff in diffs:
    df_rate[f'^N225 {diff} days return'] = df['^N225'].pct_change(diff)
    df_rate[f'9983 {diff} days return'] = df['9983.T'].pct_change(diff)

# Rows where any return is undefined are removed.
df_rate = df_rate.dropna(how='any')

# Every column except the label and the `diff` column is a feature.
feature_cols = [col for col in df_rate.columns
                if col != 'diff' and col != 'target']
X = df_rate[feature_cols]
y = df_rate['target']

val_size = 0.2

# shuffle=False: the rows are split without shuffling.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=val_size, shuffle=False)


In [None]:
# Unpenalised logistic regression on all return features, with an explicit
# iteration cap of 493.
log_reg = linear_model.LogisticRegression(penalty='none', max_iter=493)
log_reg.fit(X_train, y_train)

# Print each fitted parameter array behind its label.
for label, values in (('coef', log_reg.coef_),
                      ('intercept', log_reg.intercept_)):
    print(label, values)

coef [[-1.38155769e+01 -7.14146047e-01  1.25677128e+01 -7.09906635e+00
  -7.66096092e+00 -2.33352677e+00 -8.15548901e+00  1.38430843e+01
   1.46356334e+01 -7.53817030e+00  1.07559583e+00 -5.93126077e+00
   2.31084088e+00  1.14426900e+00 -2.85659104e+01  1.54084896e+01
   3.72422776e+01 -1.69491174e+01 -1.64740148e+01  9.03074025e+00
  -1.53535937e+00 -6.94649031e+00  2.61846917e+00 -1.72945696e-01
  -1.32240401e+01  6.26940286e+00 -2.20619315e+00  2.40437704e+00
   3.43559775e+01 -1.41055687e+01 -2.52240523e+01  2.85409760e+00
   8.18978441e+00  5.69867107e+00  3.07578829e+00 -4.85676394e+00
  -6.58171114e+00  6.46336012e-01  1.87691751e+00  1.38787150e+00
   2.36149995e+00  9.79487801e-02 -3.32353054e+00  2.71585776e+00
   2.17267330e+00 -1.94978164e-01 -2.86066113e-01 -3.90450347e+00
   2.49958044e+00  3.20689864e+00 -5.51518616e+00 -9.97589902e-02
   1.66230062e+00  6.21450961e-01 -1.89137543e-01 -2.51930970e-01
   7.13134565e-01 -1.05850352e+00 -7.57579145e+00 -3.17307930e-02
   6.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# In-sample accuracy of the multi-horizon model.
y_train_pred = log_reg.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Accuracy score:', train_accuracy)

Accuracy score: 0.6050955414012739


## スケーリング

In [None]:
from sklearn import preprocessing

# Standard scaler, fitted on the training features and applied to them.
scaler_ss = preprocessing.StandardScaler()
X_train_scaled_ss = scaler_ss.fit_transform(X_train)

# Min-max scaler with target range (-1, 1), fitted on the same training features.
scaler_mm = preprocessing.MinMaxScaler(feature_range=(-1, 1))
X_train_scaled_mm = scaler_mm.fit_transform(X_train)

In [None]:
# Model trained and evaluated on the standard-scaled training features.
log_reg_ss = linear_model.LogisticRegression(penalty = 'none',max_iter = 493)
log_reg_ss.fit(X_train_scaled_ss,y_train)
y_train_pred_ss = log_reg_ss.predict(X_train_scaled_ss)
print('Accuracy score:', accuracy_score(y_train,y_train_pred_ss))

# Model trained and evaluated on the min-max-scaled training features.
# The predictions must come from log_reg_mm: the standard-scaler model
# (log_reg_ss) was fitted on differently scaled inputs, so using it here
# reports an accuracy that does not belong to this model.
log_reg_mm = linear_model.LogisticRegression(penalty = 'none',max_iter = 493)
log_reg_mm.fit(X_train_scaled_mm,y_train)
y_train_pred_mm = log_reg_mm.predict(X_train_scaled_mm)
print('Accuracy score:', accuracy_score(y_train,y_train_pred_mm))


Accuracy score: 0.6050955414012739
Accuracy score: 0.6114649681528662


## 正則化

In [None]:
# Out-of-sample accuracy of the model fitted on the unscaled features.
y_val_pred = log_reg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print('Accuracy score for validation data:', val_accuracy)

Accuracy score for validation data: 0.4720812182741117


In [None]:
# Scale the validation features with the statistics learned from the TRAINING
# data (transform, not fit_transform). Re-fitting the scalers on the
# validation set would put it on a different scale from the one the models
# were trained on, and would use information from the held-out data.
X_val_scaled_ss = scaler_ss.transform(X_val)
y_val_pred_ss = log_reg_ss.predict(X_val_scaled_ss)

# The min-max model must be given the min-max-scaled validation features.
X_val_scaled_mm = scaler_mm.transform(X_val)
y_val_pred_mm = log_reg_mm.predict(X_val_scaled_mm)

print('Accuracy score for validation data, standard scaler:', 
      accuracy_score(y_val,y_val_pred_ss))
print('Accuracy score for validation data, min-max scaler:', 
      accuracy_score(y_val,y_val_pred_mm))

Accuracy score for validation data, standard scaler: 0.45685279187817257
Accuracy score for validation data, min-max scaler: 0.4720812182741117


In [None]:
# LogisticRegression with its default settings (no penalty argument is
# passed), trained on the min-max-scaled training features.
log_reg_l2 = linear_model.LogisticRegression()
log_reg_l2.fit(X_train_scaled_mm, y_train)

# Predictions for the training set and for the validation set, in that order.
y_train_pred_l2, y_val_pred_l2 = (
    log_reg_l2.predict(features)
    for features in (X_train_scaled_mm, X_val_scaled_mm)
)

# Training accuracy is printed first, validation accuracy second.
for y_true, y_pred in ((y_train, y_train_pred_l2), (y_val, y_val_pred_l2)):
    print('Accuracy score:', accuracy_score(y_true, y_pred))

Accuracy score: 0.6012738853503184
Accuracy score: 0.4467005076142132


In [None]:
# L1-penalised logistic regression. The 'saga' solver is stochastic, so
# random_state is fixed to make the fitted coefficients and the reported
# accuracies reproducible from run to run.
# NOTE(review): this model is fitted on the unscaled X_train, whereas
# log_reg_l2 uses the min-max-scaled features — confirm this is intended.
log_reg_l1 = linear_model.LogisticRegression(penalty = 'l1',
                                             solver='saga', 
                                             C = 1,
                                             random_state = 0)
log_reg_l1.fit(X_train,y_train)

y_train_pred_l1 = log_reg_l1.predict(X_train)
y_val_pred_l1 = log_reg_l1.predict(X_val)

In [None]:
# Print each fitted parameter array of the L1 model behind its label.
for label, values in (('coef', log_reg_l1.coef_),
                      ('intercept', log_reg_l1.intercept_)):
    print(label, values)

coef [[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.43537699
   0.          0.21361245  0.          0.          0.         -0.39369759
   0.          0.48929773  0.          0.          0.          0.
   0.          0.          0.         -0.13483496 -0.37003464 -0.98209043
   0.          0.46773833]]
intercept [0.13993056]


In [None]:
# Training accuracy is printed first, validation accuracy second.
for y_true, y_pred in ((y_train, y_train_pred_l1), (y_val, y_val_pred_l1)):
    print('Accuracy score:', accuracy_score(y_true, y_pred))

Accuracy score: 0.5732484076433121
Accuracy score: 0.5177664974619289


# 多クラス分類 

## データの整理

In [None]:
# Start again from the one-day percentage changes of both price series;
# the first row has no previous price and is removed by dropna().
df_rate = pd.DataFrame({
    '^N225 1 day return': df['^N225'].pct_change(),
    '9983 1 day return': df['9983.T'].pct_change(),
}).dropna()

# Standard deviation of the index's one-day returns.
df_vol = df_rate['^N225 1 day return'].std()

In [None]:
# Work on an explicit copy so the assignments below never write into a frame
# that pandas has flagged as a slice of another one (SettingWithCopyWarning).
df_rate = df_rate.copy()

# Next-day returns moved onto the current row, computed once and reused.
# The last row has no following day, so its values are NaN here.
next_9983 = df_rate['9983 1 day return'].shift(-1)
next_n225 = df_rate['^N225 1 day return'].shift(-1)
upper = next_n225 + df_vol
lower = next_n225 - df_vol

# 0: next-day stock return above the index return plus df_vol
# 1: next-day stock return strictly inside the +/- df_vol band
# 2: next-day stock return below the index return minus df_vol
df_rate.loc[next_9983 > upper, 'target'] = 0
df_rate.loc[(next_9983 < upper) & (next_9983 > lower), 'target'] = 1
df_rate.loc[next_9983 < lower, 'target'] = 2

# Rows that received no label (comparisons against NaN are False) are dropped.
# Copy again before the final column assignment for the same reason as above.
df_rate = df_rate.dropna().copy()
df_rate['target'] = df_rate['target'].astype(int)
df_rate

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0_level_0,^N225 1 day return,9983 1 day return,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-03-31,-0.007127,-0.013151,2
2016-04-01,-0.035475,-0.054137,2
2016-04-04,-0.002530,-0.019372,2
2016-04-05,-0.024217,-0.041604,1
2016-04-06,-0.001110,-0.011867,2
...,...,...,...
2021-03-23,-0.006109,-0.001036,2
2021-03-24,-0.020361,-0.035253,1
2021-03-25,0.011419,0.013852,1
2021-03-26,0.015552,0.008834,1


In [None]:
# Return horizons in trading days: every value from 2 to 20, then every
# 20th value from 40 to 240.
diffs = list(range(2, 21)) + list(range(40, 241, 20))

# Add the N-day percentage change of both price series for every horizon.
# The new columns are aligned to df_rate by its index.
for diff in diffs:
    df_rate[f'^N225 {diff} days return'] = df['^N225'].pct_change(diff)
    df_rate[f'9983 {diff} days return'] = df['9983.T'].pct_change(diff)

# Rows where any return is undefined are removed.
df_rate = df_rate.dropna(how='any')

# Every column except the label is a feature.
feature_cols = [col for col in df_rate.columns if col != 'target']
X = df_rate[feature_cols]
y = df_rate['target']

val_size = 0.2

# shuffle=False: the rows are split without shuffling.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=val_size, shuffle=False)

## ソフトマックス回帰の実装

In [None]:
# Unpenalised multinomial logistic regression on the three-class target.
log_reg_trino = linear_model.LogisticRegression(multi_class='multinomial',
                                                penalty='none')
log_reg_trino.fit(X_train, y_train)

y_train_pred = log_reg_trino.predict(X_train)
y_val_pred = log_reg_trino.predict(X_val)

# Training accuracy is printed first, validation accuracy second.
for y_true, y_pred in ((y_train, y_train_pred), (y_val, y_val_pred)):
    print('Accuracy score:', accuracy_score(y_true, y_pred))

Accuracy score: 0.7044585987261146
Accuracy score: 0.6192893401015228


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 正則化

In [None]:
# L1-penalised multinomial logistic regression with C=0.01.
# The 'saga' solver is stochastic, so random_state is fixed to make the
# reported accuracies reproducible from run to run.
log_reg_trino =  linear_model.LogisticRegression(
    multi_class = 'multinomial',
    penalty = 'l1',C=0.01,solver = 'saga',
    random_state = 0
)
log_reg_trino.fit(X_train,y_train)

y_train_pred_multi = log_reg_trino.predict(X_train)
y_val_pred_multi = log_reg_trino.predict(X_val)

# Training accuracy is printed first, validation accuracy second.
print('Accuracy score:', accuracy_score(y_train,y_train_pred_multi))
print('Accuracy score:', accuracy_score(y_val,y_val_pred_multi))

Accuracy score: 0.6840764331210191
Accuracy score: 0.6700507614213198
