# Get Stock Index Log Returns from Yahoo! Finance

This script is designed for Google Colab.

Install the Yahoo! Finance package `yfinance`.

In [13]:
# Use the %pip magic (not a `!pip` shell escape) so the package is installed
# into the environment of the running kernel.
%pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


*Restart the runtime after executing.*

Import packages.

In [14]:
import pickle
import yfinance
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller, pacf

## Clean dataset

Download stock indexes from Yahoo! finance.

The ticker names of the indexes are taken from https://finance.yahoo.com/world-indices/ (accessed Aug 12, 2022).

In [15]:
# Yahoo! Finance ticker symbols of the indexes to download, one per line.
index_tickers = [
    '^GSPC',  # S&P 500
    '^DJI',  # Dow Jones Industrial Average
    '^IXIC',  # NASDAQ Composite
    '^N100',  # Euronext 100 Index
    '^N225',  # Nikkei 225
    '^HSI',  # HANG SENG INDEX
    '^NZ50',  # S&P/NZX 50 INDEX GROSS
    '000001.SS',  # SSE Composite Index
    '399001.SZ',  # Shenzhen Index
    '^STOXX50E',
    'IMOEX.ME',
    '^KS11',
    '^STI',
    '^AXJO',
]

# Fetch the history of every ticker in a single call; `auto_adjust=True`
# requests adjusted prices and `timeout=3` is passed through to yfinance.
stock_indexes = yfinance.download(
    index_tickers,
    start="2016-01-01",
    end="2019-12-31",
    timeout=3,
    auto_adjust=True,
)

[*********************100%***********************]  14 of 14 completed


We only use the close price. Because `auto_adjust=True` was set in the download, the close price is already adjusted.

In [16]:
# Keep only the 'Close' columns of the downloaded frame. The .copy() detaches
# the result so that later edits to the price table do not touch `stock_indexes`.
stock_indexes_price = stock_indexes['Close'].copy()
# Display the price table (one column per ticker, one row per date).
stock_indexes_price

Unnamed: 0_level_0,000001.SS,399001.SZ,IMOEX.ME,^AXJO,^DJI,^GSPC,^HSI,^IXIC,^KS11,^N100,^N225,^NZ50,^STI,^STOXX50E
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-01-04,3296.258057,11625.909180,1734.560059,5270.500000,17148.939453,2012.660034,21327.119141,4903.089844,1918.760010,885.000000,18450.980469,,2835.969971,3164.760010
2016-01-05,3287.710938,11467.931641,1754.130005,5184.399902,17158.660156,2016.709961,21188.720703,4891.430176,1930.530029,890.530029,18374.000000,6278.100098,2834.229980,3178.010010
2016-01-06,3361.840088,11724.749023,1748.569946,5123.100098,16906.509766,1990.260010,20980.810547,4835.759766,1925.430054,879.270020,18191.320312,6262.520020,2804.270020,3139.320068
2016-01-07,3125.001953,10760.149414,,5010.299805,16514.099609,1943.089966,20333.339844,4689.430176,1904.329956,864.859985,17767.339844,6213.390137,2729.909912,3084.679932
2016-01-08,3186.412109,10888.788086,,4990.799805,16346.450195,1922.030029,20453.710938,4643.629883,1917.619995,850.020020,17697.960938,6158.100098,2751.229980,3033.469971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-25,2981.881104,10229.580078,,,,,,,,,23782.869141,,,
2019-12-26,3007.354980,10303.719727,3031.669922,,28621.390625,3239.909912,,9022.389648,2197.929932,,23924.919922,,3222.989990,
2019-12-27,3005.035889,10233.769531,3050.469971,6821.700195,28645.259766,3240.020020,28225.419922,9006.620117,2204.209961,1156.609985,23837.720703,11602.120117,3226.530029,3782.270020
2019-12-30,3040.023926,10365.959961,3045.870117,6804.899902,28462.140625,3221.290039,28319.390625,8945.990234,2197.669922,1146.560059,23656.619141,11556.450195,3222.439941,3748.469971


The first row has a missing value, so we use backward filling to fill that gap.

In [17]:
# Back-fill the first row only: each missing value in row 0 takes the next
# available price further down the same column. All other rows are left as-is.
# `DataFrame.bfill()` replaces the deprecated `fillna(method='bfill')` spelling;
# no extra copy is needed because the assignment writes the values into row 0.
stock_indexes_price.iloc[0, :] = stock_indexes_price.bfill().iloc[0, :]

Fill the remaining missing values with the previous available close price (forward fill).

In [18]:
# Forward-fill every remaining gap with the last available price in the same
# column. `DataFrame.ffill()` replaces the deprecated
# `fillna(method='ffill', inplace=True)` call; rebinding the name keeps the
# cell free of in-place mutation while producing the same table.
stock_indexes_price = stock_indexes_price.ffill()

Assert no missing values exist after processing.

In [19]:
# Sanity check: after both fill steps the price table must contain no NaN.
assert not stock_indexes_price.isna().values.any()
# Show the cleaned price table.
stock_indexes_price

Unnamed: 0_level_0,000001.SS,399001.SZ,IMOEX.ME,^AXJO,^DJI,^GSPC,^HSI,^IXIC,^KS11,^N100,^N225,^NZ50,^STI,^STOXX50E
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-01-04,3296.258057,11625.909180,1734.560059,5270.500000,17148.939453,2012.660034,21327.119141,4903.089844,1918.760010,885.000000,18450.980469,6278.100098,2835.969971,3164.760010
2016-01-05,3287.710938,11467.931641,1754.130005,5184.399902,17158.660156,2016.709961,21188.720703,4891.430176,1930.530029,890.530029,18374.000000,6278.100098,2834.229980,3178.010010
2016-01-06,3361.840088,11724.749023,1748.569946,5123.100098,16906.509766,1990.260010,20980.810547,4835.759766,1925.430054,879.270020,18191.320312,6262.520020,2804.270020,3139.320068
2016-01-07,3125.001953,10760.149414,1748.569946,5010.299805,16514.099609,1943.089966,20333.339844,4689.430176,1904.329956,864.859985,17767.339844,6213.390137,2729.909912,3084.679932
2016-01-08,3186.412109,10888.788086,1748.569946,4990.799805,16346.450195,1922.030029,20453.710938,4643.629883,1917.619995,850.020020,17697.960938,6158.100098,2751.229980,3033.469971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-25,2981.881104,10229.580078,3030.590088,6794.200195,28515.449219,3223.379883,27864.210938,8952.879883,2190.080078,1154.290039,23782.869141,11642.780273,3221.669922,3776.659912
2019-12-26,3007.354980,10303.719727,3031.669922,6794.200195,28621.390625,3239.909912,27864.210938,9022.389648,2197.929932,1154.290039,23924.919922,11642.780273,3222.989990,3776.659912
2019-12-27,3005.035889,10233.769531,3050.469971,6821.700195,28645.259766,3240.020020,28225.419922,9006.620117,2204.209961,1156.609985,23837.720703,11602.120117,3226.530029,3782.270020
2019-12-30,3040.023926,10365.959961,3045.870117,6804.899902,28462.140625,3221.290039,28319.390625,8945.990234,2197.669922,1146.560059,23656.619141,11556.450195,3222.439941,3748.469971


## Difference

Compute the log returns: take the logarithm of each price series and difference consecutive rows.

In [20]:
# Log returns: log(price_t) - log(price_{t-1}) for every column.
# The first differenced row has no predecessor and is dropped; the index is
# then reset to a plain 0..n-1 range, so the dates are not carried over.
stock_indexes_log_rrt = (
    np.log(stock_indexes_price)
    .diff()
    .iloc[1:]
    .reset_index(drop=True)
)
stock_indexes_log_rrt

Unnamed: 0,000001.SS,399001.SZ,IMOEX.ME,^AXJO,^DJI,^GSPC,^HSI,^IXIC,^KS11,^N100,^N225,^NZ50,^STI,^STOXX50E
0,-0.002596,-0.013682,0.011219,-0.016471,0.000567,0.002010,-0.006510,-0.002381,0.006115,0.006229,-0.004181,0.000000,-0.000614,0.004178
1,0.022297,0.022147,-0.003175,-0.011894,-0.014804,-0.013202,-0.009861,-0.011446,-0.002645,-0.012725,-0.009992,-0.002485,-0.010627,-0.012249
2,-0.073054,-0.085852,0.000000,-0.022264,-0.023484,-0.023986,-0.031346,-0.030727,-0.011019,-0.016524,-0.023583,-0.007876,-0.026875,-0.017558
3,0.019461,0.011884,0.000000,-0.003900,-0.010204,-0.010898,0.005902,-0.009815,0.006955,-0.017308,-0.003913,-0.008938,0.007779,-0.016741
4,-0.054731,-0.064136,-0.038494,-0.011811,0.003183,0.000853,-0.028023,-0.001215,-0.011950,-0.001483,0.000000,-0.009016,-0.015524,-0.001973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034,-0.000268,0.003946,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.002004,0.000000,0.000000,0.000000
1035,0.008507,0.007221,0.000356,0.000000,0.003708,0.005115,0.000000,0.007734,0.003578,0.000000,0.005955,0.000000,0.000410,0.000000
1036,-0.000771,-0.006812,0.006182,0.004039,0.000834,0.000034,0.012880,-0.001749,0.002853,0.002008,-0.003651,-0.003498,0.001098,0.001484
1037,0.011576,0.012834,-0.001509,-0.002466,-0.006413,-0.005798,0.003324,-0.006754,-0.002971,-0.008727,-0.007626,-0.003944,-0.001268,-0.008977


In [None]:
# Persist the log-return table as a pickle file in the working directory.
with open('indexes_alex2018.pkl', 'wb') as pkl_file:
    pickle.dump(stock_indexes_log_rrt, pkl_file)

In [None]:
# Chronological split without shuffling: the first 80% of the rows (rounded)
# form the training set and the remaining rows form the test set.
train_size = round(0.8 * len(stock_indexes_log_rrt))
indexes_train, indexes_test = np.split(stock_indexes_log_rrt.values, [train_size])

ADF test (stationarity check)

$H_0$: the series has a unit root, i.e. it is non-stationary.

In [None]:
# Run `adfuller` on every log-return column, then transpose so that each row is
# one index and each column is one field of the test result.
adf = (
    stock_indexes_log_rrt
    .apply(adfuller, axis=0)
    .T
    .set_axis(
        ['adf', 'pvalue', 'usedlag', 'nobs', 'critical values', 'icbest'],
        axis=1,
    )
)
adf

Use the partial autocorrelation function (PACF) to determine the order $p$ of the autoregressive (AR) model.

In [None]:
# PACF of every log-return column for lags 0..25 (one row per lag).
pacf_ = stock_indexes_log_rrt.apply(lambda series: pacf(series, nlags=25), axis=0)
# For each lag, count the indexes whose partial autocorrelation lies outside
# the band [-0.05, 0.05].
pacf_['N_significance'] = (pacf_.abs() > 0.05).sum(axis=1)
pacf_

Standardization.

In [None]:
# Fit the scaler on the training rows only, then apply the same fitted
# transform to both splits so the test set does not influence the statistics.
std_scaler = StandardScaler().fit(indexes_train)
indexes_train_std = std_scaler.transform(indexes_train)
indexes_test_std = std_scaler.transform(indexes_test)

# Keep the fitted scaler on disk alongside the data.
with open('indexes_std_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(std_scaler, scaler_file)

Moving window.

In [None]:
def moving_window(ts, k_):
    """
    Make moving window samples from time series.

    Window ``i`` consists of the ``k_`` consecutive observations
    ``ts[i], ..., ts[i + k_ - 1]`` and its target is the observation that
    immediately follows it, ``ts[i + k_]``.

    :param ts: Time series as an array-like whose first axis is time.
    :param k_: Length of the window (number of past observations per sample).
        Must satisfy ``0 <= k_ <= len(ts)``.
    :return: x_, y_: ``x_`` has shape ``(len(ts) - k_, k_, *ts.shape[1:])`` and holds
        the input windows; ``y_`` is ``ts[k_:]`` and holds the matching targets.
    :raises ValueError: If ``ts`` is zero-dimensional, or ``k_`` is negative or
        longer than the series.
    """
    ts = np.asarray(ts)
    if ts.ndim == 0:
        raise ValueError("ts must have at least one dimension (time axis).")
    length = ts.shape[0]
    # A negative k_ would slice the targets from the end of the series and
    # return inputs and targets of different lengths, so reject it explicitly.
    if k_ < 0 or k_ > length:
        raise ValueError(
            "k_ must satisfy 0 <= k_ <= len(ts); got k_={} for len(ts)={}.".format(k_, length)
        )
    n_windows = length - k_
    # Row i of `indices` is [i, i + 1, ..., i + k_ - 1], built by broadcasting
    # the window start positions against the in-window offsets.
    indices = np.arange(n_windows)[:, np.newaxis] + np.arange(k_)[np.newaxis, :]
    x_ = ts[indices]
    y_ = ts[k_:]
    return x_, y_

# Build (input window, next-step target) pairs for both splits with the same
# window length of 12 observations.
(x_train, y_train), (x_test, y_test) = (
    moving_window(split_std, k_=12)
    for split_std in (indexes_train_std, indexes_test_std)
)

## Export

Save the windowed training and test arrays.

In [21]:
# Store the four arrays as a single pickled list, in the order
# [x_train, y_train, x_test, y_test].
with open('indexes_std.pkl', 'wb') as windows_file:
    pickle.dump([x_train, y_train, x_test, y_test], windows_file)

Save column names.

In [None]:
# Store the column labels of the log-return table as a plain Python list.
with open('indexes_names.pkl', 'wb') as names_file:
    pickle.dump(stock_indexes_log_rrt.columns.tolist(), names_file)