This notebook was created by Donna Faith Go.

In [1]:
import sys
!{sys.executable} -m pip install -qq -r requirements.txt

In [2]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os

# data gathering
import yfinance as yf
import time
import pandas_datareader.data as web
from datetime import datetime, timedelta

# ignore warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

## Daily Closing Prices

## Stock Tickers

In [3]:
# Load the pickled table of SSE-listed companies and keep the first rows only.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
MAX_COMPANIES = 600  # number of leading rows of the company table to keep

filepath = 'SSE companies list.pkl'
with open(filepath, 'rb') as f:
    companies_df = pickle.load(f)
companies_df = companies_df[:MAX_COMPANIES]

# preview; must be the last expression of the cell to actually be displayed
companies_df.head()

In [4]:
# yfinance expects the ".SS" suffix on each company code
tickers_list = [
    code + ".SS"
    for code in companies_df['Company Code'].to_list()
]

# quick look at the first few converted tickers
print(tickers_list[:5])

['600000.SS', '600004.SS', '600006.SS', '600007.SS', '600008.SS']


In [5]:
# sanity checks: first ticker, the company name stored under label 0,
# and whether both collections have the same length (should be True)
print(
    tickers_list[0],
    companies_df['Company Name'][0],
    len(companies_df) == len(tickers_list),
    sep="\n",
)

600000.SS
SHANGHAI PUDONG DEVELOPMENT BANK CO., LTD.
True


## yfinance

In [6]:
# pull individual stock data from yfinance
def download_info_per_stock(ticker, verbose=False, 
                            start_date='2000-01-01', 
                            end_date='2026-01-01'):
    """
    Download the price history of a single ticker through yfinance.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download`` (e.g. ``'600000.SS'``).
    verbose : bool, default False
        When True, a failed call is reported together with the type and
        message of the exception that caused it.
    start_date, end_date : str
        Forwarded to ``yf.download`` as ``start`` and ``end``.

    Returns
    -------
    pandas.DataFrame or None
        The result of ``yf.download`` wrapped in a DataFrame, or ``None``
        when the call raised an exception.

    Notes
    -----
    NOTE(review): the notebook output suggests yfinance can report a failed
    download on its own without raising; in that case this function would
    return an empty frame rather than ``None`` -- callers should check
    ``.empty`` as well.
    """
    try:
        # get data for the ticker
        ticker_data = yf.download(
            ticker,
            start=start_date,
            end=end_date,
            progress=False,
            timeout=120 # in case of slow internet, in seconds
        )
        return pd.DataFrame(ticker_data)

    except Exception as e:
        # best effort: one bad ticker must not abort a long batch run,
        # but the cause is reported instead of being thrown away
        if verbose:
            print(f"Error downloading {ticker}: {type(e).__name__}: {e}")
        return None

# saving individual stock data
def save_info_per_stock(ticker_list, delay=1, 
                        verbose=False, override=False,
                        start_date='2000-01-01', 
                        end_date='2026-01-01',
                        company_names=None):
    """
    Download every ticker and pickle its data to ``data/<company name>.pkl``.

    Parameters
    ----------
    ticker_list : list of str
        Symbols to download, one file per symbol.
    delay : int or float, default 1
        Seconds to sleep after each download attempt (avoids rate limiting).
    verbose : bool, default False
        Print progress (skipped / downloading / saved) and forward the flag
        to ``download_info_per_stock``.
    override : bool, default False
        When False, a ticker whose pickle file already exists is skipped.
    start_date, end_date : str
        Forwarded to ``download_info_per_stock``.
    company_names : sequence of str, optional
        Names aligned position-by-position with ``ticker_list``; they are
        used for the file names. Defaults to the ``'Company Name'`` column
        of the notebook-level ``companies_df``.

    Raises
    ------
    ValueError
        If there are fewer company names than tickers.
    """
    if company_names is None:
        # fall back to the notebook-level company table; read positionally
        # so the pairing does not depend on the frame's index labels
        company_names = companies_df['Company Name'].to_list()
    else:
        company_names = list(company_names)

    # fail before any download starts instead of part-way through the run
    if len(company_names) < len(ticker_list):
        raise ValueError(
            f"Got {len(ticker_list)} tickers but only "
            f"{len(company_names)} company names."
        )

    # create the data folder
    os.makedirs("data", exist_ok=True)

    failed = []  # tickers for which nothing could be saved
    for ticker, ticker_name in zip(ticker_list, company_names):
        # a path separator inside the name would point into a
        # non-existent sub-folder, so neutralise it for the file name
        safe_name = str(ticker_name)
        for separator in (os.sep, os.altsep):
            if separator:
                safe_name = safe_name.replace(separator, "_")
        filepath = f"data/{safe_name}.pkl"

        # skip if not override and file exists
        if not override and os.path.exists(filepath):
            if verbose:
                print(f"Skipped {ticker_name}.")
            continue

        # get the data for each stock
        if verbose:
            print()
            print(f"Downloading for ticker: {ticker}")
        ticker_data = download_info_per_stock(ticker,
                                              verbose=verbose,
                                              start_date=start_date,
                                              end_date=end_date)

        # saving data as a pkl file
        if ticker_data is not None and not ticker_data.empty:
            ticker_data.to_pickle(filepath)
            if verbose:
                print(f"Saved data for {ticker_name}.")
        else:
            failed.append(ticker)

        # avoid rate limiting
        time.sleep(delay)

    if failed:
        # re-running the function retries these, because no file was written
        print(f"Finished, but no data was saved for "
              f"{len(failed)} ticker(s): {failed}")
    else:
        print("Done downloading all data!")

In [None]:
# download and cache every ticker, printing progress along the way
save_info_per_stock(ticker_list=tickers_list, verbose=True)

Skipped SHANGHAI PUDONG DEVELOPMENT BANK CO., LTD..
Skipped Guangzhou Baiyun International Airport Co.,ltd..
Skipped Dongfeng  Automobile  Co.,LTD.
Skipped China World Trade Center Company Ltd..
Skipped Beijing Capital Eco-Environment Protection Group Co.,Ltd..
Skipped Shanghai International Airport Co., Ltd..
Skipped Inner Mongolia BaoTou Steel Union Co.,Ltd..
Skipped Huaneng Power International, INC..
Skipped Anhui Expressway Company Limited.
Skipped HUA XIA BANK CO., Limited.
Skipped CHINA MINSHENG BANK.
Skipped RIZHAO PORT CO.,LTD..
Skipped Shanghai International Port (Group) Co., Ltd.
Skipped Baoshan Iron & Steel Co., Ltd..
Skipped Henan Zhongyuan Expressway Company Limited.
Skipped SHANGHAI ELECTRIC POWER COMPANY LIMITED.
Skipped SHANDONG IRON AND STEEL COMPANY LTD.
Skipped Zhejiang Zheneng Electric Power Co.,Ltd.
Skipped Huaneng Lancang River Hydropower Inc..
Skipped COSCO SHIPPING Energy Transportation Co., Ltd..
Skipped Huadian Power International Corporation Limited.
Skipped 


1 Failed download:
['600567.SS']: Timeout('Failed to perform, curl: (28) Operation timed out after 430884 milliseconds with 93650 bytes received. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')

1 Failed download:
['600569.SS']: ConnectionError('Failed to perform, curl: (7) Failed to connect to query2.finance.yahoo.com port 443 after 128 ms: Could not connect to server. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600569.SS



1 Failed download:
['600570.SS']: ConnectionError('Failed to perform, curl: (7) Failed to connect to query2.finance.yahoo.com port 443 after 56 ms: Could not connect to server. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600570.SS

Downloading for ticker: 600571.SS
Saved data for SUNYARD TECHNOLOGY CO.,LTD.

Downloading for ticker: 600572.SS
Saved data for ZHEJIANG CONBA PHARMACEUTICAL CO.,LTD..

Downloading for ticker: 600573.SS
Saved data for FuJian YanJing HuiQuan Brewery Co.,Ltd.

Downloading for ticker: 600575.SS
Saved data for Huaihe Energy (Group) Co.,Ltd.

Downloading for ticker: 600576.SS
Saved data for Zhejiang Sunriver Culture Tourism Co.,.

Downloading for ticker: 600577.SS
Saved data for TONGLING JINGDA SPECIAL MAGNET WIRE CO.,LTD..

Downloading for ticker: 600578.SS
Saved data for BEIJING JINGNENG POWER CO.,LTD..

Downloading for ticker: 600579.SS
Saved data for Sinochem Equipment Technology (Qingdao) Company Limited.

Downloading for ticker: 600580.SS
Saved data for WOLONG ELECTRIC GROUP CO.，LTD..

Downloading for ticker: 600581.SS
Saved data for XinJiang Ba Yi Iron & Steel Co., Ltd.

Downloading for ticker: 600582.SS
Saved data for Tiandi Science & Technology Co


1 Failed download:
['600588.SS']: ConnectionError('Failed to perform, curl: (56) Recv failure: Connection was reset. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600589.SS



1 Failed download:
['600589.SS']: ConnectionError('Failed to perform, curl: (7) Failed to connect to query2.finance.yahoo.com port 443 after 37210 ms: Could not connect to server. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600590.SS



1 Failed download:
['600590.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600592.SS



1 Failed download:
['600592.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600593.SS



1 Failed download:
['600593.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600594.SS



1 Failed download:
['600594.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600595.SS



1 Failed download:
['600595.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600596.SS



1 Failed download:
['600596.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600597.SS



1 Failed download:
['600597.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600598.SS



1 Failed download:
['600598.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600600.SS



1 Failed download:
['600600.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600601.SS



1 Failed download:
['600601.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600602.SS



1 Failed download:
['600602.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query2.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600603.SS
Saved data for GUANGHUI LOGISTICS CO.，LTD.

Downloading for ticker: 600604.SS
Saved data for SHANGHAI SHIBEI HI-TECH CO.,LTD..

Downloading for ticker: 600605.SS
Saved data for SHANGHAI HUITONG ENERGY CO.,LTD..

Downloading for ticker: 600606.SS
Saved data for Greenland Holdings Corporation Limited.

Downloading for ticker: 600609.SS
Saved data for SHENYANG JINBEI AUTOMOTIVE COMPANY LIMTED.

Downloading for ticker: 600610.SS
Saved data for Guizhou Zhong Yi Da Co.,Ltd.

Downloading for ticker: 600611.SS
Saved data for DAZHONG TRANSPORTATION（GROUP）CO.,LTD..

Downloading for ticker: 600612.SS
Saved data for LAO FENG XIANG CO.,LTD..

Downloading for ticker: 600613.SS
Saved data for Shanghai Shenqi Pharmaceutical Investment Management Co., Ltd..

Downloading for ticker: 600615.SS
Saved data for Chongqing Shineray Intelligent Manufacturing Technology Co., Ltd..

Downloading for ticker: 600616.SS
Saved data for SHANGHAI JINFENG WINE COMPANY LIMITED.

Downloa


1 Failed download:
['600808.SS']: SSLError('Failed to perform, curl: (35) Recv failure: Connection was reset. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')



Downloading for ticker: 600809.SS
Saved data for SHANXI XINGHUACUN FEN WINE FACTORY  CO.,LTD.


Note: The code above takes around 40-50 minutes to run to completion.

## Discarded Code

This code was discarded because the files it generated were too large to be committed to GitHub.

In [None]:
# declare start and end dates
# (notebook-level globals: download_stocks_in_batches reads them directly
#  and passes them to yf.download as start= / end=)
start_date = '2000-01-01'
end_date = '2026-01-01'

def download_stocks_in_batches(tickers, batch_size=5, delay=1, verbose=False):
    """
    Collect closing prices for many tickers, a few tickers per request.

    The tickers are cut into consecutive slices of ``batch_size``; each slice
    is sent to ``yf.download`` using the notebook-level ``start_date`` and
    ``end_date``. The 'Close' data of every slice is gathered per ticker and
    the function sleeps ``delay`` seconds between slices (not after the last
    one) to avoid rate limiting. A slice that raises is reported and skipped.

    Returns a DataFrame with one column per collected ticker, or an empty
    DataFrame when nothing was collected.
    """
    closes_by_ticker = {}
    total = len(tickers)

    for offset in range(0, total, batch_size):
        chunk = tickers[offset:offset + batch_size]
        if verbose:
            print(f"Downloading batch {offset//batch_size + 1}: {chunk}")

        try:
            # fetch the whole slice in one request
            frame = yf.download(
                chunk,
                start=start_date,
                end=end_date,
                progress=False
            )

            # nothing usable came back for this slice
            if frame.empty or 'Close' not in frame.columns:
                print(f"No data returned for batch: {chunk}")
            else:
                close_part = frame['Close']
                if isinstance(close_part, pd.Series):
                    # a Series is stored under the first ticker of the slice
                    closes_by_ticker[chunk[0]] = close_part
                else:
                    for column in close_part.columns:
                        closes_by_ticker[column] = close_part[column]
                if verbose:
                    print(f"Successfully downloaded {len(chunk)} stocks")

        except Exception as e:
            print(f"Error downloading batch {chunk}: {e}")

        # pause between slices, but not after the final one
        more_to_come = offset + batch_size < total
        if more_to_come:
            if verbose:
                print(f"Waiting {delay} seconds before next batch...")
            time.sleep(delay)

    if not closes_by_ticker:
        return pd.DataFrame()
    return pd.DataFrame(closes_by_ticker)

In [None]:
# # Download the closing prices
# closing_df = download_stocks_in_batches(
#     tickers_list, 
#     batch_size=5, 
#     delay=5
# )

# # removing unnecessary columns and rows
# closing_df.dropna(how='all', axis=1, inplace=True)
# closing_df.dropna(how='all', axis=0, inplace=True)

# # # save to pkl
# # if not closing_df.empty:
# #     closing_df.to_pickle('SSE companies closing prices.pkl')

# # cut into parts (split the columns in half; shape[1] is the column count,
# # whereas len(closing_df) would count rows)
# half = closing_df.shape[1] // 2
# closing_df1 = closing_df.iloc[:, :half]
# closing_df2 = closing_df.iloc[:, half:]

# # save as pkl files (only when both halves actually contain data)
# if not closing_df1.empty and not closing_df2.empty:
#     closing_df1.to_pickle('data/01 SSE companies closing prices.pkl')
#     closing_df2.to_pickle('data/02 SSE companies closing prices.pkl')

Note: The code above takes around 30-45 minutes to run to completion.