This notebook was created by Donna Faith Go.

In [None]:
import sys
# Install the packages listed in requirements.txt with the interpreter that is
# running this kernel (sys.executable); -qq keeps pip's output quiet.
!{sys.executable} -m pip install -qq -r requirements.txt

In [1]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os

# data gathering
import yfinance as yf
import time
import pandas_datareader.data as web
from datetime import datetime, timedelta

# ignore warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

## Closing Prices

## Stock Tickers

In [2]:
# Load the pickled table of SSE companies (names and codes).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
filepath = 'SSE companies list.pkl'
with open(filepath, mode='rb') as pickle_file:
    companies_df = pickle.load(pickle_file)

# preview the first rows
companies_df.head()

Unnamed: 0,Company Name,Company Code
0,"SHANGHAI PUDONG DEVELOPMENT BANK CO., LTD.",600000
1,"Guangzhou Baiyun International Airport Co.,ltd.",600004
2,"Dongfeng Automobile Co.,LTD",600006
3,China World Trade Center Company Ltd.,600007
4,Beijing Capital Eco-Environment Protection Gro...,600008


In [3]:
# Build the ticker symbols by appending the ".SS" suffix to each company code.
company_codes = companies_df['Company Code'].to_list()
tickers_list = [code + ".SS" for code in company_codes]

# show a small sample only
print(tickers_list[:5])

['600000.SS', '600004.SS', '600006.SS', '600007.SS', '600008.SS']


In [4]:
# sanity checks: the first ticker and first company should describe the same
# row, and there must be exactly one ticker per company
first_ticker = tickers_list[0]
first_company_name = companies_df['Company Name'][0]
lengths_match = len(companies_df) == len(tickers_list)  # should be True

print(first_ticker)
print(first_company_name)
print(lengths_match)

600000.SS
SHANGHAI PUDONG DEVELOPMENT BANK CO., LTD.
True


## yfinance

In [5]:
# pull individual stock data from yfinance
def download_info_per_stock(ticker, verbose=False,
                            start_date='2000-01-01',
                            end_date='2026-01-01'):
    """
    Download the price history of a single ticker through yfinance.

    Parameters
    ----------
    ticker : str
        Ticker symbol passed to ``yf.download`` (e.g. '600000.SS').
    verbose : bool, default False
        When True, print the error if the download raises.
    start_date, end_date : str
        Date range forwarded to ``yf.download`` as ``start`` / ``end``.

    Returns
    -------
    pandas.DataFrame or None
        A DataFrame built from whatever ``yf.download`` returned, or None
        when an exception was raised during the download.

    Notes
    -----
    NOTE(review): the recorded notebook output shows yfinance printing a
    "Failed download" message on its own, so some failures may come back as
    an empty frame rather than an exception -- callers should also check
    ``.empty``.
    """
    try:
        # get data for the ticker
        ticker_data = yf.download(
            ticker,
            start=start_date,
            end=end_date,
            progress=False,
            timeout=120  # in case of slow internet, in seconds
        )
        return pd.DataFrame(ticker_data)

    except Exception as e:
        if verbose:
            # report the ticker that failed; the previous message referenced
            # an undefined name and raised NameError inside this handler
            print(f"Error downloading ticker {ticker}: {e}")
        return None

# saving individual stock data
def save_info_per_stock(ticker_list, delay=1,
                        verbose=False, override=False,
                        start_date='2000-01-01',
                        end_date='2026-01-01'):
    """
    Download the price history of each ticker and pickle it under ``data/``.

    Each file is named after the company found at the same position in the
    module-level ``companies_df`` ('Company Name' column), so ``ticker_list``
    must be in the same order as that frame.

    Parameters
    ----------
    ticker_list : list of str
        Ticker symbols, passed one at a time to ``download_info_per_stock``.
    delay : int or float, default 1
        Seconds to sleep after each download attempt (rate-limit guard).
    verbose : bool, default False
        Print progress messages (skipped, downloading, saved).
    override : bool, default False
        When False, a ticker whose pickle file already exists is skipped;
        when True, it is downloaded again and the file is overwritten.
    start_date, end_date : str
        Date range forwarded to ``download_info_per_stock``.

    Returns
    -------
    None
        Results are written to ``data/<Company Name>.pkl``; nothing is saved
        for a ticker whose download returned None or an empty frame.
    """
    # create the data folder
    os.makedirs("data", exist_ok=True)

    for position, ticker in enumerate(ticker_list):
        # declare company name and filepath; the lookup is positional so the
        # name stays aligned with ticker_list whatever the frame's index is
        ticker_name = companies_df['Company Name'].iloc[position]
        # NOTE(review): company names are used verbatim as file names; a name
        # containing a path separator would break this -- confirm none do
        filepath = f"data/{ticker_name}.pkl"

        # skip if not override and file exists; the skip must not depend on
        # verbose, which only controls whether the message is printed
        if not override and os.path.exists(filepath):
            if verbose:
                print(f"Skipped {ticker_name}.")
            continue

        # get the data for each stock
        if verbose:
            print(f"Downloading for ticker: {ticker}")
        ticker_data = download_info_per_stock(ticker,
                                              start_date=start_date,
                                              end_date=end_date)

        # saving data as a pkl file
        if ticker_data is not None and not ticker_data.empty:
            ticker_data.to_pickle(filepath)
            if verbose:
                print(f"Saved data for {ticker}.")

        # avoid rate limiting
        time.sleep(delay)

    print("Done downloading all data!")

In [6]:
# download and pickle the price history of every ticker, using the function's
# default date range, delay, and flags
save_info_per_stock(tickers_list)


1 Failed download:
['600000.SS']: DNSError('Failed to perform, curl: (6) Could not resolve host: query1.finance.yahoo.com. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')


KeyboardInterrupt: 

## Discarded Code

This code was discarded because the files it generated were too large to commit to GitHub.

In [None]:
# declare start and end dates
start_date = '2000-01-01'
end_date = '2026-01-01'

def download_stocks_in_batches(tickers, batch_size=5, delay=1, verbose=False):
    """
    Collect closing prices for many tickers, requesting a few at a time.

    The tickers are sliced into consecutive groups of ``batch_size`` and each
    group is requested with one ``yf.download`` call over the module-level
    ``start_date`` / ``end_date`` range. A batch that raises, or that comes
    back empty or without a 'Close' column, is reported with ``print`` and
    skipped. The function sleeps ``delay`` seconds between batches (not after
    the last one) to avoid rate limiting.

    Parameters
    ----------
    tickers : list of str
        Ticker symbols to download.
    batch_size : int, default 5
        Number of tickers per ``yf.download`` call.
    delay : int or float, default 1
        Seconds to wait between consecutive batches.
    verbose : bool, default False
        Print per-batch progress messages.

    Returns
    -------
    pandas.DataFrame
        One column of closing prices per collected ticker; an empty frame
        when nothing was collected.
    """
    closing_by_ticker = {}
    total_tickers = len(tickers)

    for offset in range(0, total_tickers, batch_size):
        batch = tickers[offset:offset + batch_size]
        batch_number = offset // batch_size + 1
        if verbose:
            print(f"Downloading batch {batch_number}: {batch}")

        try:
            # one request for the whole batch
            batch_frame = yf.download(
                batch,
                start=start_date,
                end=end_date,
                progress=False
            )

            has_closing_prices = (
                not batch_frame.empty and 'Close' in batch_frame.columns
            )
            if not has_closing_prices:
                print(f"No data returned for batch: {batch}")
            else:
                close_prices = batch_frame['Close']
                if isinstance(close_prices, pd.Series):
                    # a Series carries no ticker label, so file it under the
                    # first ticker of the batch
                    closing_by_ticker[batch[0]] = close_prices
                else:
                    for column_name in close_prices.columns:
                        closing_by_ticker[column_name] = close_prices[column_name]
                if verbose:
                    print(f"Successfully downloaded {len(batch)} stocks")

        except Exception as e:
            print(f"Error downloading batch {batch}: {e}")

        # pause only when another batch is still to come
        more_batches_remain = offset + batch_size < total_tickers
        if more_batches_remain:
            if verbose:
                print(f"Waiting {delay} seconds before next batch...")
            time.sleep(delay)

    return pd.DataFrame(closing_by_ticker) if closing_by_ticker else pd.DataFrame()

In [None]:
# # Download the closing prices
# closing_df = download_stocks_in_batches(
#     tickers_list, 
#     batch_size=5, 
#     delay=5
# )

# # removing unnecessary columns and rows
# closing_df.dropna(how='all', axis=1, inplace=True)
# closing_df.dropna(how='all', axis=0, inplace=True)

# # # save to pkl
# # if not closing_df.empty:
# #     closing_df.to_pickle('SSE companies closing prices.pkl')

# # cut into parts (split the columns, i.e. the tickers, into two halves)
# half = closing_df.shape[1] // 2
# closing_df1 = closing_df.iloc[:, :half]
# closing_df2 = closing_df.iloc[:, half:]

# # save as pkl files
# if not closing_df1.empty and not closing_df2.empty:
#     closing_df1.to_pickle('data/01 SSE companies closing prices.pkl')
#     closing_df2.to_pickle('data/02 SSE companies closing prices.pkl')

Note: The code above takes around 30-45 minutes to run to completion.