In [None]:
# Install pandas_ta (imported below as `ta`). No version is pinned here,
# so re-running this cell later may pull a different release.
%pip install pandas_ta

Note: you may need to restart the kernel to use updated packages.


In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
from datetime import datetime
import os
import matplotlib.pyplot as plt
import pandas_ta as ta
from scipy import stats

# 1. Data Collection

The whole dataset is retrieved with the `yfinance` package.

The data was retrieved on 11/14/2024; the requested time frame is 5 years.

In [27]:
def download_data(ticker, ideal_period='5y'):
  """
  Download daily price history for one ticker, preferring the longest period.

  The periods are tried in order: `ideal_period`, then "2y", then "1y".
  yfinance typically reports an unavailable period or ticker by returning an
  empty DataFrame instead of raising, so an empty (or None) result is treated
  as a failure and the next, shorter period is tried. Exceptions raised by
  the download are handled the same way.

  Args:
    ticker: Symbol passed to `yf.download`.
    ideal_period: Period string tried first (default '5y').

  Returns:
    The first non-empty DataFrame returned by `yf.download`, or None when
    every period fails.
  """

  # dict.fromkeys drops duplicates while preserving order, so passing
  # ideal_period="2y" (or "1y") does not request the same period twice.
  periods = list(dict.fromkeys([ideal_period, "2y", "1y"]))
  for period in periods:
    try:
      data = yf.download(ticker, period=period, interval='1d')
    except Exception:
      data = None

    # An empty frame means "no data for this period", not success.
    if data is not None and not data.empty:
      return data

    print(f"{ticker}: Period '{period}' unavailable, trying next period")

  print(f"Failed to download {ticker} data.")
  return None

In [28]:
def collect_and_save_data(ticker_list, output_file_path):
  """
  Download price data for each ticker and write one combined CSV.

  Each ticker is fetched with `download_data`; tickers that return None or
  an empty frame are skipped. Columns are matched by name, never by
  position, so the output stays correctly labelled whatever column order or
  column nesting (flat vs. MultiIndex) the download returns. A price field
  missing from a download (e.g. 'Adj Close') is written as an empty column
  so the CSV schema stays stable.

  Args:
    ticker_list: Iterable of ticker symbols.
    output_file_path: Destination CSV path. Its parent directory is created
      if needed; a bare file name writes to the current directory.

  Output columns: Date, Ticker, Open, High, Low, Close, Adj Close, Volume.
  """

  column_order = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
  all_data = []

  for ticker in ticker_list:
    ticker_data = download_data(ticker)
    if ticker_data is None or ticker_data.empty:
      continue

    ticker_data = ticker_data.copy()  # never mutate the caller's frame
    if isinstance(ticker_data.columns, pd.MultiIndex):
      # Keep only the level holding the price-field names (the one that
      # contains 'Close'); the other level just repeats the ticker symbol.
      for level in range(ticker_data.columns.nlevels):
        labels = ticker_data.columns.get_level_values(level)
        if 'Close' in labels:
          ticker_data.columns = labels
          break
      else:
        ticker_data.columns = ticker_data.columns.get_level_values(0)

    # Turn the date index into a regular 'Date' column.
    ticker_data = ticker_data.rename_axis('Date').reset_index()
    ticker_data['Ticker'] = ticker
    # reindex selects by name and fills absent fields with NaN instead of raising.
    all_data.append(ticker_data.reindex(columns=column_order))

  if all_data:
    combined_data = pd.concat(all_data, axis=0, ignore_index=True) # combine all data downloaded

    output_dir = os.path.dirname(output_file_path)
    if output_dir:  # os.makedirs('') raises, so skip it for bare file names
      os.makedirs(output_dir, exist_ok=True)

    combined_data.to_csv(output_file_path, index=False)
    print(f"Data successfully saved to {output_file_path}")
  else:
    print("No data to save.")

In [29]:
# Example usage
# Ticker symbols to download; each one is fetched separately by
# collect_and_save_data via download_data.
tickers_list = [
    'CI', 'WST', 'QGEN', 'PODD', 'GMED', 'CTLT', 'PEN', 'INCY', 'VKTX', 'MTD', 'LLY',
    'VEEV', 'ALNY', 'IQV', 'ITCI', 'TMO', 'CNC', 'RVMD', 'HUM', 'JAZZ', 'DXCM', 'BDX',
    'INSM', 'BIO', 'WAT', 'ABBV', 'MRNA', 'CRL', 'ABT', 'ENSG', 'RGEN', 'ICLR', 'SOLV',
    'CHE', 'HQY', 'AMGN', 'STE', 'WBA', 'BSX', 'MDT', 'COR', 'BMRN', 'ALGN', 'COO', 'DHR',
    'SMMT', 'ELV', 'ZTS', 'VTRS', 'RPRX', 'RMD', 'SRPT', 'LNTH', 'BMY', 'A', 'ZBH', 'PCVX',
    'EXAS', 'ISRG', 'DVA', 'ILMN', 'ROIV', 'TECH', 'GILD', 'IDXX', 'EW', 'AVTR', 'MOH', 'UTHR',
    'NTRA', 'EXEL', 'MRK', 'BIIB', 'HSIC', 'BAX', 'ATR', 'TFX', 'VRTX', 'CVS', 'UHS', 'EHC', 'MCK',
    'GEHC', 'BRKR', 'JNJ', 'NBIX', 'RVTY', 'HCA', 'UNH', 'THC', 'SYK', 'LH', 'DOCS', 'HOLX', 'DGX',
    'REGN', 'MEDP', 'CAH', 'MASI', 'PFE'
]

# Relative path: resolved against the kernel's current working directory.
output_file_path = '../output/corps_ori_data.csv'
# Downloads every ticker above and writes the combined rows to the CSV.
collect_and_save_data(tickers_list, output_file_path)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Data successfully saved to ../output/corps_ori_data.csv
