## Get the full raw price data from Yfinance

In [7]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime
import time
from tqdm import tqdm
import os

In [8]:
def fetch_stock_data(ticker):
    try:
        stock = yf.Ticker(ticker)
        data = stock.history(start="2010-01-01", end=datetime.now().strftime("%Y-%m-%d"))
        data = data.reset_index()
        data['Ticker'] = ticker
        return data
    except Exception as e:
        print(f"Error fetching data for {ticker}: {str(e)}")
        return pd.DataFrame()

def process_batch(tickers, batch_size=1800, time_limit=3600):
    results = []
    start_time = time.time()

    for i, ticker in enumerate(tqdm(tickers)):
        results.append(fetch_stock_data(ticker))

        if (i + 1) % batch_size == 0:
            elapsed_time = time.time() - start_time
            if elapsed_time < time_limit:
                time.sleep(time_limit - elapsed_time)
            start_time = time.time()

    return pd.concat(results, ignore_index=True)

def main():
    # Read the CSV file
    df = pd.read_csv('~/Small-Cap-Scout/raw_data/cik_ticker_pairs.csv')

    # Get the list of tickers
    tickers = df['TICKER'].tolist()

    all_data = pd.DataFrame()
    batch_size = 1800  # Slightly under 2000 to account for potential errors

    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1} of {len(tickers)//batch_size + 1}")
        batch_data = process_batch(batch)
        all_data = pd.concat([all_data, batch_data], ignore_index=True)

        # Save intermediate results
        all_data.to_csv(f'yahoo_stock_data_since_2010_batch_{i//batch_size + 1}.csv', index=False)

    # Save final results
    all_data.to_csv('yahoo_stock_data_since_2010_complete.csv', index=False)

    print("Data collection complete. Final results saved to yahoo_stock_data_since_2010_complete.csv")

if __name__ == "__main__":
    main()

Processing batch 1 of 4


  0%|          | 3/1800 [00:01<10:06,  2.96it/s]$ACET.Q: possibly delisted; no timezone found
  0%|          | 8/1800 [00:03<09:00,  3.32it/s]$BYI: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|          | 17/1800 [00:05<06:17,  4.73it/s]$IDSA: possibly delisted; no timezone found
  2%|▏         | 35/1800 [00:09<06:02,  4.87it/s]$SSI: possibly delisted; no timezone found
  2%|▏         | 38/1800 [00:10<08:40,  3.38it/s]$TREC: possibly delisted; no timezone found
  3%|▎         | 52/1800 [00:14<06:30,  4.48it/s]$AVP: possibly delisted; no timezone found
  3%|▎         | 56/1800 [00:15<07:48,  3.72it/s]$PTVC.B: possibly delisted; no timezone found
  3%|▎         | 58/1800 [00:16<10:45,  2.70it/s]$BCR: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  3%|▎         | 61/1800 [00:17<07:36,  3.81it/s]$ESTE: possibly delisted; no timezone found
  4%|▍         | 73/1800 [00:20<07:05,  4.06it/s]$BWL A: possibly delisted; no timezone found
  4%|

Processing batch 2 of 4


  0%|          | 1/1800 [00:00<24:21,  1.23it/s]$CVO: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  0%|          | 3/1800 [00:01<11:17,  2.65it/s]$ITG: possibly delisted; no timezone found
  0%|          | 5/1800 [00:04<30:37,  1.02s/it]$BVSN.Q: possibly delisted; no timezone found
  0%|          | 6/1800 [00:05<31:45,  1.06s/it]$LJPC: possibly delisted; no timezone found
  1%|          | 13/1800 [00:08<09:45,  3.05it/s]$HEOP: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|          | 14/1800 [00:08<08:51,  3.36it/s]$RBCA.A: possibly delisted; no timezone found
  1%|          | 15/1800 [00:09<15:22,  1.93it/s]$MGPC: possibly delisted; no timezone found
  1%|          | 17/1800 [00:11<17:00,  1.75it/s]$VYFC: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|          | 19/1800 [00:11<12:01,  2.47it/s]$HCBK: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|▏         | 23/1800 [00:12<0

Processing batch 3 of 4


  0%|          | 0/1800 [00:00<?, ?it/s]$OSIR: possibly delisted; no timezone found
  0%|          | 6/1800 [00:03<12:30,  2.39it/s]  $ORBC: possibly delisted; no timezone found
  0%|          | 8/1800 [00:05<16:04,  1.86it/s]$PSMH: possibly delisted; no timezone found
  1%|          | 13/1800 [00:07<12:06,  2.46it/s]$PLPM: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|          | 14/1800 [00:07<09:26,  3.15it/s]$AYR: possibly delisted; no timezone found
  1%|          | 15/1800 [00:08<15:26,  1.93it/s]$PGSI: possibly delisted; no timezone found
  1%|          | 21/1800 [00:11<10:03,  2.95it/s]$MNRK: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|▏         | 25/1800 [00:12<08:31,  3.47it/s]$LEAF: possibly delisted; no timezone found
  2%|▏         | 32/1800 [00:14<08:46,  3.36it/s]$BSHF: possibly delisted; no timezone found
  2%|▏         | 33/1800 [00:16<16:18,  1.81it/s]$AMRS.Q: possibly delisted; no timezone found
  2%|▏      

Processing batch 4 of 4


  0%|          | 0/523 [00:00<?, ?it/s]$SQBG.Q: possibly delisted; no timezone found
  2%|▏         | 13/523 [00:03<01:28,  5.75it/s]$WBT: possibly delisted; no timezone found
  3%|▎         | 14/523 [00:04<02:51,  2.97it/s]$FOCS: possibly delisted; no timezone found
  3%|▎         | 15/523 [00:05<05:03,  1.68it/s]$ELVT: possibly delisted; no timezone found
  3%|▎         | 16/523 [00:06<05:24,  1.56it/s]$ACIA: possibly delisted; no timezone found
  4%|▍         | 20/523 [00:07<02:58,  2.81it/s]$TRHC: possibly delisted; no timezone found
  4%|▍         | 21/523 [00:08<04:10,  2.01it/s]$DMTK.Q: possibly delisted; no timezone found
  5%|▍         | 25/523 [00:10<02:55,  2.83it/s]$PBBI: possibly delisted; no timezone found
  5%|▌         | 27/523 [00:11<03:21,  2.47it/s]$IEA: possibly delisted; no timezone found
  6%|▋         | 33/523 [00:13<02:10,  3.77it/s]$ASAP.Q: possibly delisted; no timezone found
  7%|▋         | 39/523 [00:15<02:00,  4.03it/s]$DCPH: possibly delisted; no timezone

Data collection complete. Final results saved to yahoo_stock_data_since_2010_complete.csv


## Check it out

In [10]:
complete_data_df = pd.read_csv('~/Small-Cap-Scout/notebooks/yahoo_stock_data_since_2010_complete.csv')

In [11]:
complete_data_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Adj Close,Capital Gains
0,2010-01-04 00:00:00-05:00,20.958153,21.482332,20.7774,21.482332,460600.0,0.0,0.0,AIR,,
1,2010-01-05 00:00:00-05:00,21.437139,21.943245,21.437139,21.898056,496300.0,0.0,0.0,AIR,,
2,2010-01-06 00:00:00-05:00,21.563669,23.470598,21.563669,23.000645,848600.0,0.0,0.0,AIR,,
3,2010-01-07 00:00:00-05:00,22.810853,23.416371,22.78374,23.362146,330700.0,0.0,0.0,AIR,,
4,2010-01-08 00:00:00-05:00,23.271768,23.570008,22.937377,22.991604,234000.0,0.0,0.0,AIR,,


In [12]:
complete_data_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Adj Close,Capital Gains
12886355,2024-08-28 00:00:00-04:00,7.05,7.075,6.69,6.83,625300.0,0.0,0.0,RLAY,,
12886356,2024-08-29 00:00:00-04:00,6.94,7.07,6.74,6.76,690100.0,0.0,0.0,RLAY,,
12886357,2024-08-30 00:00:00-04:00,6.84,7.0,6.61,6.79,1166800.0,0.0,0.0,RLAY,,
12886358,2024-09-03 00:00:00-04:00,6.76,7.0,6.47,6.47,702600.0,0.0,0.0,RLAY,,
12886359,2024-09-04 00:00:00-04:00,6.45,6.62,6.21,6.35,876200.0,0.0,0.0,RLAY,,


In [13]:
complete_data_df.shape

(12886360, 11)

In [14]:
count_true = complete_data_df.isna().any(axis=1).sum()
count_true

np.int64(12886360)

## Get the monthly averages and volatility

In [8]:
all_data = pd.read_csv('~/Small-Cap-Scout/notebooks/yahoo_stock_data_since_2010_complete.csv', parse_dates=['Date'])

# Ensure the 'Date' column is in datetime format, removing the timezone information
all_data['Date'] = pd.to_datetime(all_data['Date'], utc=True).dt.tz_convert(None)

# Define the date range (modify as needed)
start_date = '2010-01-01'
end_date = datetime.today()

# Filter the data by the specified date range
all_data = all_data[(all_data['Date'] >= start_date) & (all_data['Date'] <= end_date)]

# Initialize a list to hold the processed data
processed_data = []

# Loop through each unique ticker in the data
for ticker in all_data['Ticker'].unique():
    # Filter the data for the current ticker
    df = all_data[all_data['Ticker'] == ticker]

    # Ensure the dataframe is not empty after filtering
    if not df.empty:
        # Calculate the monthly average prices
        df['Month'] = df['Date'].dt.to_period('M')
        monthly_avg = df.groupby('Month')['Close'].mean().reset_index()
        monthly_avg.rename(columns={'Close': 'Monthly_Avg_Close'}, inplace=True)

        # Calculate the monthly total volume
        monthly_volume_total = df.groupby('Month')['Volume'].sum().reset_index()
        monthly_volume_total.rename(columns={'Volume': 'Monthly_Volume_Total'}, inplace=True)

        # Calculate the monthly average volume
        monthly_volume_avg = df.groupby('Month')['Volume'].mean().reset_index()
        monthly_volume_avg.rename(columns={'Volume': 'Monthly_Volume_Avg'}, inplace=True)

        # Calculate the monthly volatility
        df['Log_Return'] = df['Close'].pct_change().apply(lambda x: np.log(1 + x))

        monthly_volatility = df.groupby('Month')['Log_Return'].std().reset_index()

        # Dynamically calculate the number of months for volatility scaling
        num_days = df.groupby(['Month'])['Log_Return'].count()
        #if num_days > 0:
        monthly_volatility['Monthly_Volatility'] = monthly_volatility['Log_Return'] * np.sqrt(num_days)
        #else:
         #   monthly_volatility['Monthly_Volatility'] = np.nan  # In case of no data

        monthly_volatility.drop(columns=['Log_Return'], inplace=True)

        # Merge the monthly average, total volume, average volume, and volatility data
        result = pd.merge(monthly_avg, monthly_volume_total, on='Month', how='inner')
        result = pd.merge(result, monthly_volume_avg, on='Month', how='inner')
        result = pd.merge(result, monthly_volatility, on='Month', how='inner')
        result['Ticker'] = ticker

        # Rearrange columns to put 'Ticker' first
        column_order = ['Ticker'] + [col for col in result.columns if col != 'Ticker']
        result = result[column_order]

        # Append the result to the list
        processed_data.append(result)

# Check if there's any data to concatenate
if processed_data:
    # Concatenate all data into a single dataframe
    final_df = pd.concat(processed_data, ignore_index=True)

    # Save the final dataframe to a CSV file
    output_file = os.path.expanduser('~/Small-Cap-Scout/raw_data/processed_yahoo_data_revised_volatility.csv')
    final_df.to_csv(output_file, index=False)
    print(f"Data processing complete. Results saved to {output_file}. WHOOP WHOOP!")

    # Optionally, display the first few rows of the final DataFrame
    print("First 5 rows of the processed data:")
    print(final_df.head())
else:
    print("No data to process. Please check your files and filters, por favor.")

# Load the processed data into a DataFrame to inspect
processed_df = pd.read_csv(output_file)

# Display the last 5 rows of the DataFrame
print(processed_df.tail())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Month'] = df['Date'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Log_Return'] = df['Close'].pct_change().apply(lambda x: np.log(1 + x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Month'] = df['Date'].dt.to_period('M')
A value is trying to be set on a copy of a s

Data processing complete. Results saved to /Users/eoingaynard/Small-Cap-Scout/raw_data/processed_yahoo_data_revised_volatility.csv. WHOOP WHOOP!
First 5 rows of the processed data:
  Ticker    Month  Monthly_Avg_Close  Monthly_Volume_Total  \
0    AIR  2010-01          21.941817             6383700.0   
1    AIR  2010-02          20.758845             5145100.0   
2    AIR  2010-03          22.245808             6584700.0   
3    AIR  2010-04          22.615037             5569000.0   
4    AIR  2010-05          20.316029             9641700.0   

   Monthly_Volume_Avg  Monthly_Volatility  
0       335984.210526                 NaN  
1       270794.736842                 NaN  
2       286291.304348                 NaN  
3       265190.476190                 NaN  
4       482085.000000                 NaN  
       Ticker    Month  Monthly_Avg_Close  Monthly_Volume_Total  \
612199   RLAY  2024-05           6.722045            27627900.0   
612200   RLAY  2024-06           7.089737       

In [9]:
processed_df.head()


Unnamed: 0,Ticker,Month,Monthly_Avg_Close,Monthly_Volume_Total,Monthly_Volume_Avg,Monthly_Volatility
0,AIR,2010-01,21.941817,6383700.0,335984.210526,
1,AIR,2010-02,20.758845,5145100.0,270794.736842,
2,AIR,2010-03,22.245808,6584700.0,286291.304348,
3,AIR,2010-04,22.615037,5569000.0,265190.47619,
4,AIR,2010-05,20.316029,9641700.0,482085.0,


In [10]:
processed_df.shape

(612204, 6)

In [11]:
processed_df.tail()


Unnamed: 0,Ticker,Month,Monthly_Avg_Close,Monthly_Volume_Total,Monthly_Volume_Avg,Monthly_Volatility
612199,RLAY,2024-05,6.722045,27627900.0,1255814.0,
612200,RLAY,2024-06,7.089737,29045000.0,1528684.0,
612201,RLAY,2024-07,7.965682,28054700.0,1275214.0,
612202,RLAY,2024-08,6.976364,18659200.0,848145.5,
612203,RLAY,2024-09,6.41,1578800.0,789400.0,
