### Optional Download (You shouldn't need to do this)

In [None]:
# uncomment to download data. 
# Not needed if you have the .parquet file in the data/historical folder

# !python qualifier\utils\download_stock_data.py

### Load Dataset from file

In [1]:
import pandas as pd
import numpy as np

# Load the Parquet file
df = pd.read_parquet("data/historical/all_stocks_historical.parquet")

print("Dataset loaded")

print(f"\nShape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nData types:\n{df.dtypes}")


Dataset loaded

Shape: (89859344, 7)
Columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']

Data types:
ticker            object
date      datetime64[us]
open             float64
high             float64
low              float64
close            float64
volume           float64
dtype: object


In [2]:
# asic info about the dataset
print("=== Dataset Overview ===")
print(f"Total rows: {len(df):,}")
print(f"Unique tickers: {df['ticker'].nunique():,}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Total trading days: {(df['date'].max() - df['date'].min()).days} days")
print(f"\nMissing values:\n{df.isnull().sum()}")

=== Dataset Overview ===
Total rows: 89,859,344
Unique tickers: 7,144
Date range: 1962-01-02 00:00:00 to 2025-11-13 00:00:00
Total trading days: 23326 days

Missing values:
ticker           0
date             0
open      63587034
high      63587034
low       63587034
close     63587033
volume    63587031
dtype: int64


### Trimming data to start in 2015 so we dont drop all stocks when we clean.

In [3]:
# triom data to start on 2015-01-01
df = df[df['date'] >= '2015-01-01']

# remove nans and then check missing values again
df = df.dropna()
print(f"Total rows: {len(df):,}")
print(f"Unique tickers: {df['ticker'].nunique():,}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Total trading days: {(df['date'].max() - df['date'].min()).days} days")
print(f"\nMissing values:\n{df.isnull().sum()}")


Total rows: 12,711,497
Unique tickers: 7,093
Date range: 2015-01-02 00:00:00 to 2025-11-13 00:00:00
Total trading days: 3968 days

Missing values:
ticker    0
date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64


In [5]:
# Basic statistics
print("=== Descriptive Statistics ===")
df[['ticker', 'open', 'high', 'low', 'close', 'volume']].describe()


=== Descriptive Statistics ===


Unnamed: 0,open,high,low,close,volume
count,12711500.0,12711500.0,12711500.0,12711500.0,12711500.0
mean,64382590.0,67478670.0,60860880.0,63682100.0,1441609.0
std,10899880000.0,11424050000.0,10317390000.0,10784330000.0,9907111.0
min,-2783.0,-2865.5,-2704.9,-2788.5,0.0
25%,9.365682,9.53,9.193228,9.359791,24300.0
50%,21.12367,21.4375,20.81005,21.12,172900.0
75%,48.78992,49.45642,48.0933,48.77124,818700.0
max,3614625000000.0,4662562000000.0,3417188000000.0,3599437000000.0,3692928000.0
