In [None]:
import pandas as pd
import os
from datetime import datetime


In [None]:
def read_files():
    """
    Load the current and previous SPY CSV files and stack them into one frame.

    Returns:
        pd.DataFrame: Rows of 'data/spy.csv' followed by rows of
            'data/spy_prev.csv', re-indexed from 0
    """
    sources = ('data/spy.csv', 'data/spy_prev.csv')
    frames = [pd.read_csv(path) for path in sources]
    # ignore_index discards each file's own row labels so the result gets one
    # continuous index
    return pd.concat(frames, ignore_index=True)

In [None]:

def split_spy_data_by_year(df, output_dir="data"):
    """
    Split SPY data by calendar year and save each year to a separate CSV file.

    Each year's rows are sorted by ('date', 'minute'), de-duplicated on that
    pair (the first occurrence after sorting is kept) and written without the
    index to ``<output_dir>/spy_<year>.csv``. The 'date' column in the written
    files holds the parsed datetime values. The caller's DataFrame is not
    modified. Rows whose date is missing cannot be assigned to a year and are
    skipped. Progress and a short summary are printed to stdout.

    Args:
        df (pd.DataFrame): DataFrame containing the SPY data; must have
            'date' and 'minute' columns
        output_dir (str): Directory to save the split files (created if it
            does not exist)

    Raises:
        KeyError: If 'date' or 'minute' is missing from ``df``
    """
    # Fail before touching the filesystem if the required columns are absent
    missing = [col for col in ("date", "minute") if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # assign() returns a new frame, so the parsed dates never leak back into
    # the caller's DataFrame
    data = df.assign(date=pd.to_datetime(df["date"]))
    dates = data["date"]

    # Year of every row; this becomes float (with NaN) when any date is
    # missing, hence the explicit int() conversions below
    row_years = dates.dt.year.rename("year")
    years = sorted(int(year) for year in row_years.dropna().unique())
    print(f"Found data for years: {years}")

    skipped = int(dates.isna().sum())
    if skipped:
        print(f"Skipped {skipped:,} rows with missing dates")

    # Grouping by an external key splits the data in a single pass and needs
    # no temporary 'year' column; groupby drops NaN keys by default
    for year, year_data in data.groupby(row_years):
        # distinct and sort year data by date and minute
        year_data = year_data.sort_values(by=["date", "minute"]).drop_duplicates(
            subset=["date", "minute"]
        )

        output_file = os.path.join(output_dir, f"spy_{int(year)}.csv")
        year_data.to_csv(output_file, index=False)

        print(f"Saved {len(year_data):,} rows for year {int(year)} to {output_file}")

    print("\nData splitting completed!")

    # Summary statistics
    print("\nSummary:")
    print(f"Total rows processed: {len(df):,}")
    if years:
        print(f"Date range: {dates.min().date()} to {dates.max().date()}")
    else:
        # min()/max() are NaT here and NaT has no usable .date()
        print("Date range: n/a (no rows with a valid date)")
    print(f"Number of years: {len(years)}")

# Script entry point: load the combined SPY data and write one CSV per year
# into the default output directory ("data").
if __name__ == "__main__":
    df = read_files()
    split_spy_data_by_year(df)
