In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Notebook-wide pandas rendering: never truncate columns, and wrap at 120 chars
pd.options.display.max_columns = None
pd.options.display.width = 120

In [2]:
# Paths (relative to the notebook's working directory)
RAW_DATA_DIR = Path("../data/raw")              # input CSVs are read from here
PROCESSED_DATA_DIR = Path("../data/processed")  # cleaned CSVs are written here

# parents=True so a fresh checkout without ../data does not raise
# FileNotFoundError; exist_ok=True keeps re-running this cell harmless.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Helper function
def clean_market_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a market-data frame, ordered by date.

    Steps, in order:
      1. Parse the 'Date' column to datetime; unparseable values become NaT.
      2. Drop rows whose date could not be parsed.
      3. Sort by 'Date' in ascending order.
      4. Keep only the first row for each distinct date.
      5. Forward-fill remaining missing values in every column.

    The input frame is not modified; the original row index labels are kept
    on the returned frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'Date' column.

    Returns
    -------
    pd.DataFrame
        A new, cleaned frame with the same columns as ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no 'Date' column.
    """
    parsed_dates = pd.to_datetime(df['Date'], errors='coerce')

    return (
        # assign() builds a new frame, so the caller's df is left untouched
        df.assign(Date=parsed_dates)
        .dropna(subset=['Date'])
        # A stable sort keeps same-date rows in their input order, so the
        # row retained by drop_duplicates below is deterministic.
        .sort_values('Date', kind='stable')
        .drop_duplicates(subset=['Date'])
        # Forward fill; leading gaps have no earlier value and stay missing
        .ffill()
    )

In [4]:
# Raw CSVs to process; each is cleaned and written under the same file name
market_files = [
    "AAPL_market.csv",
    "IBEX35_market.csv",
    "MSFT_market.csv",
    "NASDAQ_market.csv",
    "S&P500_market.csv",
]

for file in market_files:
    file_path = RAW_DATA_DIR / file

    # Report missing inputs and move on to the next file
    if not file_path.exists():
        print(f"File not found: {file}")
        continue

    print(f"Cleaning {file_path}...")
    df = pd.read_csv(file_path)
    df_clean = clean_market_data(df)

    # Write the cleaned frame without the DataFrame index
    output_path = PROCESSED_DATA_DIR / file
    df_clean.to_csv(output_path, index=False)
    print(f"Saved cleaned file to {output_path}")

Cleaning ..\data\raw\AAPL_market.csv...
Saved cleaned file to ..\data\processed\AAPL_market.csv
Cleaning ..\data\raw\IBEX35_market.csv...
Saved cleaned file to ..\data\processed\IBEX35_market.csv
Cleaning ..\data\raw\MSFT_market.csv...
Saved cleaned file to ..\data\processed\MSFT_market.csv
Cleaning ..\data\raw\NASDAQ_market.csv...
Saved cleaned file to ..\data\processed\NASDAQ_market.csv
Cleaning ..\data\raw\S&P500_market.csv...
Saved cleaned file to ..\data\processed\S&P500_market.csv


In [5]:
# Spot-check: reload one cleaned file and show its first rows
sample_df = pd.read_csv(PROCESSED_DATA_DIR.joinpath("AAPL_market.csv"))
sample_df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Ticker
0,2015-01-02,24.288582,24.757336,23.848707,24.746228,212818400,AAPL
1,2015-01-05,23.604336,24.137516,23.417723,24.057539,257142000,AAPL
2,2015-01-06,23.606554,23.866479,23.244435,23.668758,263188400,AAPL
3,2015-01-07,23.937574,24.037545,23.704307,23.815387,160423600,AAPL
4,2015-01-08,24.857307,24.915069,24.148621,24.266367,237458000,AAPL
