In [1]:
# 📦 Import libraries
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_ta as ta

# 📊 Set plot style
sns.set(style="whitegrid")

# 📈 Step 1: Download historical stock data (clean single-level format)
ticker = "AAPL"
start_date = "2020-01-01"
end_date = "2024-12-31"
data = yf.download(ticker, start=start_date, end=end_date)

# 📛 Rename columns to standard lowercase names
data.columns = ['open', 'high', 'low', 'close', 'volume']

# 🛠 Step 2: Flatten MultiIndex columns if present (fix for yfinance format)
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(1)

# Rename columns to lowercase for consistency
data.columns = [col.lower() for col in data.columns]

# ✅ Preview cleaned columns
print("Cleaned columns:", data.columns.tolist())

# 🔁 Step 3: Calculate Daily Return
data['daily_return'] = data['close'].pct_change()

# 🧠 Step 4: Add Technical Indicators
data['SMA_20'] = ta.sma(data['close'], length=20)
data['SMA_50'] = ta.sma(data['close'], length=50)
data['RSI'] = ta.rsi(data['close'], length=14)

# MACD
macd = ta.macd(data['close'])
if macd is not None:
    data['MACD'] = macd['MACD_12_26_9']
    data['Signal_Line'] = macd['MACDs_12_26_9']

# Bollinger Bands
bb = ta.bbands(data['close'])

if bb is not None and bb.shape[1] >= 3:
    # Automatically get the first 3 columns (Upper, Middle, Lower)
    data['BB_upper'] = bb.iloc[:, 0]
    data['BB_middle'] = bb.iloc[:, 1]
    data['BB_lower'] = bb.iloc[:, 2]
else:
    print("⚠️ Bollinger Bands not returned correctly.")

# ✨ Step 5: Create ML Labels (Buy/Sell/Hold)
# Look ahead 3 trading days to capture momentum
lookahead_days = 3
data['future_return'] = data['close'].shift(-lookahead_days) / data['close'] - 1

# Set wider thresholds to catch real movements
def classify_signal(x):
    if x > 0.02:
        return 1   # Buy
    elif x < -0.02:
        return -1  # Sell
    else:
        return 0   # Hold

data['Signal'] = data['future_return'].apply(classify_signal)

# 🧹 Step 6: Final cleanup — drop rows with missing features or labels
required_cols = ['SMA_20', 'SMA_50', 'RSI', 'MACD', 'BB_upper', 'BB_middle', 'BB_lower', 'Signal']
available = [col for col in required_cols if col in data.columns]
data.dropna(subset=available, inplace=True)

# Convert target to integer
data['Signal'] = data['Signal'].astype(int)

# 💾 Step 7: Save the final processed dataset to CSV
data.to_csv('../data/final_stock_data.csv')
print("✅ Final dataset saved to /data/final_stock_data.csv")
print("📊 Final shape:", data.shape)


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Cleaned columns: ['open', 'high', 'low', 'close', 'volume']
✅ Final dataset saved to /data/final_stock_data.csv
📊 Final shape: (1208, 16)



