### Import libraries and packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import requests
from bs4 import BeautifulSoup
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import os
import re
from typing import Optional, Tuple, Dict

In [2]:
# Initialize sentiment analyzer once
try:
    nltk.data.find('vader_lexicon')
except LookupError:
    nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gauravkhanal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Constants
TRANSACTION_COST = 0.001  # 0.1% transaction cost

In [4]:
def fetch_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch historical stock data from Yahoo Finance."""
    data = yf.download(ticker, start=start_date, end=end_date, progress=False)
    if data.empty:
        raise ValueError(f"No stock data found for ticker {ticker}")
    return data

In [5]:
def clean_text(text: str) -> str:
    """Preprocess text for sentiment analysis."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    return text.strip()

In [6]:
def generate_synthetic_news(ticker: str, days: int = 30) -> pd.DataFrame:
    """Generate synthetic news data for demonstration."""
    dates = pd.date_range(end=dt.datetime.now(), periods=days).tolist()
    headlines = [
        f"{ticker} announces new product",
        f"{ticker} beats earnings expectations",
        f"{ticker} stock downgraded by analysts",
        f"{ticker} expands into new markets",
        f"{ticker} faces regulatory challenges",
        f"{ticker} reports strong quarterly growth",
        f"{ticker} CEO steps down",
        f"{ticker} partners with major industry player",
        f"{ticker} cuts revenue forecast",
        f"{ticker} announces layoffs"
    ]
    
    news_data = []
    for date in dates:
        daily_headlines = np.random.choice(
            headlines, 
            size=np.random.randint(0, 4), 
            replace=False
        ).tolist()
        
        for headline in daily_headlines:
            news_data.append({
                'date': date.strftime('%Y-%m-%d'),
                'headline': headline,
                'clean_headline': clean_text(headline)
            })
    
    return pd.DataFrame(news_data)

In [7]:
def analyze_sentiment(news_df: pd.DataFrame) -> pd.DataFrame:
    """Analyze sentiment of news headlines."""
    sentiment_cols = ['compound_score', 'positive_score', 'negative_score', 'neutral_score']
    
    for col in sentiment_cols:
        if col not in news_df.columns:
            news_df[col] = np.nan
            
    for idx, row in news_df.iterrows():
        scores = sia.polarity_scores(row['clean_headline'])
        news_df.at[idx, 'compound_score'] = scores['compound']
        news_df.at[idx, 'positive_score'] = scores['pos']
        news_df.at[idx, 'negative_score'] = scores['neg']
        news_df.at[idx, 'neutral_score'] = scores['neu']
    
    # Aggregate by date and add rolling average
    sentiment_by_date = news_df.groupby('date').agg({
        'compound_score': 'mean',
        'positive_score': 'mean',
        'negative_score': 'mean',
        'neutral_score': 'mean',
        'headline': 'count'
    }).rename(columns={'headline': 'news_count'})
    
    sentiment_by_date['rolling_compound'] = (
        sentiment_by_date['compound_score']
        .rolling(window=3, min_periods=1)
        .mean()
    )
    
    return sentiment_by_date

In [None]:
def generate_signals(stock_df: pd.DataFrame, 
                    sentiment_df: pd.DataFrame,
                    threshold: float = 0.2,
                    use_rolling: bool = True) -> pd.DataFrame:
    """Generate trading signals by merging stock and sentiment data."""
    # Make copies of input DataFrames
    stock_data = stock_df.copy()
    sentiment_data = sentiment_df.copy()
    
    # Convert indices to datetime and ensure single-level index
    stock_data.index = pd.to_datetime(stock_data.index)
    sentiment_data.index = pd.to_datetime(sentiment_data.index)
    
    # If stock_data has MultiIndex, convert to single level
    if isinstance(stock_data.index, pd.MultiIndex):
        stock_data = stock_data.reset_index(level=1, drop=True)
    
    # If sentiment_data has MultiIndex, convert to single level
    if isinstance(sentiment_data.index, pd.MultiIndex):
        sentiment_data = sentiment_data.reset_index(level=1, drop=True)
    
    # Merge data using join (more robust for index merging)
    merged = stock_data.join(sentiment_data, how='left')
    
    # Forward fill sentiment scores
    sentiment_cols = ['compound_score', 'positive_score', 
                     'negative_score', 'neutral_score', 'rolling_compound']
    merged[sentiment_cols] = merged[sentiment_cols].fillna(method='ffill')
    
    # Generate signals
    sentiment_col = 'rolling_compound' if use_rolling else 'compound_score'
    merged['signal'] = 0
    merged.loc[merged[sentiment_col] > threshold, 'signal'] = 1
    merged.loc[merged[sentiment_col] < -threshold, 'signal'] = -1
    
    # Calculate returns
    merged['returns'] = merged['Adj Close'].pct_change()
    merged['strategy_returns'] = (
        merged['signal'].shift(1) * merged['returns'] - 
        abs(merged['signal'].diff()) * TRANSACTION_COST
    )
    
    return merged

In [29]:
def backtest_strategy(signals_df: pd.DataFrame) -> Dict:
    """Calculate performance metrics from trading signals."""
    signals_df['cumulative_returns'] = (1 + signals_df['returns']).cumprod() - 1
    signals_df['cumulative_strategy_returns'] = (
        (1 + signals_df['strategy_returns'].fillna(0)).cumprod() - 1
    )
    
    total_days = len(signals_df)
    positive_returns = signals_df['strategy_returns'] > 0
    negative_returns = signals_df['strategy_returns'] < 0
    
    annualized_return = (
        (1 + signals_df['cumulative_strategy_returns'].iloc[-1]) ** 
        (365 / total_days) - 1
        if total_days > 0 else 0
    )
    
    downside_returns = signals_df.loc[negative_returns, 'strategy_returns']
    sortino_ratio = (
        annualized_return / (downside_returns.std() * np.sqrt(252))
        if len(downside_returns) > 0 else 0
    )
    
    return {
        'total_days': total_days,
        'profitable_days': positive_returns.sum(),
        'win_rate': positive_returns.mean(),
        'annualized_return': annualized_return,
        'cumulative_return': signals_df['cumulative_strategy_returns'].iloc[-1],
        'buy_hold_return': signals_df['cumulative_returns'].iloc[-1],
        'max_drawdown': (
            signals_df['cumulative_strategy_returns'].cummax() - 
            signals_df['cumulative_strategy_returns']).max(),
        'volatility': signals_df['strategy_returns'].std() * np.sqrt(252),
        'sharpe_ratio': (
            signals_df['strategy_returns'].mean() / 
            signals_df['strategy_returns'].std()) * np.sqrt(252) 
            if signals_df['strategy_returns'].std() > 0 else 0,
        'sortino_ratio': sortino_ratio
    }

In [30]:
def plot_results(ticker: str, signals_df: pd.DataFrame) -> plt.Figure:
    """Visualize trading signals and performance."""
    fig, axes = plt.subplots(3, 1, figsize=(14, 18), 
                           gridspec_kw={'height_ratios': [3, 1, 2]})
    
    # Price and sentiment
    ax1 = axes[0]
    ax1.set_title(f'{ticker} Price and Sentiment')
    ax1.plot(signals_df.index, signals_df['Adj Close'], label='Price', color='blue')
    ax1.set_ylabel('Price ($)', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    
    ax1b = ax1.twinx()
    scatter = ax1b.scatter(
        signals_df.index, 
        signals_df['rolling_compound'],
        c=signals_df['rolling_compound'], 
        cmap='RdYlGn', 
        alpha=0.6
    )
    ax1b.set_ylabel('Sentiment Score', color='green')
    ax1b.tick_params(axis='y', labelcolor='green')
    plt.colorbar(scatter, ax=ax1b).set_label('Sentiment Score')
    
    # Trading signals
    ax2 = axes[1]
    ax2.set_title('Trading Signals')
    ax2.plot(signals_df.index, signals_df['signal'], 'o-', markersize=4)
    ax2.set_ylabel('Signal (1=Buy, -1=Sell)')
    ax2.set_ylim([-1.5, 1.5])
    ax2.grid(True)
    
    # Performance
    ax3 = axes[2]
    ax3.set_title('Strategy Performance')
    ax3.plot(signals_df.index, signals_df['cumulative_returns'], label='Buy & Hold')
    ax3.plot(signals_df.index, signals_df['cumulative_strategy_returns'], label='Strategy')
    ax3.set_ylabel('Cumulative Returns')
    ax3.legend()
    ax3.grid(True)
    
    plt.tight_layout()
    return fig

In [31]:
def run_analysis(ticker: str, 
                start_date: str, 
                end_date: str,
                threshold: float = 0.2) -> Tuple[pd.DataFrame, Dict, plt.Figure]:
    """Complete analysis pipeline."""
    print(f"Analyzing {ticker} from {start_date} to {end_date}")
    
    # 1. Fetch data
    stock_data = fetch_stock_data(ticker, start_date, end_date)
    news_data = generate_synthetic_news(ticker)
    
    # 2. Analyze sentiment
    sentiment_scores = analyze_sentiment(news_data)
    
    # 3. Generate signals
    signals_df = generate_signals(stock_data, sentiment_scores, threshold)
    
    # 4. Backtest
    metrics = backtest_strategy(signals_df)
    
    # 5. Visualize
    fig = plot_results(ticker, signals_df)
    
    return signals_df, metrics, fig

In [32]:
# Example usage
if __name__ == "__main__":
    signals, metrics, fig = run_analysis(
        ticker="AAPL",
        start_date="2023-01-01",
        end_date="2023-12-31",
        threshold=0.2
    )
    
    print("\nPerformance Metrics:")
    for k, v in metrics.items():
        print(f"{k:>20}: {v:.4f}")
    
    plt.show()

Analyzing AAPL from 2023-01-01 to 2023-12-31


MergeError: Not allowed to merge between different levels. (2 levels on the left, 1 on the right)