# In [None]:
import os
import re
import numpy as np
import pandas as pd
import warnings
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from numpy.random import dirichlet
from scipy.optimize import minimize
import yfinance as yf

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Silence every warning for this run. NOTE: this blanket filter also hides
# diagnostics from pandas / SciPy / scikit-learn, not only cosmetic noise.
warnings.filterwarnings('ignore')

# Download the NLTK resources used by the text pre-processing below.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize(); releases that do not know this id just report the failed
# download (nltk.download does not raise by default) and keep using 'punkt'.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Options for optimization: this dict is forwarded unchanged as `options=` to
# scipy.optimize.minimize(..., method='SLSQP') in optimize_portfolio_lr().
# NOTE(review): 'gtol' does not appear among SLSQP's documented options; SciPy
# presumably just warns about it (and warnings are silenced above) — confirm
# whether it is still needed.
options = {
    'maxiter': 3000,
    'ftol': 1e-7,
    'gtol': 1e-7,
    'disp': False,
    'eps': 1e-7
}

##############################################
#         Data Loading & Global Setup        #
##############################################

# Load news data – CSV must have columns: 'ticker', 'date', and 'title'
df = pd.read_csv('news_data.csv')
# Normalise df['date'] to timezone-aware UTC. utc=True converts aware
# timestamps to UTC and interprets naive ones as UTC, whereas the previous
# `.dt.tz_convert('UTC')` raised TypeError when the CSV held naive timestamps.
df['date'] = pd.to_datetime(df['date'], utc=True)

# Cache news data grouped by ticker (each group sorted by date) for fast slicing later
news_cache = {ticker: group.sort_values('date') for ticker, group in df.groupby('ticker')}

# Folder holding the portfolio definitions: every '*.txt' file in it is
# treated as one portfolio (a list of tickers).
portfolio_folder = "portfolios"
if not os.path.exists(portfolio_folder):
    # Missing folder is not fatal: report it and continue with no portfolios.
    print(f"Portfolio folder '{portfolio_folder}' not found.")
    portfolio_files = []
else:
    portfolio_files = [
        os.path.join(portfolio_folder, entry)
        for entry in os.listdir(portfolio_folder)
        if entry.endswith('.txt')
    ]

# Global cache for returns per ticker (to avoid repeated yfinance calls).
# Keyed by the ticker symbol; filled lazily by get_cached_returns().
returns_cache: dict = {}

def _prices_to_returns(data):
    """Convert a yfinance price frame into a Series of daily percentage returns.

    Prefers the 'Adj Close' column and falls back to 'Close'. Returns an empty
    Series when neither column exists or when there are no prices.
    """
    if 'Adj Close' in data.columns:
        prices = data['Adj Close']
    elif 'Close' in data.columns:
        prices = data['Close']
    else:
        return pd.Series(dtype=float)
    if isinstance(prices, pd.DataFrame):
        # With MultiIndex columns (price field, ticker) the selection above is
        # a frame holding one column per ticker; a single ticker was requested,
        # so reduce it to that one Series.
        if prices.shape[1] == 0:
            return pd.Series(dtype=float)
        prices = prices.iloc[:, 0]
    if prices.empty:
        return pd.Series(dtype=float)
    return prices.pct_change().dropna()


def get_cached_returns(ticker, global_start, global_end):
    """Return the daily percentage returns for ``ticker``, downloading them once.

    The first request for a ticker downloads its prices with
    ``yf.download(start=global_start, end=global_end)`` and stores the
    resulting returns in ``returns_cache``. The cache is keyed by ticker only,
    so ``global_start`` / ``global_end`` matter only for that first download.

    "No data" outcomes (missing price column, empty history) are cached as an
    empty Series so that later calls do not hit the network again for the same
    ticker. Exceptions are reported and NOT cached, so a later call retries.

    Returns:
        The returns Series, or ``None`` when no returns are available.
    """
    if ticker not in returns_cache:
        # Dots in the symbol are replaced by dashes for the Yahoo lookup.
        yf_ticker = ticker.replace('.', '-')
        try:
            data = yf.download(yf_ticker, start=global_start, end=global_end, progress=False)
            returns_cache[ticker] = _prices_to_returns(data)
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
            return None
    ret = returns_cache[ticker]
    return ret if not ret.empty else None

##############################################
#         Helper Functions                   #
##############################################

# Shared text-processing helpers used by preprocess_text():
# a WordNet lemmatizer and the English stop-word list held as a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise raw text into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, digit runs are removed, every character that is
    neither a word character nor whitespace is removed, and the remainder is
    tokenized with NLTK. Tokens found in ``stop_words`` are dropped and the
    rest are lemmatized.
    """
    cleaned = re.sub(r'\d+', '', text.lower())
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    tokens = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

def fetch_news_from_cache(ticker, start_date=None, end_date=None):
    """Return the headlines of ``ticker`` dated within [start_date, end_date].

    Results are memoised in ``news_slice_cache`` under the key
    ``(ticker, start_date, end_date)``. A ticker without any news yields an
    empty list. Either bound may be omitted (falsy) to leave that side open.
    """
    cache_key = (ticker, start_date, end_date)
    try:
        return news_slice_cache[cache_key]
    except KeyError:
        pass

    ticker_news = news_cache.get(ticker)
    if ticker_news is None:
        news_slice_cache[cache_key] = []
        return []

    # Drop the timezone so the dates compare against the naive bounds below.
    naive_dates = ticker_news['date'].dt.tz_localize(None)
    keep = pd.Series(True, index=ticker_news.index)
    if start_date:
        keep &= naive_dates >= pd.to_datetime(start_date)
    if end_date:
        keep &= naive_dates <= pd.to_datetime(end_date)

    headlines = ticker_news.loc[keep, 'title'].tolist()
    news_slice_cache[cache_key] = headlines
    return headlines

def build_universal_vocabulary(start_date, end_date):
    """Return the sorted, de-duplicated vocabulary of all headlines in the period.

    Every title in ``df`` whose date lies within [start_date, end_date]
    (bounds interpreted as UTC) is concatenated, run through
    ``preprocess_text`` and reduced to its distinct tokens.
    """
    # df['date'] is UTC-aware, so the bounds must be localized to UTC as well.
    lower = pd.to_datetime(start_date).tz_localize('UTC')
    upper = pd.to_datetime(end_date).tz_localize('UTC')
    in_period = df['date'].between(lower, upper)
    corpus = " ".join(df.loc[in_period, 'title'].tolist())
    return sorted(set(preprocess_text(corpus)))

# Build universal vocabulary once, over the whole analysis period.
# global_start / global_end are also the date range handed to
# get_cached_returns() when returns are fetched for a window.
global_start = '2018-01-01'
global_end = '2024-06-30'
news_slice_cache = {}  # cache for news slices, keyed by (ticker, start_date, end_date)
universal_vocab = build_universal_vocabulary(global_start, global_end)
print(f"Universal vocabulary size: {len(universal_vocab)}")

def build_term_frequency_matrix(headlines_dict, vocabulary=None):
    """Build a document-term count matrix with one row per entry of ``headlines_dict``.

    Each value of ``headlines_dict`` (a list of headlines) is joined into one
    document, pre-processed with ``preprocess_text`` and counted. Rows follow
    the iteration order of ``headlines_dict``.

    Args:
        headlines_dict: Mapping from a label (e.g. ticker) to its headlines.
        vocabulary: Optional fixed vocabulary; when given, the columns are
            exactly these terms, otherwise the vocabulary is learned from the
            documents.

    Returns:
        ``(matrix, feature_names)`` where ``matrix`` is a SciPy CSR matrix of
        term counts and ``feature_names`` the term belonging to each column.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    all_texts = [
        ' '.join(preprocess_text(' '.join(headlines)))
        for headlines in headlines_dict.values()
    ]
    # CountVectorizer's default token_pattern only matches tokens of two or
    # more word characters, so single-character tokens produced by
    # preprocess_text (and present in the vocabulary) would never be counted.
    # Accept tokens of any length instead.
    vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r"(?u)\b\w+\b")
    term_freq_matrix = vectorizer.fit_transform(all_texts)
    return csr_matrix(term_freq_matrix), vectorizer.get_feature_names_out()

def calculate_lr_matrix(term_freq_matrix, weights):
    """Return the normalised entropy (LR) of the weight-blended term distribution.

    The rows of ``term_freq_matrix`` are combined using ``weights``, the
    resulting term masses are normalised to probabilities, and their Shannon
    entropy is divided by ``log(m)`` where ``m`` is the number of columns.
    Returns ``np.nan`` when the total mass is zero or ``m <= 1``.
    """
    term_mass = np.asarray(term_freq_matrix.T.dot(weights)).ravel()
    mass_total = term_mass.sum()
    if mass_total == 0:
        return np.nan

    vocab_size = term_freq_matrix.shape[1]  # m: fixed by the matrix's vocabulary
    if vocab_size <= 1:
        return np.nan

    term_prob = term_mass / mass_total
    # The small offset keeps log() finite for terms with zero probability.
    entropy = -np.sum(term_prob * np.log(term_prob + 1e-10))
    return entropy / np.log(vocab_size)

def calculate_weighted_return_percentage(returns, weights):
    """Return the weighted mean of the per-column average returns, scaled by 252.

    ``returns`` holds one column of periodic returns per asset and ``weights``
    one weight per column, in the same order.
    """
    periods_per_year = 252
    mean_by_asset = returns.mean()
    weighted_mean = np.sum(mean_by_asset * weights)
    return weighted_mean * periods_per_year

def fetch_historical_returns_for_window(tickers, window_start, window_end, global_start, global_end):
    """Collect the returns of ``tickers`` restricted to [window_start, window_end].

    Returns for each ticker come from ``get_cached_returns`` (which receives
    ``global_start`` / ``global_end``); tickers without any return inside the
    window are skipped.

    The per-ticker series are combined with an outer join on the date index,
    so a ticker with a shorter history no longer truncates the others (the
    previous column-by-column assignment aligned every ticker to the dates of
    the first one). Dates missing for a ticker are NaN in its column.

    Returns:
        ``(all_returns, valid_tickers)``: a DataFrame with one column per kept
        ticker and the list of those tickers in column order. Both are empty
        when no ticker has data in the window.
    """
    # Parse the window bounds once instead of on every loop iteration.
    lower = pd.to_datetime(window_start)
    upper = pd.to_datetime(window_end)

    per_ticker = {}
    for ticker in tickers:
        ret = get_cached_returns(ticker, global_start, global_end)
        if ret is None:
            continue
        if isinstance(ret, pd.DataFrame):
            # A single ticker was requested, so a frame is expected to carry
            # exactly one column; reduce it to a Series before combining.
            ret = ret.iloc[:, 0]
        ret_window = ret[(ret.index >= lower) & (ret.index <= upper)]
        if not ret_window.empty:
            per_ticker[ticker] = ret_window

    if not per_ticker:
        return pd.DataFrame(), []

    all_returns = pd.concat(per_ticker, axis=1)
    return all_returns, list(per_ticker)

def generate_rolling_windows(start_date, end_date, window_length_days=180, step_days=30):
    """Return rolling ``(window_start, window_end)`` pairs as 'YYYY-MM-DD' strings.

    The first window starts at ``start_date``; each following window starts
    ``step_days`` later. A window spans ``window_length_days`` days and is only
    produced when its end does not exceed ``end_date``.

    Raises:
        ValueError: If ``step_days`` is not positive (the window start would
            never advance and the loop would not terminate).
    """
    if step_days <= 0:
        raise ValueError("step_days must be positive")

    window_span = timedelta(days=window_length_days)
    step = timedelta(days=step_days)
    current = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)

    windows = []
    while current + window_span <= end_dt:
        window_end = current + window_span
        windows.append((current.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')))
        current = current + step
    return windows

##############################################
#   Optimization Function for LR Weights     #
##############################################

def optimize_portfolio_lr(tickers, returns, target_returns, headlines_dict):
    """Find LR-maximising long-only weights for each target return.

    For every value in ``target_returns`` an SLSQP problem is solved that
    maximises ``calculate_lr_matrix`` subject to weights in [0, 1] summing to
    one and ``calculate_weighted_return_percentage(returns, w)`` being at
    least the target. The start point is the equal-weight portfolio.

    Returns:
        A list of ``(target_return, weights)`` tuples, containing only the
        targets for which the solver reported success.
    """
    # Use the universal vocabulary so m is fixed
    term_freq_matrix, _ = build_term_frequency_matrix(headlines_dict, vocabulary=universal_vocab)
    asset_count = len(tickers)
    weight_bounds = tuple((0, 1) for _ in range(asset_count))

    def negative_lr(weights):
        # minimize() minimises, so negate LR in order to maximise it.
        return -calculate_lr_matrix(term_freq_matrix, weights)

    def budget_gap(weights):
        # Equality constraint: the weights must sum to one.
        return np.sum(weights) - 1

    solutions = []
    for target_return in target_returns:
        def return_surplus(weights, floor=target_return):
            # Inequality constraint: weighted return must reach the target.
            return calculate_weighted_return_percentage(returns, weights) - floor

        solver_constraints = [
            {'type': 'eq', 'fun': budget_gap},
            {'type': 'ineq', 'fun': return_surplus},
        ]
        start_weights = np.ones(asset_count) / asset_count
        outcome = minimize(
            negative_lr,
            start_weights,
            method='SLSQP',
            bounds=weight_bounds,
            options=options,
            constraints=solver_constraints,
        )
        if outcome.success:
            solutions.append((target_return, outcome.x))
    return solutions

##############################################
#   Plot Mean Optimized LR Time Series (Rolling Windows)  #
##############################################

# Global LR cache: key = (window_start, window_end, tuple(sorted(tickers)), tuple(target_returns)) ; value = computed LR
lr_cache = {}


def _read_portfolio_tickers(file_name):
    """Return the non-blank, stripped lines of a portfolio file as tickers."""
    with open(file_name, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def _compute_window_lr(headlines_dict, valid_tickers, window_start, window_end, target_returns):
    """Compute the mean-optimized LR of one window (no caching); NaN when data is missing."""
    if not valid_tickers:
        print(f"Window {window_start} to {window_end}: No valid headlines.")
        return np.nan

    returns, tickers_with_returns = fetch_historical_returns_for_window(
        valid_tickers, window_start, window_end, global_start, global_end)
    if returns.empty:
        print(f"Window {window_start} to {window_end}: No returns data.")
        return np.nan

    window_headlines = {t: headlines_dict[t] for t in tickers_with_returns}
    opt_results = optimize_portfolio_lr(tickers_with_returns, returns, target_returns, window_headlines)
    if opt_results:
        # Average the optimal weight vectors over all successfully solved targets.
        mean_weight = np.mean(np.array([weights for _, weights in opt_results]), axis=0)
    else:
        # No optimisation succeeded: fall back to the equal-weight portfolio.
        mean_weight = np.ones(len(tickers_with_returns)) / len(tickers_with_returns)

    tf_matrix, _ = build_term_frequency_matrix(window_headlines, vocabulary=universal_vocab)
    print(f"Window {window_start} to {window_end}: TF matrix shape {tf_matrix.shape}, total count {tf_matrix.sum()}")
    if tf_matrix.sum() == 0 or tf_matrix.shape[1] == 0:
        return np.nan
    return calculate_lr_matrix(tf_matrix, mean_weight)


def plot_mean_optimized_lr_time_series(portfolio_files, start_date='2018-01-01', end_date='2024-06-30',
                                       window_length_days=180, step_days=30, target_returns=(0.07, 0.1, 0.13, 0.16)):
    """
    For each portfolio file, use a rolling window approach:
      - Each window is `window_length_days` long and slides forward by `step_days`.
      - For each window:
          * Fetch news headlines (from cache) for each ticker.
          * Fetch historical returns (from cache) for the tickers that have headlines.
          * Optimize weights for every value in `target_returns` using window data.
          * Average the resulting weight vectors to obtain a mean weight
            (equal weights if no optimization succeeded).
          * Build a term frequency matrix from the window's headlines using the universal vocabulary.
          * Calculate LR using that mean weight and the fixed m.
          * Cache the LR in `lr_cache`; the key includes the window, the ticker set
            and `target_returns`, so calls with different targets never share values.
      - Plot the resulting LR time series using the window end date as the x-axis.

    `target_returns` may be any iterable of numbers; the default is an immutable
    tuple rather than a shared mutable list.
    """
    # Freeze the targets once: makes them hashable for the cache key and safe
    # to iterate repeatedly even if a one-shot iterable was passed.
    target_returns = tuple(target_returns)
    windows = generate_rolling_windows(start_date, end_date, window_length_days, step_days)

    for file_name in portfolio_files:
        tickers = _read_portfolio_tickers(file_name)
        if not tickers:
            print(f"No tickers found in {file_name}")
            continue

        lr_values = []
        time_labels = []

        for window_start, window_end in windows:
            # Fetch headlines for each ticker in the window
            headlines_dict = {
                ticker: fetch_news_from_cache(ticker, start_date=window_start, end_date=window_end)
                for ticker in tickers
            }
            valid_tickers = sorted(ticker for ticker, hl in headlines_dict.items() if hl)
            key = (window_start, window_end, tuple(valid_tickers), target_returns)
            if key in lr_cache:
                lr_val = lr_cache[key]
                print(f"Window {window_start} to {window_end}: Using cached LR value.")
            else:
                lr_val = _compute_window_lr(headlines_dict, valid_tickers, window_start, window_end, target_returns)
                lr_cache[key] = lr_val
            lr_values.append(lr_val)
            time_labels.append(datetime.strptime(window_end, '%Y-%m-%d'))

        time_index = pd.to_datetime(time_labels)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(time_index, lr_values, marker='o', linestyle='-', color='blue', label='Mean Optimized LR')
        ax.set_xlabel('Window End Date')
        ax.set_ylabel('Mean Optimized Lexical Ratio (LR)')
        ax.grid(True)
        plt.title(f'Mean Optimized LR (Rolling Windows) - {os.path.basename(file_name).replace(".txt", "")}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.legend(loc='upper left')
        plt.show()

# Run the rolling-window optimized LR plot for all portfolio files
# (one figure per portfolio file, using the function's default date range,
# window length, step and target returns).
plot_mean_optimized_lr_time_series(portfolio_files)
