In [None]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from numpy.random import dirichlet
# Load the inputs. The news CSV keeps each headline in the `title` column and
# the full article text in `content`; the portfolios are .txt files with one
# ticker per line.
df = pd.read_csv('news_data.csv')
# Portfolio files are opened later by bare file name, so work from inside
# their directory.
os.chdir("portfolios")
file_names = os.listdir()
file_array = list(file_names)

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from numpy.random import dirichlet
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
import statsmodels.api as sm
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix
import warnings
import yfinance as yf
from datetime import datetime, timedelta

warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

# NLTK resources needed by preprocess_text().
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize(); older releases use 'punkt'. Requesting both keeps the
# script working across NLTK versions (an unavailable package is only
# reported by nltk.download, it does not raise by default).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords (shared by every preprocess_text call)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def fetch_historical_returns(tickers, start_date, end_date):
    """Download adjusted-close prices per ticker and return daily returns.

    Args:
        tickers: Iterable of ticker symbols as written in the portfolio files.
        start_date: Start of the download window (passed to ``yf.download``).
        end_date: End of the download window (passed to ``yf.download``).

    Returns:
        A tuple ``(all_returns, valid_tickers)``: a DataFrame with one column
        of percentage returns per successfully fetched ticker (keyed by the
        original ticker string), and the list of those tickers in input order.
        Tickers that fail to download or yield no returns are skipped.
    """
    valid_tickers = []
    all_returns = pd.DataFrame()

    for ticker in tickers:
        symbol = ticker.replace('.', '-')  # yfinance form of the symbol
        try:
            # auto_adjust=False keeps the 'Adj Close' column present even on
            # yfinance versions whose default is to auto-adjust prices.
            prices = yf.download(symbol, start=start_date, end=end_date,
                                 progress=False, auto_adjust=False)['Adj Close']
            # Some yfinance versions return a one-column DataFrame here
            # instead of a Series; reduce it to that single column.
            if isinstance(prices, pd.DataFrame):
                if prices.shape[1] == 0:
                    continue
                prices = prices.iloc[:, 0]
            returns = prices.pct_change().dropna()
            # A single price row produces no returns; such a ticker is unusable.
            if returns.empty:
                continue
            all_returns[ticker] = returns
            valid_tickers.append(ticker)
        except Exception as e:
            # Best effort: one bad ticker must not abort the whole window,
            # but report it instead of dropping it silently.
            print(f"Error fetching returns for {ticker}: {e}")

    return all_returns, valid_tickers

# Annualized volatility of the weighted portfolio
def calculate_portfolio_sd(returns, weights):
    """Return the annualized standard deviation of the weighted return series.

    ``returns`` is combined with ``weights`` via ``np.dot``; the resulting
    series' standard deviation is scaled by ``sqrt(252)``.
    """
    weighted_series = np.dot(returns, weights)
    daily_sd = np.std(weighted_series)
    return daily_sd * np.sqrt(252)

# Normalize raw text into a list of lemmatized, stopword-free tokens
def preprocess_text(text):
    """Lowercase ``text``, strip digits and punctuation, tokenize, lemmatize.

    Tokens found in the module-level ``stop_words`` set are discarded before
    lemmatization. Returns the remaining lemmas as a list.
    """
    without_digits = re.sub(r'\d+', '', text.lower())
    cleaned = re.sub(r'[^\w\s]', '', without_digits)
    lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Fetch news headlines from CSV, with date handling
def fetch_news_from_csv(ticker, df, start_date=None, end_date=None):
    """Return the headlines for ``ticker``, optionally limited to a date range.

    Args:
        ticker: Value to match against the ``ticker`` column of ``df``.
        df: News DataFrame with ``ticker``, ``date`` and ``title`` columns.
            It is not modified.
        start_date: Optional inclusive lower bound on ``date``.
        end_date: Optional inclusive upper bound on ``date``.

    Returns:
        A list of headline strings. ``[""]`` when the ticker has no usable
        headline in the range, and ``[]`` when an error occurred (the error
        is printed).
    """
    try:
        ticker_rows = df[df['ticker'] == ticker]
        if ticker_rows.empty:
            return [""]

        # Parse only this ticker's dates into a local Series (UTC, then made
        # timezone-naive) rather than overwriting the caller's column.
        dates = pd.to_datetime(ticker_rows['date'], utc=True).dt.tz_convert(None)

        keep = np.ones(len(ticker_rows), dtype=bool)
        if start_date:
            keep &= (dates >= pd.Timestamp(start_date)).to_numpy()
        if end_date:
            keep &= (dates <= pd.Timestamp(end_date)).to_numpy()

        # Missing titles would break the later ' '.join of headlines.
        titles = ticker_rows.loc[keep, 'title'].dropna().astype(str).tolist()
        return titles if titles else [""]
    except Exception as e:
        print(f"Error fetching news for {ticker}: {e}")
        return []

# Function to build a term-frequency matrix for all tickers
def build_term_frequency_matrix(headlines_dict):
    """Build a sparse document-term count matrix, one document per ticker.

    Args:
        headlines_dict: Mapping of ticker -> list of headline strings. Each
            ticker's headlines are joined, preprocessed and treated as one
            document, in the mapping's iteration order.

    Returns:
        A tuple ``(matrix, feature_names)`` where ``matrix`` is a CSR matrix
        with one row per ticker and one column per vocabulary term. When no
        document contains any usable term, the matrix has zero columns and
        ``feature_names`` is empty.
    """
    all_texts = [' '.join(preprocess_text(' '.join(headlines))) for headlines in headlines_dict.values()]
    vectorizer = CountVectorizer()
    try:
        term_freq_matrix = vectorizer.fit_transform(all_texts)
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty
        # (e.g. every ticker only has blank headlines). Return an empty
        # matrix so one newsless window does not abort the analysis.
        return csr_matrix((len(all_texts), 0), dtype=np.int64), np.array([], dtype=object)
    return csr_matrix(term_freq_matrix), vectorizer.get_feature_names_out()

# Function to calculate normalized entropy using sparse matrix operations
def calculate_lr_matrix(term_freq_matrix, weights):
    """Return the normalized entropy of the weight-combined term distribution.

    Args:
        term_freq_matrix: Sparse matrix with one row per document and one
            column per term.
        weights: One weight per document (row).

    Returns:
        The entropy of the weighted term distribution divided by ``log(m)``,
        where ``m`` is the number of terms. Returns 0 when there is at most
        one term or when the weighted term counts sum to zero.

    Raises:
        ValueError: If ``len(weights)`` differs from the number of rows.
    """
    num_documents = term_freq_matrix.shape[0]
    if len(weights) != num_documents:
        raise ValueError(f"Dimension mismatch: term frequency matrix has {num_documents} documents but weights vector has {len(weights)} elements.")

    # Combine the per-document counts into one weighted count per term.
    term_weights = np.array(term_freq_matrix.T.dot(weights)).flatten()
    total_weight = np.sum(term_weights)
    m = term_freq_matrix.shape[1]

    # With no term mass the distribution is undefined (0/0 would yield NaN);
    # with a single term the normalizer log(m) is zero.
    if m <= 1 or total_weight == 0:
        return 0.0

    probabilities = term_weights / total_weight
    # The small epsilon keeps log() finite for zero-probability terms.
    H_combined = -np.sum(probabilities * np.log(probabilities + 1e-10))
    return H_combined / np.log(m)

# DR based on Standard Deviation (dr_sd)
def calculate_dr_sd(total_volatility, individual_volatilities, weights):
    """Return portfolio volatility divided by the weighted sum of asset volatilities."""
    weighted_volatilities = individual_volatilities * weights
    denominator = np.sum(weighted_volatilities)
    return total_volatility / denominator

# Portfolio VaR (DR based on VaR)
def calculate_portfolio_var(returns, weights, confidence_level=0.95):
    """Return the negated lower-tail percentile of the weighted return series.

    The percentile taken is ``(1 - confidence_level) * 100`` of the series
    obtained from ``np.dot(returns, weights)``.
    """
    tail_percentile = (1 - confidence_level) * 100
    tail_return = np.percentile(np.dot(returns, weights), tail_percentile)
    return -tail_return

# Calculate portfolio metrics for the given weights
def generate_portfolios_and_calculate_metrics(tickers, returns, headlines_dict, weights, n_portfolios=1000):
    """Compute (LR, volatility, DR-SD, DR-VaR) for one weight vector.

    Args:
        tickers: Candidate tickers; only those with at least one headline in
            ``headlines_dict`` are used.
        returns: DataFrame of returns with one column per ticker.
        headlines_dict: Mapping of ticker -> list of headlines.
        weights: One weight per usable ticker, in the same order.
        n_portfolios: Number of entries in the returned list (default 1000).

    Returns:
        A list of ``n_portfolios`` identical tuples
        ``(lr_measure, total_volatility, dr_sd, dr_var)``.

    NOTE(review): the weights are fixed by the caller, so every entry is the
    same; the values are therefore computed once and repeated. If distinct
    random portfolios were intended, the weights would have to be redrawn
    per entry - confirm with the author.
    """
    valid_tickers = [ticker for ticker in tickers if ticker in headlines_dict and len(headlines_dict[ticker]) > 0]
    term_freq_matrix, _ = build_term_frequency_matrix({ticker: headlines_dict[ticker] for ticker in valid_tickers})
    selected_returns = returns[valid_tickers]

    lr_measure = calculate_lr_matrix(term_freq_matrix, weights)
    total_volatility = calculate_portfolio_sd(selected_returns, weights)
    individual_volatilities = np.std(selected_returns, axis=0) * np.sqrt(252)  # Annualized volatilities
    dr_sd = calculate_dr_sd(total_volatility, individual_volatilities, weights)
    dr_var = calculate_portfolio_var(selected_returns, weights)

    # Tuples are immutable, so repeating the same tuple is safe.
    return [(lr_measure, total_volatility, dr_sd, dr_var)] * n_portfolios

# Metric key prefix -> label used in the printed report, in tuple order of
# the values produced by generate_portfolios_and_calculate_metrics.
_ROBUSTNESS_METRICS = (
    ("lr", "Lexical Ratio (LR)"),
    ("sd", "Standard Deviation (SD)"),
    ("dr_sd", "DR-SD"),
    ("dr_var", "DR-Var"),
)


def _summarize_metric_samples(samples):
    """Return mean, std, max-min range and coefficient of variation per metric."""
    summary = {}
    for key, values in samples.items():
        mean = np.mean(values)
        std = np.std(values)
        summary[f"{key}_mean"] = mean
        summary[f"{key}_std"] = std
        summary[f"{key}_range"] = np.max(values) - np.min(values)
        summary[f"{key}_cv"] = std / mean
    return summary


def _print_metrics_summary(summary):
    """Print the summary grouped by statistic, one line per metric."""
    statistics = (
        ("Mean", "mean"),
        ("Standard Deviation (SD)", "std"),
        ("Max-Min Range", "range"),
        ("Coefficient of Variation (CV)", "cv"),
    )
    for title, suffix in statistics:
        for key, label in _ROBUSTNESS_METRICS:
            value = summary[f"{key}_{suffix}"]
            print(f"{title} for {label}: {value:.4f}")


# Rolling Window Analysis for Robustness of Each Metric (180-day window, 90-day step)
def rolling_window_analysis(tickers, df, window_size=180, step_size=90,
                            start_date=datetime(2018, 1, 1),
                            end_date=datetime(2024, 6, 30)):
    """Evaluate the portfolio metrics over rolling windows and summarize them.

    For each window, returns and headlines are fetched for ``tickers``, one
    random weight vector is drawn, and the metrics are collected. Afterwards
    the mean, standard deviation, range and coefficient of variation of each
    metric across all windows are printed.

    Args:
        tickers: Ticker symbols of the portfolio.
        df: News DataFrame passed to ``fetch_news_from_csv``.
        window_size: Window length in days.
        step_size: Distance between consecutive window starts in days.
        start_date: Start of the first window.
        end_date: Windows ending after this date are not evaluated.

    Returns:
        Dict with keys ``<metric>_mean``, ``<metric>_std``, ``<metric>_range``
        and ``<metric>_cv`` for ``lr``, ``sd``, ``dr_sd`` and ``dr_var``.
        Empty when no window produced any metrics.
    """
    end_date = pd.Timestamp(end_date)
    window_starts = pd.date_range(start=start_date, end=end_date, freq=f'{step_size}D')
    samples = {key: [] for key, _ in _ROBUSTNESS_METRICS}

    for window_start in window_starts:
        window_end = window_start + timedelta(days=window_size)

        # Ensure the window doesn't exceed the end_date
        if window_end > end_date:
            break

        # Fetch historical returns for the window
        returns, valid = fetch_historical_returns(tickers, window_start, window_end)
        if returns.empty:
            print(f"No valid returns data for window: {window_start} to {window_end}")
            continue

        # Get news headlines for the window
        headlines_dict = {}
        for ticker in valid:
            headlines = fetch_news_from_csv(ticker, df, start_date=window_start, end_date=window_end)
            if headlines:
                headlines_dict[ticker] = headlines

        # Skip window if no ticker has news data
        valid_tickers = [ticker for ticker in valid if ticker in headlines_dict]
        if not valid_tickers:
            continue

        # Keep only dates on which every ticker has a return; a single NaN
        # would otherwise turn every metric of the window into NaN.
        window_returns = returns[valid_tickers].dropna()
        if window_returns.empty:
            print(f"No valid returns data for window: {window_start} to {window_end}")
            continue

        # Generate random portfolio weights
        weights = np.random.dirichlet(np.ones(len(valid_tickers)), size=1)[0]

        # Calculate portfolio metrics
        portfolio_metrics = generate_portfolios_and_calculate_metrics(
            valid_tickers, window_returns, headlines_dict, weights)
        if portfolio_metrics:
            for (key, _), values in zip(_ROBUSTNESS_METRICS, zip(*portfolio_metrics)):
                samples[key].extend(values)

    # Without any samples np.max/np.min would raise; report and stop instead.
    if not samples["lr"]:
        print("No portfolio metrics could be computed for any window.")
        return {}

    # Calculate robustness measures for each metric and print results
    metrics_summary = _summarize_metric_samples(samples)
    _print_metrics_summary(metrics_summary)
    return metrics_summary

# Main function to run the robustness analysis
def main():
    """Run the rolling-window analysis for every portfolio file in ``file_array``.

    Each file is read from the current working directory and is expected to
    hold one ticker per line. Entries that are not regular files and files
    without any ticker are skipped with a message.
    """
    for file_name in file_array:
        # The directory listing can contain sub-directories (for example
        # notebook checkpoint folders); opening one would raise.
        if not os.path.isfile(file_name):
            print(f"Skipping non-file entry: {file_name}")
            continue

        print(f"Processing portfolio: {file_name}")
        with open(file_name, 'r') as file:
            # Ignore blank lines so they do not become empty ticker symbols.
            tickers = [line.strip() for line in file if line.strip()]

        if not tickers:
            print(f"No tickers found in portfolio: {file_name}")
            continue

        rolling_window_analysis(tickers, df)

# Entry point: run the analysis only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
