In [None]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from numpy.random import dirichlet
# Load the news data. Per the original note, the `title` column holds the
# headline and `content` the full text; the code below also reads the `ticker`
# and `date` columns.
df = pd.read_csv('news_data.csv')

# Portfolios are .txt files with one ticker per line. The working directory is
# switched so the bare file names collected here can be opened directly later.
os.chdir("portfolios")
file_names = os.listdir()
# Keep regular files only: os.listdir() also returns sub-directories (for
# example notebook checkpoint folders), which cannot be opened as ticker lists.
# Sorting makes the processing order deterministic.
file_array = sorted(name for name in file_names if os.path.isfile(name))

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from numpy.random import dirichlet
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix
import warnings
import yfinance as yf
from datetime import datetime, timedelta
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Download the NLTK resources used by the text preprocessing further down.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared text-processing state: English stop-word set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Fetch historical returns with error handling for missing tickers
def fetch_historical_returns(tickers, start_date, end_date):
    """Download daily adjusted-close returns for every ticker yfinance can serve.

    Args:
        tickers: Iterable of ticker symbols. Dots are replaced by dashes for
            the yfinance request; the original symbol is kept as column label.
        start_date: Start of the download window, passed to ``yf.download``.
        end_date: End of the download window, passed to ``yf.download``.

    Returns:
        A ``(all_returns, valid_tickers)`` tuple: a DataFrame with one column
        of percentage changes per ticker that produced data, and the list of
        those tickers in input order. Columns are aligned on the index of the
        first ticker that was stored.
    """
    valid_tickers = []
    all_returns = pd.DataFrame()

    for ticker in tickers:
        yf_symbol = ticker.replace('.', '-')  # yfinance form
        try:
            # auto_adjust=False keeps the 'Adj Close' column available on
            # yfinance releases that adjust prices by default.
            data = yf.download(
                yf_symbol,
                start=start_date,
                end=end_date,
                progress=False,
                ignore_tz=True,
                auto_adjust=False,
            )['Adj Close']
        except Exception as exc:
            # Best effort: one bad ticker must not abort the whole portfolio,
            # but the reason is reported instead of being swallowed.
            print(f"Skipping {ticker}: download failed ({exc!r})")
            continue

        # Some yfinance releases return a one-column DataFrame (ticker-level
        # column index) even for a single symbol; reduce it to a Series.
        if isinstance(data, pd.DataFrame) and data.shape[1] == 1:
            data = data.iloc[:, 0]

        if data.empty:
            continue  # No price data for this ticker in the window

        all_returns[ticker] = data.pct_change().dropna()
        valid_tickers.append(ticker)

    return all_returns, valid_tickers

# Calculate portfolio standard deviation
def calculate_portfolio_sd(returns, weights):
    """Return the annualized volatility of the weighted portfolio.

    Args:
        returns: 2-D array-like of per-period returns, one column per asset.
        weights: 1-D array-like of portfolio weights, one per column.

    Returns:
        Sample standard deviation of the weighted return series, scaled by
        ``sqrt(252)``.
    """
    portfolio_returns = np.dot(returns, weights)
    # ddof=1 matches the pandas default used by calculate_individual_sds, so
    # the portfolio and per-asset volatilities in the diversification ratio
    # use the same estimator (a single-asset portfolio then has ratio 1).
    return np.std(portfolio_returns, ddof=1) * np.sqrt(252)  # Annualized volatility

# Calculate individual asset volatilities
def calculate_individual_sds(returns):
    """Return the annualized volatility of each column of ``returns``.

    Args:
        returns: Object with a ``std(axis=0)`` method (a pandas DataFrame at
            the call sites in this file), one column per asset.

    Returns:
        The column-wise standard deviations multiplied by ``sqrt(252)``.
    """
    annualization = np.sqrt(252)
    per_period_sd = returns.std(axis=0)
    return per_period_sd * annualization

# Calculate portfolio VaR
def calculate_portfolio_var(returns, weights, confidence_level=0.95):
    """Return the lower-tail percentile of the weighted portfolio returns.

    Args:
        returns: 2-D array-like of per-period returns, one column per asset.
        weights: 1-D array-like of portfolio weights, one per column.
        confidence_level: Confidence level; the ``(1 - confidence_level)``
            percentile of the portfolio return series is returned.

    Returns:
        The percentile as a return value (not sign-flipped into a loss).
    """
    tail_percent = (1 - confidence_level) * 100
    weighted_returns = np.dot(returns, weights)
    return np.percentile(weighted_returns, tail_percent)

# Calculate individual asset VaRs
def calculate_individual_vars(returns, confidence_level=0.95):
    """Return the lower-tail percentile of every column of ``returns``.

    Args:
        returns: Object with an ``apply`` method (a pandas DataFrame at the
            call sites in this file), one column per asset.
        confidence_level: Confidence level; the ``(1 - confidence_level)``
            percentile of each column is computed.

    Returns:
        The result of applying the percentile to each column.
    """
    tail_percent = (1 - confidence_level) * 100

    def _column_percentile(column):
        return np.percentile(column, tail_percent)

    return returns.apply(_column_percentile)

# Function to preprocess text
def preprocess_text(text):
    """Turn raw text into a list of lemmatized tokens without stop words.

    The text is lower-cased, digit runs are removed, every character that is
    neither a word character nor whitespace is removed, and the remainder is
    tokenized. Tokens found in the module-level ``stop_words`` set are
    dropped; the rest are passed through the module-level ``lemmatizer``.

    Args:
        text: The string to normalize.

    Returns:
        List of lemmatized tokens in their original order.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    cleaned = re.sub(r'[^\w\s]', '', without_digits)

    tokens = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

# Function to fetch news titles from the CSV file within a specified date range
def fetch_news_from_csv(ticker, df, start_date=None, end_date=None):
    """Return the headlines stored for ``ticker``, optionally limited by date.

    Args:
        ticker: Value matched against the ``ticker`` column.
        df: News DataFrame with ``ticker``, ``date`` and ``title`` columns.
        start_date: If truthy, keep only rows whose ``date`` is >= this value.
        end_date: If truthy, keep only rows whose ``date`` is <= this value.
            Both bounds are compared directly against the ``date`` column, so
            they must be of a type comparable with it.

    Returns:
        List of headline strings. Rows with a missing title are dropped so the
        headlines can be safely joined into one text. An empty list is
        returned when nothing matches or the frame cannot be filtered.
    """
    try:
        filtered_df = df[df['ticker'] == ticker]
        if start_date:
            filtered_df = filtered_df[filtered_df['date'] >= start_date]
        if end_date:
            filtered_df = filtered_df[filtered_df['date'] <= end_date]
        titles = filtered_df['title']
    except (KeyError, TypeError, ValueError) as exc:
        # Missing columns or incomparable dates: stay best-effort, but say why
        # instead of silently producing "no news".
        print(f"Could not read news for {ticker}: {exc!r}")
        return []
    return titles.dropna().astype(str).tolist()

# Function to build a term-frequency matrix for all tickers
def build_term_frequency_matrix(headlines_dict):
    """Build a term-count matrix with one row per entry of ``headlines_dict``.

    Each entry's headlines are joined into one text, run through
    ``preprocess_text`` and re-joined with spaces before being counted by a
    ``CountVectorizer``.

    Args:
        headlines_dict: Mapping whose values are lists of headline strings.
            Rows of the result follow the iteration order of its values.

    Returns:
        A ``(matrix, feature_names)`` tuple: the counts as a ``csr_matrix``
        and the vectorizer's feature names.
    """
    documents = []
    for headlines in headlines_dict.values():
        merged_text = ' '.join(headlines)
        documents.append(' '.join(preprocess_text(merged_text)))

    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(documents)
    vocabulary = count_vectorizer.get_feature_names_out()
    return csr_matrix(counts), vocabulary

# Function to calculate normalized entropy using sparse matrix operations
def calculate_lr_matrix(term_freq_matrix, weights):
    """Return the normalized Shannon entropy of the weighted term distribution.

    Rows of ``term_freq_matrix`` are combined with ``weights`` into one
    weighted count per term, the counts are normalized to probabilities, and
    their entropy is divided by ``log(m)`` where ``m`` is the number of terms.

    Args:
        term_freq_matrix: Matrix of term counts (rows are documents, columns
            are terms) supporting ``.T.dot`` and ``.shape``.
        weights: 1-D array-like with one weight per row.

    Returns:
        The normalized entropy. ``0`` is returned when there is at most one
        term or when the weighted counts sum to zero (no term mass at all),
        instead of dividing by zero.
    """
    weight_vector = np.asarray(weights, dtype=float)
    term_mass = np.asarray(term_freq_matrix.T.dot(weight_vector)).ravel()
    total_mass = np.sum(term_mass)

    m = term_freq_matrix.shape[1]
    if m <= 1 or total_mass == 0:
        return 0.0

    probabilities = term_mass / total_mass
    # The small epsilon keeps log() finite for terms with zero probability.
    H_combined = -np.sum(probabilities * np.log(probabilities + 1e-10))
    return H_combined / np.log(m)

# Generate random portfolios and calculate metrics
def generate_portfolios_and_calculate_metrics(tickers, returns, headlines_dict, num_portfolios=1000):
    """Sample random portfolios and compute their LR measure and diversification ratios.

    Only tickers that have at least one headline take part. For each sampled
    weight vector the function computes the LR measure of the headline terms,
    the ratio of portfolio volatility to the weighted sum of individual
    volatilities, and the same ratio based on VaR.

    Args:
        tickers: Sequence of ticker symbols; each must be a column of ``returns``.
        returns: DataFrame of per-period returns, one column per ticker.
        headlines_dict: Mapping from ticker to its list of headlines.
        num_portfolios: Number of random weight vectors to draw.

    Returns:
        List of ``(lr_measure, diversification_ratio_vol,
        diversification_ratio_var)`` tuples. The list is empty when no ticker
        has headlines or when any individual volatility is NaN.
    """
    valid_tickers = [
        ticker for ticker in tickers
        if ticker in headlines_dict and len(headlines_dict[ticker]) > 0
    ]
    if not valid_tickers:
        return []

    # Build the term matrix from the valid tickers in ticker order, so that
    # row i of the matrix always belongs to weight i.
    term_freq_matrix, _feature_names = build_term_frequency_matrix(
        {ticker: headlines_dict[ticker] for ticker in valid_tickers}
    )

    # Per-asset statistics do not depend on the weights: compute them once.
    valid_returns = returns[valid_tickers]
    individual_volatilities = calculate_individual_sds(valid_returns)
    individual_vars = calculate_individual_vars(valid_returns)
    if np.any(np.isnan(individual_volatilities)):
        return []

    # Weights are drawn over the valid tickers only, so they always sum to 1.
    alpha = np.ones(len(valid_tickers))
    portfolio_metrics = []

    for _ in range(num_portfolios):
        weights = np.random.dirichlet(alpha, size=1)[0]

        total_volatility = calculate_portfolio_sd(valid_returns, weights)
        if np.isnan(total_volatility):
            continue

        lr_measure = calculate_lr_matrix(term_freq_matrix, weights)
        total_var = calculate_portfolio_var(valid_returns, weights)

        diversification_ratio_vol = total_volatility / np.sum(individual_volatilities * weights)
        diversification_ratio_var = total_var / np.sum(individual_vars * weights)

        portfolio_metrics.append((lr_measure, diversification_ratio_vol, diversification_ratio_var))

    return portfolio_metrics

# Perform linear regression analysis
def _fit_and_score_linear(X_train, X_test, y_train, y_test):
    """Fit one simple linear regression; return (test R², test MSE, slope p-value)."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    # has_constant='add' always prepends the intercept column, so index 1 of
    # the p-values is the LR-measure coefficient even if X happens to be constant.
    design = sm.add_constant(X_train, has_constant='add')
    ols_fit = sm.OLS(y_train, design).fit()
    # Read the p-value straight from the fitted results instead of indexing
    # the summary table's label-indexed Series by position.
    p_value = ols_fit.pvalues[1]
    return r2, mse, p_value


def perform_linear_regression_analysis(lr_measures, diversification_ratios_vol, diversification_ratios_var):
    """Regress both diversification ratios on the LR measure and print the fit quality.

    The data is split 80/20 with a fixed random state. For each target, a
    linear regression is fitted on the training part and scored on the test
    part; the p-value of the LR-measure coefficient comes from an OLS fit on
    the training part.

    Args:
        lr_measures: Sequence of LR measures (the single regressor).
        diversification_ratios_vol: Volatility-based diversification ratios.
        diversification_ratios_var: VaR-based diversification ratios.

    Returns:
        Dict with keys ``'Linear_Vol'`` and ``'Linear_Var'``, each mapping to a
        ``(r2, mse, p_value)`` tuple.
    """
    X = np.array(lr_measures).reshape(-1, 1)
    y_vol = np.array(diversification_ratios_vol)
    y_var = np.array(diversification_ratios_var)

    X_train, X_test, y_train_vol, y_test_vol, y_train_var, y_test_var = train_test_split(
        X, y_vol, y_var, test_size=0.2, random_state=42
    )

    r2_linear_vol, mse_linear_vol, significance_vol = _fit_and_score_linear(
        X_train, X_test, y_train_vol, y_test_vol
    )
    r2_linear_var, mse_linear_var, significance_var = _fit_and_score_linear(
        X_train, X_test, y_train_var, y_test_var
    )

    print("\nLinear Regression (Volatility):")
    print(f"R² Score: {r2_linear_vol}")
    print(f"Mean Squared Error: {mse_linear_vol}")
    print(f"p-value: {significance_vol}")

    print("\nLinear Regression (VaR):")
    print(f"R² Score: {r2_linear_var}")
    print(f"Mean Squared Error: {mse_linear_var}")
    print(f"p-value: {significance_var}")

    return {
        'Linear_Vol': (r2_linear_vol, mse_linear_vol, significance_vol),
        'Linear_Var': (r2_linear_var, mse_linear_var, significance_var)
    }

# Function to generate sliding date ranges
def generate_date_ranges(start_date, end_date, interval_months):
    """Return overlapping ``(start, end)`` date-string windows covering a period.

    Each window spans ``interval_months * 30`` days (a month is approximated
    as 30 days) and is clipped to ``end_date``. Consecutive windows start
    ``interval_months * 15`` days apart, i.e. they overlap by half a window.

    Args:
        start_date: ``datetime`` at which the first window starts.
        end_date: ``datetime`` bounding the windows; no window starts at or
            after it.
        interval_months: Window length in approximate months; must be positive.

    Returns:
        List of ``(start, end)`` tuples formatted as ``'%Y-%m-%d'`` strings.

    Raises:
        ValueError: If ``interval_months`` is not positive (the window start
            would never advance).
    """
    if interval_months <= 0:
        raise ValueError("interval_months must be positive")

    window = timedelta(days=interval_months * 30)  # Approximate month duration
    step = timedelta(days=interval_months * 15)  # Half a window between starts

    date_ranges = []
    start = start_date
    while start < end_date:
        end = min(start + window, end_date)
        date_ranges.append((start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')))
        start += step
    return date_ranges

# Main function with updates
def _plot_lr_relationship(lr_measures, ratios, scatter_label, ylabel, title, line_color, fit_label):
    """Show a scatter plot of ``ratios`` against the LR measure with a linear fit line."""
    x = np.array(lr_measures).reshape(-1, 1)

    plt.figure(figsize=(14, 8))
    plt.scatter(lr_measures, ratios, alpha=0.5, label=scatter_label)
    plt.xlabel('LR Measure (Normalized Shannon Entropy)')
    plt.ylabel(ylabel)
    plt.title(title)

    # The plotted line is fitted on all points of the window.
    model = LinearRegression()
    model.fit(x, ratios)
    plt.plot(lr_measures, model.predict(x), color=line_color, label=fit_label)

    plt.legend()
    plt.show()


def main(file_name, df):
    """Run the LR-measure versus diversification analysis for one portfolio file.

    The file is read as one ticker per line. For each sliding window between
    2018-01-01 and 2024-06-30 the function fetches returns and headlines,
    samples random portfolios, prints the correlations and regression results,
    and shows one scatter plot per diversification ratio.

    Args:
        file_name: Path of the portfolio file to open.
        df: News DataFrame passed on to ``fetch_news_from_csv``.
    """
    print("Processing portfolio:", file_name)
    with open(file_name, 'r') as file:
        # Blank lines would otherwise become empty ticker symbols.
        tickers = [line.strip() for line in file if line.strip()]

    start_date = datetime(2018, 1, 1)
    end_date = datetime(2024, 6, 30)
    interval_months = 12

    # Generate sliding windows
    date_ranges = generate_date_ranges(start_date, end_date, interval_months)

    # Perform analysis for each window
    for start_dat, end_dat in date_ranges:
        returns, valid_tickers = fetch_historical_returns(tickers, start_dat, end_dat)
        if not valid_tickers:
            continue  # Skip this period if no valid tickers

        headlines_dict = {}
        for ticker in valid_tickers:
            headlines = fetch_news_from_csv(ticker, df, start_date=start_dat, end_date=end_dat)
            if headlines:
                headlines_dict[ticker] = headlines

        # Only use tickers with valid headlines
        valid_tickers_with_headlines = [ticker for ticker in valid_tickers if ticker in headlines_dict]
        if len(valid_tickers_with_headlines) < 2:
            # With no ticker the term matrix cannot be built; with a single one
            # every portfolio is identical and the regression is degenerate.
            print(f"Skipping period {start_dat} to {end_dat}: fewer than two tickers with returns and headlines")
            continue
        returns = returns[valid_tickers_with_headlines]

        portfolio_metrics = generate_portfolios_and_calculate_metrics(valid_tickers_with_headlines, returns, headlines_dict)
        # Drop portfolios with NaN or infinite metrics; they would break the
        # correlation and regression below.
        portfolio_metrics = [
            (lr, drv, drvar)
            for lr, drv, drvar in portfolio_metrics
            if np.isfinite(lr) and np.isfinite(drv) and np.isfinite(drvar)
        ]
        if not portfolio_metrics:
            continue

        lr_measures, diversification_ratios_vol, diversification_ratios_var = zip(*portfolio_metrics)

        correlation_vol = np.corrcoef(lr_measures, diversification_ratios_vol)[0, 1]
        correlation_var = np.corrcoef(lr_measures, diversification_ratios_var)[0, 1]
        print(f"Correlation between LR Measure and Diversification Ratio (Volatility): {correlation_vol}")
        print(f"Correlation between LR Measure and Diversification Ratio (VaR): {correlation_var}")

        regression_results = perform_linear_regression_analysis(lr_measures, diversification_ratios_vol, diversification_ratios_var)

        _plot_lr_relationship(
            lr_measures,
            diversification_ratios_vol,
            scatter_label='Data Points (Volatility)',
            ylabel='Diversification Ratio (Total Volatility / Sum of Individual Volatilities)',
            title=f'Relationship between LR Measure and Diversification Ratio (Volatility)\nPeriod: {start_dat} to {end_dat}',
            line_color='red',
            fit_label=f'Linear Fit (Volatility) (R²={regression_results["Linear_Vol"][0]:.2f})',
        )
        _plot_lr_relationship(
            lr_measures,
            diversification_ratios_var,
            scatter_label='Data Points (VaR)',
            ylabel='Diversification Ratio (VaR)',
            title=f'Relationship between LR Measure and Diversification Ratio (VaR)\nPeriod: {start_dat} to {end_dat}',
            line_color='blue',
            fit_label=f'Linear Fit (VaR) (R²={regression_results["Linear_Var"][0]:.2f})',
        )

if __name__ == "__main__":
    # Analyze every portfolio file collected in file_array, one after another.
    for portfolio_file in file_array:
        main(portfolio_file, df)
