In [None]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from numpy.random import dirichlet
# Loading portfolios: they are .txt files with one ticker per line.
# Load the news data: the 'title' column is the headline, 'content' is the full text.
df = pd.read_csv('news_data.csv')
# Switch into the portfolio directory and collect every entry found there.
os.chdir("portfolios")
file_names = os.listdir()
file_array = list(file_names)

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import CountVectorizer
import yfinance as yf
from scipy.sparse import csr_matrix
import warnings
import re
from datetime import datetime, timedelta
import nltk
from decimal import Decimal, getcontext
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# NLTK resources: tokenizer models, WordNet for the lemmatizer, and the
# stop-word corpus that `stopwords.words('english')` below reads (without it
# a fresh environment raises LookupError).
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
# Number of significant digits used by Decimal arithmetic.
getcontext().prec = 50

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def normalize(series):
    """Min-max scale ``series`` so its minimum maps to 0 and its maximum to 1.

    No guard is applied for a constant input: the range is then zero and the
    division is a division by zero.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    return (series - lowest) / value_range

# Function to fetch news titles from the CSV file within a specified date range
def fetch_news_from_csv(ticker, df, start_date=None, end_date=None):
    """Return the news titles stored in ``df`` for ``ticker``.

    Rows are selected on the 'ticker' column and, when the corresponding
    bound is truthy, restricted to 'date' >= ``start_date`` and
    'date' <= ``end_date`` (both inclusive). Any exception raised while
    filtering is printed and an empty list is returned instead.
    """
    try:
        rows = df.loc[df['ticker'] == ticker]
        if start_date:
            on_or_after = rows['date'] >= start_date
            rows = rows.loc[on_or_after]
        if end_date:
            on_or_before = rows['date'] <= end_date
            rows = rows.loc[on_or_before]
        # Only the titles are used here; this could be switched to the content or both.
        titles = rows['title']
        return titles.tolist()
    except Exception as e:
        print(f"Error fetching news for {ticker}: {e}")
        return []


def generate_date_ranges(start_date, end_date, interval_months, step_days=180):
    """Build (start, end) date-string pairs covering ``start_date``..``end_date``.

    Each window is ``interval_months * 30`` days long (a month is approximated
    as 30 days) and is truncated at ``end_date``. Consecutive windows start
    ``step_days`` apart, so windows overlap whenever the step is shorter than
    the window length.

    Args:
        start_date: datetime marking the start of the first window.
        end_date: datetime after which no window extends.
        interval_months: window length in (30-day) months.
        step_days: days between consecutive window starts. Defaults to 180,
            the stride that was previously hard-coded.

    Returns:
        List of ``(start, end)`` tuples formatted as ``'%Y-%m-%d'`` strings.

    Raises:
        ValueError: if ``step_days`` is not positive (the loop would never end).
    """
    if step_days <= 0:
        raise ValueError("step_days must be positive")

    window = timedelta(days=interval_months * 30)  # Approximate month duration
    step = timedelta(days=step_days)

    date_ranges = []
    start = start_date
    while start < end_date:
        end = min(start + window, end_date)
        date_ranges.append((start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')))
        start += step
    return date_ranges

# Fetch historical returns with error handling for missing tickers
def fetch_historical_returns(tickers, start_date, end_date):
    """Download adjusted-close prices per ticker and return their simple returns.

    Each ticker is fetched on its own, best effort: a ticker whose download
    or processing fails, or that yields no data, is reported on stdout and
    left out, and the remaining tickers are still processed.

    Args:
        tickers: iterable of ticker symbols; '.' is replaced by '-' for yfinance.
        start_date: start of the download window, passed through to yfinance.
        end_date: end of the download window, passed through to yfinance.

    Returns:
        Tuple ``(all_returns, valid_tickers)``: a DataFrame with one column of
        percentage changes per successfully fetched ticker (keyed by the
        original symbol), and the list of those tickers.
    """
    valid_tickers = []
    all_returns = pd.DataFrame()
    for ticker in tickers:
        yf_symbol = ticker.replace('.', '-')  # yfinance form
        try:
            # NOTE(review): assumes the downloaded frame exposes an 'Adj Close'
            # column -- confirm against the installed yfinance version.
            data = yf.download(yf_symbol, start=start_date, end=end_date, progress=False)['Adj Close']
            if data.empty:
                print(f"No price data for {ticker}; skipping.")
                continue
            all_returns[ticker] = data.pct_change().dropna()
            valid_tickers.append(ticker)
        except Exception as e:
            # Stay best-effort, but surface the failure instead of hiding it.
            print(f"Error fetching returns for {ticker}: {e}")

    return all_returns, valid_tickers

# Fetch and normalize VIX data
def fetch_vix_data(start_date, end_date):
    """Download '^VIX' adjusted closes for the window and min-max normalize them.

    Missing values are dropped after normalization.
    """
    vix_frame = yf.download('^VIX', start=start_date, end=end_date, progress=False)
    adjusted_close = vix_frame['Adj Close']
    scaled = normalize(adjusted_close)
    return scaled.dropna()

# Function to preprocess text
def preprocess_text(text):
    """Turn raw text into a list of lemmatized tokens.

    The text is lower-cased, runs of digits are removed, the result is
    tokenized, tokens found in ``stop_words`` are dropped and each remaining
    token is lemmatized.
    """
    lowered = text.lower()
    digits_removed = re.sub(r'\d+', '', lowered)

    tokens = []
    for token in word_tokenize(digits_removed):
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

# Function to build a term-frequency matrix
def build_term_frequency_matrix(headlines_dict):
    """Build a term-count matrix with one row per entry of ``headlines_dict``.

    The headlines of each entry are joined into one document, preprocessed
    and re-joined with spaces before being counted.

    Returns:
        Tuple of (sparse CSR count matrix, array of feature names).
    """
    documents = []
    for headlines in headlines_dict.values():
        merged = ' '.join(headlines)
        documents.append(' '.join(preprocess_text(merged)))

    vectorizer = CountVectorizer()
    counts = csr_matrix(vectorizer.fit_transform(documents))
    vocabulary = vectorizer.get_feature_names_out()
    return counts, vocabulary

# Function to calculate normalized entropy (LR) using sparse matrix operations
def calculate_lr_matrix(term_freq_matrix, weights):
    """Return the normalized entropy (LR) of the weighted term distribution.

    The rows of ``term_freq_matrix`` are combined with ``weights`` into one
    weighted count per term; the counts are turned into probabilities and
    their Shannon entropy is divided by ``log(m)``, where ``m`` is the number
    of terms (columns).

    Args:
        term_freq_matrix: (documents x terms) count matrix, sparse or dense.
        weights: one weight per document (row).

    Returns:
        The normalized entropy, or 0 in the degenerate cases where it is
        undefined: at most one term, or a zero total weighted count (which
        would otherwise produce 0/0 = NaN).
    """
    m = term_freq_matrix.shape[1]
    if m <= 1:
        return 0

    term_totals = np.asarray(term_freq_matrix.T.dot(weights)).ravel()
    total_weight = np.sum(term_totals)
    if total_weight == 0:
        return 0.0

    probabilities = term_totals / total_weight
    # The small offset keeps log() finite for terms with zero probability.
    entropy = -np.sum(probabilities * np.log(probabilities + 1e-10))
    return entropy / np.log(m)

# Calculate portfolio Markowitz (Standard Deviation)
def calculate_markowitz(returns, weights):
    """Return the annualized volatility of the weighted portfolio.

    The per-period portfolio return is the dot product of ``returns`` with
    ``weights``; its standard deviation is scaled by sqrt(252).
    """
    weighted_series = np.dot(returns, weights)
    per_period_sd = np.std(weighted_series)
    return per_period_sd * np.sqrt(252)  # Annualized volatility

# Calculate portfolio VaR (DR based on VaR)
def calculate_portfolio_var(returns, weights, confidence_level=0.95):
    """Return the ``(1 - confidence_level)`` percentile of portfolio returns.

    Portfolio returns are the dot product of ``returns`` with ``weights``.
    """
    tail_percent = (1 - confidence_level) * 100
    weighted_series = np.dot(returns, weights)
    return np.percentile(weighted_series, tail_percent)

# Calculate individual asset volatilities
def calculate_individual_sds(returns):
    """Return each column's standard deviation scaled by sqrt(252)."""
    annualization = np.sqrt(252)
    per_asset_sd = returns.std(axis=0)
    return per_asset_sd * annualization  # Annualized individual volatilities

# Calculate individual asset VaRs
def calculate_individual_vars(returns, confidence_level=0.95):
    """Return, per column, the ``(1 - confidence_level)`` percentile of its values."""
    tail_percent = (1 - confidence_level) * 100

    def _tail_percentile(column):
        return np.percentile(column, tail_percent)

    return returns.apply(_tail_percentile)

# Function to calculate DR based on Standard Deviation (dr_sd)
def calculate_dr_sd(total_volatility, individual_volatilities, weights):
    """Return ``total_volatility`` divided by the weighted sum of individual volatilities."""
    weighted_volatilities = individual_volatilities * weights
    weighted_total = np.sum(weighted_volatilities)
    return total_volatility / weighted_total

from scipy.spatial import KDTree
import numpy as np
from scipy.spatial import cKDTree

def Tn(Y, Z, X=None, epsilon=1e-10):
    """Estimate the Azadkia-Chatterjee dependence of Y on Z, optionally given X.

    With ``X is None`` the unconditional statistic is returned:

        sum_i (n * min(R_i, R_M(i)) - L_i**2) / sum_i L_i * (n - L_i)

    otherwise the conditional statistic:

        sum_i (min(R_i, R_M(i)) - min(R_i, R_N(i))) / sum_i (R_i - min(R_i, R_N(i)))

    where R_i is the rank of Y_i (1..n), L_i = n - R_i + 1, N(i) is the
    nearest neighbour of observation i in X and M(i) its nearest neighbour in
    (X, Z) -- or in Z alone when there is no X.

    Args:
        Y: sequence of n response values.
        Z: n observations of the variable whose effect is measured
            (1-D, or 2-D with one row per observation).
        X: optional n observations of the conditioning variable
            (1-D or 2-D); ``None`` selects the unconditional statistic.
        epsilon: lower clamp applied to the numerator and denominator of the
            conditional statistic, so that the division is always defined.

    Returns:
        The statistic as a ``Decimal``.

    Raises:
        ValueError: with fewer than two observations (no nearest neighbour
            exists) or when the inputs differ in length.
    """
    y = np.asarray(Y, dtype=float).ravel()
    n = len(y)
    if n < 2:
        raise ValueError("Tn needs at least two observations")

    z = np.asarray(Z, dtype=float)
    if z.shape[0] != n:
        raise ValueError("Y and Z must have the same number of observations")
    z = z.reshape(n, -1)  # cKDTree expects 2-D (n_points, n_dims) data

    # 1-based ranks of Y; the stable sort makes the order of ties deterministic.
    ranks = np.argsort(np.argsort(y, kind='stable'), kind='stable') + 1

    def _nearest_other(points):
        # k=2 because the closest hit of every query point is the point itself.
        # NOTE(review): with exact duplicate points the second hit may be the
        # point itself; no tie-breaking is applied here.
        _, neighbours = cKDTree(points).query(points, k=2)
        return neighbours[:, 1]

    if X is None:
        # Case p = 0 (no conditioning)
        m_idx = _nearest_other(z)
        at_least = n - ranks + 1  # Number of Y values >= Yi
        # Neighbour indices are 0-based, so they index the ranks directly.
        numerator = int(np.sum(n * np.minimum(ranks, ranks[m_idx]) - at_least ** 2))
        denominator = int(np.sum(at_least * (n - at_least)))
        return Decimal(numerator) / Decimal(denominator)

    # Case p >= 1 (conditioning on X)
    x = np.asarray(X, dtype=float)
    if x.shape[0] != n:
        raise ValueError("Y and X must have the same number of observations")
    x = x.reshape(n, -1)

    n_idx = _nearest_other(x)                        # Nearest neighbour based on X
    m_idx = _nearest_other(np.column_stack((x, z)))  # Nearest neighbour based on (X, Z)

    min_given_x = np.minimum(ranks, ranks[n_idx])
    raw_numerator = int(np.sum(np.minimum(ranks, ranks[m_idx]) - min_given_x))
    raw_denominator = int(np.sum(ranks - min_given_x))

    # Clamp both parts at epsilon; note that this also maps a negative
    # numerator to epsilon.
    numerator = Decimal(raw_numerator) if raw_numerator > epsilon else Decimal(epsilon)
    denominator = Decimal(raw_denominator) if raw_denominator > epsilon else Decimal(epsilon)
    return numerator / denominator

# Per-portfolio driver: conditional dependence between LR and each risk measure given VIX
def main(file_name, df):
    """Compute per-period metrics for one portfolio and print dependence results.

    The tickers are read from ``file_name`` (one per line). For every date
    window produced by ``generate_date_ranges`` between 2018-01-01 and
    2024-06-30, the function gathers returns, normalized VIX and news
    headlines, and records for the equally weighted portfolio of tickers that
    have both returns and headlines: mean VIX, LR (normalized entropy of the
    headlines), annualized volatility, DR based on standard deviation and DR
    based on VaR. Windows without valid tickers or headlines are skipped with
    a message.

    Finally ``Tn`` is evaluated with LR as Y, each risk measure as Z and VIX
    as X, and every result is printed.

    Args:
        file_name: path of the portfolio file to process.
        df: news DataFrame handed to ``fetch_news_from_csv``.
    """
    portfolio_metrics = []
    start_date = datetime(2018, 1, 1)
    end_date = datetime(2024, 6, 30)
    # Generate date ranges
    interval_months = 12
    date_ranges = generate_date_ranges(start_date, end_date, interval_months)
    print("Processing portfolio:", file_name)

    with open(file_name, 'r') as file:
        # Skip blank lines so they do not turn into empty ticker symbols.
        tickers = [line.strip() for line in file if line.strip()]

    for start_dat, end_dat in date_ranges:
        # Fetch historical returns and VIX data
        returns, valid_tickers = fetch_historical_returns(tickers, start_dat, end_dat)
        if not valid_tickers:
            print(f"No valid tickers for the period {start_dat} to {end_dat}. Skipping this period.")
            continue

        vix_data = fetch_vix_data(start_dat, end_dat)

        # Fetch news headlines for each valid ticker, keeping only non-empty results
        headlines_dict = {}
        for ticker in valid_tickers:
            headlines = fetch_news_from_csv(ticker, df, start_dat, end_dat)
            if headlines:
                headlines_dict[ticker] = headlines

        if not headlines_dict:
            print(f"No headlines found for valid tickers in the period {start_dat} to {end_dat}. Skipping this period.")
            continue

        # The dict keys are exactly the valid tickers that have headlines, in
        # the order they were fetched; rows of the term matrix follow this order.
        tickers_with_headlines = list(headlines_dict)
        term_freq_matrix, _ = build_term_frequency_matrix(headlines_dict)

        # Use equal weights for valid tickers with headlines
        weights = np.ones(len(tickers_with_headlines)) / len(tickers_with_headlines)

        # Calculate LR for the current interval
        lr_measure = calculate_lr_matrix(term_freq_matrix, weights)

        # Slice the returns once; every risk measure below uses the same columns.
        selected_returns = returns[tickers_with_headlines]

        # Calculate Markowitz (Standard Deviation) for the portfolio
        total_volatility = calculate_markowitz(selected_returns, weights)

        # Calculate DR based on Standard Deviation (dr_sd)
        individual_volatilities = calculate_individual_sds(selected_returns)
        dr_sd = calculate_dr_sd(total_volatility, individual_volatilities, weights)

        # Calculate DR based on VaR (dr_var)
        total_var = calculate_portfolio_var(selected_returns, weights)
        individual_vars = calculate_individual_vars(selected_returns)
        dr_var = total_var / np.sum(individual_vars * weights)

        # VIX value for the period
        vix_value = vix_data.mean()

        # Store the portfolio metrics for this interval
        portfolio_metrics.append((vix_value, lr_measure, total_volatility, dr_sd, dr_var))

    if not portfolio_metrics:
        print(f"No valid portfolio metrics for {file_name}")
        return

    if len(portfolio_metrics) < 2:
        # Tn looks up a nearest neighbour for every observation, which needs
        # at least two observations.
        print(f"Only one valid period for {file_name}; at least two are needed for the Azadkia-Chatterjee estimate.")
        return

    # Extract stored metrics
    vix_values, lr_measures, markowitz_measures, dr_sd_measures, dr_var_measures = zip(*portfolio_metrics)

    # Perform the Azadkia-Chatterjee method for each risk measure in turn
    labelled_measures = (
        ("Markowitz/SD", markowitz_measures),
        ("DR/SD", dr_sd_measures),
        ("DR/Var", dr_var_measures),
    )
    for label, measures in labelled_measures:
        azadkia_result = Tn(lr_measures, measures, vix_values)
        print(f"Azadkia-Chatterjee Conditional Dependence Result ({label}): {azadkia_result}")


# Script entry point: process every entry of `file_array` with the shared news DataFrame.
if __name__ == "__main__":
    for file_name in file_array:
        main(file_name, df)
