In [None]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from numpy.random import dirichlet
# Load the news data. The code below reads the 'ticker', 'date' and 'title' columns
# ('title' is the headline; per the original note, 'content' holds the full text).
df = pd.read_csv('news_data.csv')
# Load the portfolios: .txt files with one ticker per line.
# NOTE: this changes the working directory, so every relative path used after
# this line (e.g. open(file_name) in plot/ranks) resolves inside "portfolios".
os.chdir("portfolios")
file_names = os.listdir()
# Keep only the .txt portfolio files: os.listdir() also returns hidden files and
# sub-directories (for example .ipynb_checkpoints), which cannot be opened as
# ticker lists. Sorting makes the processing order deterministic.
file_array = sorted(file for file in file_names if file.endswith('.txt'))

In [None]:
import os
import re
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.optimize import minimize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import string
import warnings
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Download the NLTK resources: tokenizer models, stop-word lists and WordNet.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared English stop-word set and lemmatizer used by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Options passed to scipy.optimize.minimize(..., method='SLSQP') by every optimizer below.
# 'gtol' was removed: SLSQP does not accept it, scipy only reports it as an
# unknown solver option, so it never influenced the optimization.
options = {
    'maxiter': 3000,   # iteration cap
    'ftol': 1e-7,      # tolerance on the objective value used as stopping criterion
    'disp': False,     # no solver output
    'eps': 1e-7        # step size for the finite-difference gradient
}

# Preprocess text (Lemmatization, stopword removal, etc.)
def preprocess_text(text):
    """Turn raw text into a list of lemmatized tokens.

    The text is lower-cased, runs of digits are removed, then every character
    that is neither a word character nor whitespace is removed. The result is
    tokenized with ``word_tokenize``; tokens contained in ``stop_words`` are
    dropped and the remaining ones are passed through ``lemmatizer``.

    Args:
        text: string to clean (``.lower()`` is called on it directly).

    Returns:
        list: the lemmatized tokens in their original order.
    """
    cleaned = re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', text.lower()))
    tokens = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

# Fetch news titles from CSV for a specified date range
def fetch_news_from_csv(ticker, start_date=None, end_date=None):
    """Return the headlines stored for ``ticker`` in the module-level ``df``.

    Args:
        ticker: value matched against the 'ticker' column.
        start_date: optional inclusive lower bound, compared with the 'date' column.
        end_date: optional inclusive upper bound, compared with the 'date' column.

    Returns:
        list of str: the 'title' values of the matching rows with missing
        titles removed; an empty list when the lookup fails for any reason.
    """
    try:
        filtered_df = df[df['ticker'] == ticker]
        if start_date:
            filtered_df = filtered_df[filtered_df['date'] >= start_date]
        if end_date:
            filtered_df = filtered_df[filtered_df['date'] <= end_date]
        # Empty CSV cells are loaded as float NaN; callers join the headlines
        # with ' '.join(...), which only accepts strings, so drop them here.
        return filtered_df['title'].dropna().astype(str).tolist()
    except Exception as exc:
        # Still best-effort (a ticker without usable news yields no headlines),
        # but report the reason instead of hiding it.
        print(f"Could not read news for {ticker}: {exc}")
        return []

# Build a term-frequency matrix for all tickers
def build_term_frequency_matrix(headlines_dict):
    """Build a document-term count matrix with one row per entry of ``headlines_dict``.

    Every value (an iterable of headline strings) is joined into one document,
    cleaned with ``preprocess_text`` and re-joined with spaces before being
    counted by ``CountVectorizer``. Rows follow the iteration order of
    ``headlines_dict.values()``.

    NOTE: a second definition with the same name appears later in this cell and
    is the one in effect at run time; keep the two in sync or remove one.

    Returns:
        tuple: (CSR sparse count matrix, feature names reported by the vectorizer).
    """
    documents = []
    for headlines in headlines_dict.values():
        tokens = preprocess_text(' '.join(headlines))
        documents.append(' '.join(tokens))
    counter = CountVectorizer()
    counts = counter.fit_transform(documents)
    return csr_matrix(counts), counter.get_feature_names_out()

# Calculate Lexical Ratio using term frequency matrix
def calculate_lr_matrix(term_freq_matrix, weights):
    """Return the normalized entropy of the weight-combined term distribution.

    The rows of ``term_freq_matrix`` are combined with ``weights`` into one
    vector of term masses, which is normalized to probabilities. The entropy of
    that distribution is divided by ``log(m)``, where ``m`` is the number of
    columns (terms).

    NOTE: a second definition with the same name appears later in this cell and
    is the one in effect at run time; keep the two in sync or remove one.

    Args:
        term_freq_matrix: 2-D count matrix (rows are combined by ``weights``).
        weights: one weight per row of ``term_freq_matrix``.

    Returns:
        The ratio described above, or 0.0 when there are fewer than two terms
        or when the weighted term mass is not positive.
    """
    weighted_matrix = term_freq_matrix.T.dot(weights)
    weighted_matrix = np.array(weighted_matrix).flatten()
    total_weight = np.sum(weighted_matrix)
    m = term_freq_matrix.shape[1]
    # Without this guard a zero total (all weight on rows without any terms)
    # produced 0/0 = NaN, which the optimizer cannot work with.
    if m <= 1 or not total_weight > 0:
        return 0.0
    probabilities = weighted_matrix / total_weight
    # The 1e-10 offset keeps log() finite for zero-probability terms.
    H_combined = -np.sum(probabilities * np.log(probabilities + 1e-10))
    return H_combined / np.log(m)

# Calculate weighted return percentage
def calculate_weighted_return_percentage(returns, weights):
    """Return the weighted sum of the per-column means of ``returns``, times 252.

    Args:
        returns: table of periodic returns with one column per asset
            (``returns.mean()`` is taken per column).
        weights: one weight per column of ``returns``.

    Returns:
        The scaled weighted mean as a plain fraction (despite the function
        name, no multiplication by 100 takes place).
    """
    mean_per_asset = returns.mean()
    weighted_mean = np.sum(mean_per_asset * weights)
    return weighted_mean * 252

# Function to calculate key metrics
def calculate_metrics(weights, returns, risk_free_rate=0.024):
    """Compute annualized performance metrics of a weighted portfolio.

    Args:
        weights: one weight per column of ``returns`` (array or plain sequence).
        returns: DataFrame of periodic returns, one column per asset.
        risk_free_rate: rate subtracted from the annualized return.

    Returns:
        dict with the keys "Sharpe Ratio", "Sortino Ratio", "Annualized Return",
        "Annualized Volatility" and "Downside Volatility". The last three are
        rounded to 4 decimals. A ratio is NaN when its denominator is zero.
    """
    # Accept plain lists/tuples as well as arrays.
    weights = np.asarray(weights, dtype=float)
    portfolio_returns = np.dot(returns, weights)
    portfolio_return = np.sum(returns.mean() * weights) * 252
    portfolio_volatility = np.sqrt(np.dot(weights.T, np.dot(returns.cov() * 252, weights)))
    excess_return = portfolio_return - risk_free_rate
    # Same zero-denominator rule as the Sortino ratio below; previously a zero
    # volatility produced +/-inf here.
    sharpe_ratio = excess_return / portfolio_volatility if portfolio_volatility != 0 else np.nan
    # Downside volatility as implemented here: standard deviation of the
    # return series with all non-negative returns replaced by 0.
    downside_returns = np.where(portfolio_returns < 0, portfolio_returns, 0)
    downside_volatility = np.std(downside_returns) * np.sqrt(252)
    sortino_ratio = excess_return / downside_volatility if downside_volatility != 0 else np.nan
    return {
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Annualized Return": round(portfolio_return, 4),
        "Annualized Volatility": round(portfolio_volatility, 4),
        "Downside Volatility": round(downside_volatility, 4)
    }
# Build a term-frequency matrix for all tickers
def build_term_frequency_matrix(headlines_dict):
    """Build a document-term count matrix with one row per entry of ``headlines_dict``.

    Every value (an iterable of headline strings) is joined into one document,
    cleaned with ``preprocess_text`` and re-joined with spaces before being
    counted by ``CountVectorizer``. Rows follow the iteration order of
    ``headlines_dict.values()``.

    NOTE: an earlier definition with the same name appears above in this cell;
    this later one is the definition in effect at run time. Keep the two in
    sync or remove one.

    Returns:
        tuple: (CSR sparse count matrix, feature names reported by the vectorizer).
    """
    documents = []
    for headlines in headlines_dict.values():
        tokens = preprocess_text(' '.join(headlines))
        documents.append(' '.join(tokens))
    counter = CountVectorizer()
    counts = counter.fit_transform(documents)
    return csr_matrix(counts), counter.get_feature_names_out()

# Calculate Lexical Ratio using term frequency matrix
def calculate_lr_matrix(term_freq_matrix, weights):
    """Return the normalized entropy of the weight-combined term distribution.

    The rows of ``term_freq_matrix`` are combined with ``weights`` into one
    vector of term masses, which is normalized to probabilities. The entropy of
    that distribution is divided by ``log(m)``, where ``m`` is the number of
    columns (terms).

    NOTE: an earlier definition with the same name appears above in this cell;
    this later one is the definition in effect at run time. Keep the two in
    sync or remove one.

    Args:
        term_freq_matrix: 2-D count matrix (rows are combined by ``weights``).
        weights: one weight per row of ``term_freq_matrix``.

    Returns:
        The ratio described above, or 0.0 when there are fewer than two terms
        or when the weighted term mass is not positive.
    """
    weighted_matrix = term_freq_matrix.T.dot(weights)
    weighted_matrix = np.array(weighted_matrix).flatten()
    total_weight = np.sum(weighted_matrix)
    m = term_freq_matrix.shape[1]
    # Without this guard a zero total (all weight on rows without any terms)
    # produced 0/0 = NaN, which the optimizer cannot work with.
    if m <= 1 or not total_weight > 0:
        return 0.0
    probabilities = weighted_matrix / total_weight
    # The 1e-10 offset keeps log() finite for zero-probability terms.
    H_combined = -np.sum(probabilities * np.log(probabilities + 1e-10))
    return H_combined / np.log(m)

# Optimization using Lexical Ratio (LR)
def optimize_portfolio_lr(tickers, returns, target_returns, headlines_dict):
    """Find long-only weights that maximize the lexical ratio for each target return.

    Args:
        tickers: asset identifiers; only their count is used here.
        returns: table of periodic returns passed to
            ``calculate_weighted_return_percentage``.
        target_returns: iterable of minimum annualized returns, solved one by one.
        headlines_dict: mapping handed to ``build_term_frequency_matrix``; its
            iteration order must match the order of the weights.

    Returns:
        list of ``(target_return, weights)`` tuples, containing only the targets
        for which SLSQP reported success (so it may be shorter than
        ``target_returns``).
    """
    term_matrix, _ = build_term_frequency_matrix(headlines_dict)
    asset_count = len(tickers)

    def negative_lr(w):
        # SLSQP minimizes, so the ratio is negated in order to maximize it.
        return -calculate_lr_matrix(term_matrix, w)

    def budget_gap(w):
        # Equality constraint: weights sum to one.
        return np.sum(w) - 1

    solved = []
    for target_return in target_returns:
        def return_slack(w):
            # Inequality constraint: annualized return at least the target.
            return calculate_weighted_return_percentage(returns, w) - target_return

        outcome = minimize(
            negative_lr,
            np.ones(asset_count) / asset_count,
            method='SLSQP',
            bounds=tuple((0, 1) for _ in range(asset_count)),
            options=options,
            constraints=[
                {'type': 'eq', 'fun': budget_gap},
                {'type': 'ineq', 'fun': return_slack},
            ],
        )
        if outcome.success:
            solved.append((target_return, outcome.x))

    return solved

# Markowitz Optimization
def traditional_markowitz(tickers, returns, target_returns):
    """Minimize annualized portfolio volatility subject to a minimum return.

    Args:
        tickers: asset identifiers; only their count is used here.
        returns: DataFrame of periodic returns, one column per asset.
        target_returns: iterable of minimum annualized returns, solved one by one.

    Returns:
        list of ``(target_return, weights)`` tuples, containing only the targets
        for which SLSQP reported success (so it may be shorter than
        ``target_returns``).
    """
    n = len(tickers)
    results = []

    # Loop-invariant inputs, computed once instead of on every solver evaluation.
    annual_cov = returns.cov() * 252
    mean_returns = returns.mean()
    bounds = tuple((0, 1) for _ in range(n))
    initial_weights = np.ones(n) / n

    def objective(weights):
        return np.sqrt(np.dot(weights.T, np.dot(annual_cov, weights)))

    for target_return in target_returns:
        constraints = [
            # Fully invested: weights sum to one.
            {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1},
            # Annualized return at least the target (bound at definition time).
            {'type': 'ineq', 'fun': lambda weights, target=target_return: np.sum(mean_returns * weights) * 252 - target}
        ]
        result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, options=options, constraints=constraints)
        if result.success:
            results.append((target_return, result.x))

    return results

# DR-SD Optimization
def optimize_portfolio_dr_sd(tickers, returns, target_returns):
    """Minimize portfolio SD relative to the mean single-asset SD.

    The objective is the annualized standard deviation of the portfolio return
    series divided by the unweighted mean of the annualized standard deviations
    of the individual columns of ``returns``.

    NOTE(review): the denominator does not depend on the weights, so this
    minimizes portfolio standard deviation up to a constant factor. Confirm
    this is the intended diversification-ratio definition.

    Args:
        tickers: asset identifiers; only their count is used here.
        returns: DataFrame of periodic returns, one column per asset.
        target_returns: iterable of minimum annualized returns, solved one by one.

    Returns:
        list of ``(target_return, weights)`` tuples, containing only the targets
        for which SLSQP reported success.
    """
    n = len(tickers)
    results = []

    # Loop-invariant inputs, computed once instead of on every solver evaluation.
    returns_matrix = np.asarray(returns)
    mean_individual_sd = np.mean(np.std(returns, axis=0) * np.sqrt(252))
    mean_returns = returns.mean()
    bounds = tuple((0, 1) for _ in range(n))
    initial_weights = np.ones(n) / n

    def objective(weights):
        portfolio_sd = np.std(np.dot(returns_matrix, weights)) * np.sqrt(252)
        return portfolio_sd / mean_individual_sd

    for target_return in target_returns:
        constraints = [
            # Fully invested: weights sum to one.
            {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1},
            # Annualized return at least the target (bound at definition time).
            {'type': 'ineq', 'fun': lambda weights, target=target_return: np.sum(mean_returns * weights) * 252 - target}
        ]

        result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, options=options, constraints=constraints)
        if result.success:
            results.append((target_return, result.x))

    return results

# DR-VaR Optimization
def optimize_portfolio_dr_var(tickers, returns, target_returns, alpha=0.05):
    """Minimize the portfolio return percentile relative to the mean asset percentile.

    "Value at risk" here is the ``100 * alpha`` percentile of a return series.
    The objective is the portfolio percentile scaled by ``sqrt(252)`` and
    divided by the unweighted mean of the per-column percentiles (which are
    not scaled).

    NOTE(review): the denominator is constant in the weights and its sign
    decides whether minimizing the ratio raises or lowers the portfolio
    percentile. Confirm this matches the intended definition.

    Args:
        tickers: asset identifiers; only their count is used here.
        returns: DataFrame of periodic returns, one column per asset.
        target_returns: iterable of minimum annualized returns, solved one by one.
        alpha: tail probability used for the percentile.

    Returns:
        list of ``(target_return, weights)`` tuples, containing only the targets
        for which SLSQP reported success.
    """
    n = len(tickers)
    results = []

    def value_at_risk(returns, alpha=alpha):
        return np.percentile(returns, 100 * alpha)

    # Loop-invariant inputs, computed once instead of on every solver evaluation.
    returns_matrix = np.asarray(returns)
    individual_vars = np.array([value_at_risk(returns.iloc[:, i]) for i in range(returns.shape[1])])
    mean_individual_var = np.mean(individual_vars)
    mean_returns = returns.mean()
    bounds = tuple((0, 1) for _ in range(n))
    initial_weights = np.ones(n) / n

    def objective(weights):
        portfolio_returns = np.dot(returns_matrix, weights)
        portfolio_var = value_at_risk(portfolio_returns) * np.sqrt(252)
        return portfolio_var / mean_individual_var

    for target_return in target_returns:
        constraints = [
            # Fully invested: weights sum to one.
            {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1},
            # Annualized return at least the target (bound at definition time).
            {'type': 'ineq', 'fun': lambda weights, target=target_return: np.sum(mean_returns * weights) * 252 - target}
        ]

        result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, options=options, constraints=constraints)
        if result.success:
            results.append((target_return, result.x))

    return results

# Fetch historical returns for tickers with error handling
def fetch_historical_returns(tickers, start_date, end_date):
    """Download adjusted closes with yfinance and return periodic returns.

    Args:
        tickers: iterable of ticker symbols; "." is replaced by "-" for the
            download, while the original symbol is kept as the column name.
        start_date: start of the download range, passed to ``yf.download``.
        end_date: end of the download range, passed to ``yf.download``.

    Returns:
        tuple: (DataFrame with one return column per usable ticker, list of the
        usable tickers in input order). Tickers whose download fails or yields
        no returns are skipped.
    """
    valid_tickers = []
    all_returns = pd.DataFrame()
    for ticker in tickers:
        tickery = ticker.replace(".", "-")  # yfinance form
        try:
            # Request unadjusted columns explicitly so that 'Adj Close' exists
            # even when the installed yfinance adjusts prices by default.
            # NOTE(review): assumes the installed yfinance accepts auto_adjust - confirm.
            data = yf.download(tickery, start=start_date, end=end_date, progress=False, auto_adjust=False)['Adj Close']
            if data.empty:
                continue
            if isinstance(data, pd.DataFrame):
                # The selection can come back as a one-column frame instead of
                # a Series (per-ticker column level); reduce it to that column.
                data = data.iloc[:, 0]
            returns = data.pct_change().dropna()
            if returns.empty:
                # A single price row yields no returns; do not count the ticker as valid.
                continue
            all_returns[ticker] = returns
            valid_tickers.append(ticker)
        except Exception as exc:
            # Still best-effort, but report which ticker was dropped and why.
            print(f"Skipping {ticker}: could not fetch returns ({exc})")

    return all_returns, valid_tickers

# Generate date ranges for optimization and testing
def generate_date_ranges(start_date, end_date, opt_period_months, test_period_months, step_months=3):
    """Generate rolling (optimization window, testing window) date pairs.

    A month is approximated as 30 days. Each testing window starts on the day
    the optimization window ends. Generation stops as soon as a testing window
    would end after ``end_date``.

    Args:
        start_date: ``datetime`` at which the first optimization window starts.
        end_date: ``datetime`` no testing window may end after.
        opt_period_months: length of each optimization window in 30-day months.
        test_period_months: length of each testing window in 30-day months.
        step_months: distance between consecutive window starts in 30-day
            months (default 3, the previously hard-coded value).

    Returns:
        list of ``((opt_start, opt_end), (test_start, test_end))`` tuples with
        all dates formatted as 'YYYY-MM-DD' strings.

    Raises:
        ValueError: if ``step_months`` is not positive (the loop would never end).
    """
    if step_months <= 0:
        raise ValueError("step_months must be positive")

    opt_span = timedelta(days=opt_period_months * 30)
    test_span = timedelta(days=test_period_months * 30)
    step = timedelta(days=step_months * 30)

    date_ranges = []
    start = start_date
    while start < end_date:
        opt_end = start + opt_span
        test_start = opt_end
        test_end = test_start + test_span

        if test_end > end_date:
            break

        date_ranges.append(((start.strftime('%Y-%m-%d'), opt_end.strftime('%Y-%m-%d')),
                            (test_start.strftime('%Y-%m-%d'), test_end.strftime('%Y-%m-%d'))))

        start = start + step

    return date_ranges

# Function to plot results
def _plot_metric_lines(metric, values_by_method, date_labels, line_styles, portfolio_name):
    """Draw one figure for ``metric`` with one line per optimization method."""
    plt.figure(figsize=(10, 6))
    for method, values in values_by_method.items():
        linestyle, marker = line_styles[method]
        plt.plot(date_labels, values, label=method, linestyle=linestyle, marker=marker, markersize=8)
    plt.title(f'{metric} - {portfolio_name}')
    plt.xlabel('Date Interval Tested On')
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()


def plot(portfolio_files, start_date='2018-01-01', end_date='2024-06-30', opt_period_months=24, test_period_months=6, target_returns=[0.07,0.1,0.13,0.16], risk_free_rate=0.024):
    """Plot out-of-sample metrics of the four optimization methods per test window.

    For every portfolio file and every rolling window, each method is optimized
    on the optimization window for the FIRST entry of ``target_returns`` and
    evaluated with ``calculate_metrics`` on the following testing window. One
    figure per metric is shown for each portfolio.

    Changes compared with the previous version:
      * the first default target was 0.7, a typo for 0.07 (``ranks`` uses 0.07);
      * only the first target is optimized, since only it is plotted; a window
        is skipped when any method fails for it, so all methods are compared
        on the same target;
      * windows without news, or whose testing window offers a different set
        of tickers than the optimization window, are skipped with a message
        instead of raising.

    Args:
        portfolio_files: paths of text files with one ticker per line.
        start_date: first day of the first optimization window ('YYYY-MM-DD').
        end_date: last day any testing window may reach ('YYYY-MM-DD').
        opt_period_months: optimization window length (30-day months).
        test_period_months: testing window length (30-day months).
        target_returns: minimum annualized returns; only the first is used.
        risk_free_rate: rate passed to ``calculate_metrics``.

    Returns:
        None. Figures are shown with ``plt.show()`` and progress is printed.
    """
    methods = [
        'Lexical Ratio',
        'Volatility (Markowitz)',
        'Diversification Ratio based on Standard Deviation',
        'Diversification Ratio based on Value at Risk',
    ]
    # Keys of the dict returned by calculate_metrics, in plotting order.
    metric_names = ['Sortino Ratio', 'Sharpe Ratio', 'Downside Volatility', 'Annualized Volatility', 'Annualized Return']
    # Line style and marker per method.
    line_styles = {
        'Lexical Ratio': ('-', 'o'),  # Solid line with circles
        'Volatility (Markowitz)': ('--', 's'),  # Dashed line with squares
        'Diversification Ratio based on Standard Deviation': (':', '^'),  # Dotted line with triangles
        'Diversification Ratio based on Value at Risk': ('-.', 'D')  # Dash-dot line with diamonds
    }

    if len(target_returns) == 0:
        print("No target returns given; nothing to plot")
        return
    # Only the first target is plotted, so only that one is optimized.
    plot_targets = [target_returns[0]]

    # The windows do not depend on the portfolio, so generate them once.
    date_ranges = generate_date_ranges(datetime.strptime(start_date, '%Y-%m-%d'), datetime.strptime(end_date, '%Y-%m-%d'), opt_period_months, test_period_months)

    for file_name in portfolio_files:
        print(f"\nProcessing portfolio: {file_name}\n")

        # Load portfolio tickers, ignoring blank lines
        with open(file_name, 'r') as file:
            tickers = [line.strip() for line in file if line.strip()]

        if not tickers:
            print(f"No tickers found in file: {file_name}")
            continue

        # collected[metric][method] -> one value per evaluated test window
        collected = {metric: {method: [] for method in methods} for metric in metric_names}
        date_labels = []

        for opt_window, test_window in date_ranges:
            window_start, window_end = opt_window
            test_start, test_end = test_window

            print(f"\nOptimizing on window: {window_start} to {window_end}\nTesting on window: {test_start} to {test_end}\n")

            # Fetch historical returns for the optimization window
            returns, valid_tickers = fetch_historical_returns(tickers, window_start, window_end)
            if not valid_tickers:
                print(f"No valid tickers found for optimization in file: {file_name} for window: {window_start} to {window_end}")
                continue

            # Fetch news headlines for each stock in the optimization window
            headlines_dict = {
                ticker: fetch_news_from_csv(ticker, start_date=window_start, end_date=window_end)
                for ticker in valid_tickers
            }
            if not any(headlines_dict.values()):
                # Same guard as in ranks(): without any text no term matrix can be built.
                print(f"No news fetched for any tickers in file: {file_name} for window: {window_start} to {window_end}")
                continue

            # Fetch historical returns for the testing window
            test_returns, valid_tickers_test = fetch_historical_returns(valid_tickers, test_start, test_end)
            if not valid_tickers_test:
                print(f"No valid tickers for testing window: {test_start} to {test_end}")
                continue
            if valid_tickers_test != valid_tickers:
                # The weights are positional; a different ticker set cannot be evaluated.
                print(f"Tickers available for testing window: {test_start} to {test_end} differ from the optimization window; skipping")
                continue

            # Optimize portfolios for the current window
            optimized = {
                'Lexical Ratio': optimize_portfolio_lr(valid_tickers, returns, plot_targets, headlines_dict),
                'Volatility (Markowitz)': traditional_markowitz(valid_tickers, returns, target_returns=plot_targets),
                'Diversification Ratio based on Standard Deviation': optimize_portfolio_dr_sd(valid_tickers, returns, plot_targets),
                'Diversification Ratio based on Value at Risk': optimize_portfolio_dr_var(valid_tickers, returns, plot_targets),
            }
            missing = [method for method in methods if not optimized[method]]
            if missing:
                print(f"Optimization results missing for some methods: {', '.join(missing)}")
                continue

            # Calculate and collect performance metrics for the testing window
            for method in methods:
                weights = optimized[method][0][1]
                method_metrics = calculate_metrics(weights, test_returns, risk_free_rate)
                for metric in metric_names:
                    collected[metric][method].append(method_metrics[metric])

            # Store the date label for the x-axis
            date_labels.append(f"{test_start} to {test_end}")

        if not date_labels:
            print(f"No results to plot for portfolio: {file_name}")
            continue

        portfolio_name = file_name.replace(".txt", "")
        for metric in metric_names:
            _plot_metric_lines(metric, collected[metric], date_labels, line_styles, portfolio_name)



In [None]:
#Ranking and values
def ranks(portfolio_files, start_date='2018-01-01', end_date='2024-06-30', opt_period_months=24, test_period_months=6, target_returns=[0.07, 0.1, 0.13, 0.16], risk_free_rate=0.024):
    """Print mean out-of-sample metrics and a per-metric ranking of the methods.

    For every portfolio file and every rolling window, all four methods are
    optimized on the optimization window for each target return and evaluated
    with ``calculate_metrics`` on the following testing window. Metrics are
    averaged over all windows and targets, then the methods are ranked per
    metric (lower is better for the two volatility metrics).

    Changes compared with the previous version:
      * tickers and headlines come from the optimization window (previously the
        testing window's ticker list was combined with optimization returns);
      * results are matched by target value rather than list position, because
        the optimizers omit failed targets;
      * a target is only counted when all four methods solved it;
      * testing-window returns are downloaded once per window.

    Args:
        portfolio_files: paths of text files with one ticker per line.
        start_date: first day of the first optimization window ('YYYY-MM-DD').
        end_date: last day any testing window may reach ('YYYY-MM-DD').
        opt_period_months: optimization window length (30-day months).
        test_period_months: testing window length (30-day months).
        target_returns: minimum annualized returns to optimize for.
        risk_free_rate: rate passed to ``calculate_metrics``.

    Returns:
        None. All results are printed.
    """
    methods = ('lr', 'markowitz', 'dr_sd', 'dr_var')
    lower_is_better = ("Annualized Volatility", "Downside Volatility")

    # The sliding windows do not depend on the portfolio, so generate them once.
    date_ranges = generate_date_ranges(datetime.strptime(start_date, '%Y-%m-%d'), datetime.strptime(end_date, '%Y-%m-%d'), opt_period_months, test_period_months)

    for file_name in portfolio_files:
        print(f"\nProcessing portfolio: {file_name}\n")

        # Load portfolio tickers, ignoring blank lines
        with open(file_name, 'r') as file:
            tickers = [line.strip() for line in file if line.strip()]

        if not tickers:
            print(f"No tickers found in file: {file_name}")
            continue

        # One list of metric dicts per method, over all windows and targets
        metrics_accumulator = {method: [] for method in methods}

        for opt_window, test_window in date_ranges:
            window_start, window_end = opt_window
            test_start, test_end = test_window

            print(f"\nOptimizing on window: {window_start} to {window_end}\nTesting on window: {test_start} to {test_end}\n")

            # Fetch historical returns for the optimization window
            returns, valid_tickers = fetch_historical_returns(tickers, window_start, window_end)
            if not valid_tickers:
                print(f"No valid tickers found for optimization in file: {file_name} for window: {window_start} to {window_end}")
                continue

            # Fetch news headlines for each stock in the optimization window
            headlines_dict = {
                ticker: fetch_news_from_csv(ticker, start_date=window_start, end_date=window_end)
                for ticker in valid_tickers
            }
            if not any(headlines_dict.values()):
                print(f"No news fetched for any tickers in file: {file_name} for window: {window_start} to {window_end}")
                continue

            # Fetch historical returns for the testing window
            test_returns, valid_tickers_test = fetch_historical_returns(valid_tickers, test_start, test_end)
            if valid_tickers_test != valid_tickers:
                # The weights are positional; a different ticker set cannot be evaluated.
                print(f"Tickers available for testing window: {test_start} to {test_end} differ from the optimization window; skipping")
                continue

            # Optimize portfolios for the current window. Each optimizer returns
            # (target, weights) pairs for successful targets only, so index by target.
            weights_by_target = {
                'lr': dict(optimize_portfolio_lr(valid_tickers, returns, target_returns, headlines_dict)),
                'markowitz': dict(traditional_markowitz(valid_tickers, returns, target_returns)),
                'dr_sd': dict(optimize_portfolio_dr_sd(valid_tickers, returns, target_returns)),
                'dr_var': dict(optimize_portfolio_dr_var(valid_tickers, returns, target_returns)),
            }

            # Calculate performance metrics for the testing window across all target returns
            for target_return in target_returns:
                window_weights = {method: weights_by_target[method].get(target_return) for method in methods}
                if any(weights is None for weights in window_weights.values()):
                    # Count a target only when every method solved it.
                    continue
                for method in methods:
                    metrics_accumulator[method].append(
                        calculate_metrics(window_weights[method], test_returns, risk_free_rate)
                    )

        if not metrics_accumulator['lr']:
            # All methods are appended together, so one empty list means all are empty.
            print(f"No metrics could be computed for portfolio: {file_name}")
            continue

        # Mean of every metric over all intervals and target returns.
        # A NaN ratio in any window makes the corresponding mean NaN.
        mean_metrics = {method: {} for method in methods}
        for method, metrics_list in metrics_accumulator.items():
            for metric in metrics_list[0].keys():
                mean_metrics[method][metric] = np.mean([metrics[metric] for metrics in metrics_list])

        # Print mean metrics for the current portfolio
        print(f"\nMean Metrics over all intervals for Portfolio: {file_name}")
        for method, metrics in mean_metrics.items():
            print(f"\nMethod: {method.capitalize()}")
            for metric, value in metrics.items():
                print(f"{metric}: {value:.4f}")

        # Ranking based on metrics: descending, except for the volatility metrics
        print(f"\nRanking for Portfolio: {file_name}")
        ranking = {
            metric: sorted(
                mean_metrics.keys(),
                key=lambda method, metric=metric: mean_metrics[method][metric],
                reverse=(metric not in lower_is_better),
            )
            for metric in mean_metrics['lr'].keys()
        }

        print(f"\nRanking of Methods for Portfolio: {file_name}")
        for metric, ranked_methods in ranking.items():
            print(f"\nMetric: {metric}")
            for rank, method in enumerate(ranked_methods, 1):
                print(f"{rank}. {method.capitalize()} with {metric}: {mean_metrics[method][metric]:.4f}")




In [None]:
# Rank the optimization methods on every portfolio file collected in file_array.
portfolio_files = file_array
ranks(portfolio_files)

In [None]:
# Plot the per-window test metrics for the same portfolio files
# (portfolio_files is assigned in the previous cell).
plot(portfolio_files)