<a href="https://colab.research.google.com/github/gustikresna/LLMs-StockMovement-Forecasting/blob/main/Portfolio_Analysis_Fine_Tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMPORT LIBRARIES AND LOAD DATA**

In [None]:
#import libraries
import pandas as pd
import numpy as np
from google.colab import files, runtime

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# load test dataset
test_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv'
test_df = pd.read_csv(test_path)
test_df['start_date'] = pd.to_datetime(test_df['start_date'])

# group by start_date of the week since the original test_df still contains multiple instances per week
test_df_weekly = test_df.groupby(['permco', 'start_date'])['weekly_ret'].last().reset_index()

In [None]:
# load S&P 500 for benchmark
spx500_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/spx500_weekly_returns.csv'
spx500_df = pd.read_csv(spx500_path)
spx500_df['caldt'] = pd.to_datetime(spx500_df['caldt'])

In [None]:
# load market cap of selected stocks
marketcap_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/market_cap.csv'
marketcap_df = pd.read_csv(marketcap_path)
marketcap_df['week_start_date'] = pd.to_datetime(marketcap_df['week_start_date'])

# **BERT**

## **Portfolio**

In [None]:
# load bert's accuracy of rolling window prediction
bert_path_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_accuracy.csv'
bert_accuracy = pd.read_csv(bert_path_accuracy)

# load bert's prediction of rolling window prediction
bert_path_pred = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_prediction.csv'
bert_pred = pd.read_csv(bert_path_pred)

#change to datetime
bert_pred['week_date'] = pd.to_datetime(bert_pred['week_date'])

In [None]:
# encode price_direction
label_mapping = {0: 'negative', 1: 'positive'}
bert_pred['prediction'] = bert_pred['prediction'].map(label_mapping)
bert_pred['actual'] = bert_pred['actual'].map(label_mapping)

In [None]:
# merge to get return data
bert_ret = pd.merge(bert_pred, test_df_weekly, how='left', left_on=['company', 'week_date'], right_on=['permco', 'start_date'])
bert_ret.drop(['permco', 'start_date'], axis=1, inplace=True)

In [None]:
# merge to get market cap data
bert_ret = pd.merge(bert_ret, marketcap_df, how='left', left_on=['company', 'week_date'], right_on=['permco', 'week_start_date'])
bert_ret.drop(['permco', 'week_start_date'], axis=1, inplace=True)

In [None]:
# merge to get market (S&P 500) return
bert_df = pd.merge(bert_ret, spx500_df, left_on='week_date', right_on='caldt')
bert_df.drop('caldt', axis=1, inplace=True)

# sort
bert_df.sort_values(by=['company', 'week_date'], inplace=True)
bert_df.reset_index(drop=True, inplace=True)

In [None]:
cumulative_log_returns_by_date = {
    'date': [],
    'cum_EL_return': [],
    'cum_ES_return': [],
    'cum_ELS_return': [],
    'cum_VL_return': [],
    'cum_VS_return': [],
    'cum_VLS_return': [],
    'cum_market_return': []
}

# group by week_date
grouped_neg = bert_df.loc[bert_df.groupby(['week_date', 'company'])['probability_neg'].idxmax()]
grouped_pos = bert_df.loc[bert_df.groupby(['week_date', 'company'])['probability_pos'].idxmax()]

# initialise cumulative log returns
cum_EL_return = 0
cum_ES_return = 0
cum_ELS_return = 0
cum_VL_return = 0
cum_VS_return = 0
cum_VLS_return = 0
cum_market_return = 0

# iterate over week
for week in bert_df['week_date'].unique():
    group_neg = grouped_neg.loc[grouped_neg['week_date'] == week]
    group_pos = grouped_pos.loc[grouped_pos['week_date'] == week]

    # sort by probability for positive and negative predictions
    top_positive = group_pos.sort_values(by='probability_pos', ascending=False).head(5)
    top_negative = group_neg.sort_values(by='probability_neg', ascending=False).head(5)

    # calculate log returns
    top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])
    top_negative['log_return'] = np.log1p(top_negative['weekly_ret'])
    bert_df.loc[bert_df['week_date'] == week, 'log_return'] = np.log1p(bert_df.loc[bert_df['week_date'] == week, 'weekly_ret'])

    # Equal-Weighted Long Log Returns
    equal_long_log_return = top_positive['log_return'].mean()

    # Equal-Weighted Short Log Returns
    equal_short_log_return = top_negative['log_return'].mean() * -1  # Negate for short

    # Equal-Weighted Long-Short Log Returns
    equal_long_short_log_return = equal_long_log_return + equal_short_log_return

    # Value-Weighted Long Log Returns based on market cap
    total_market_cap_positive = top_positive['market_cap'].sum()
    value_long_log_return = (top_positive['log_return'] * top_positive['market_cap']).sum() / total_market_cap_positive

    # Value-Weighted Short Log Returns based on market cap
    total_market_cap_negative = top_negative['market_cap'].sum()
    value_short_log_return = (top_negative['log_return'] * top_negative['market_cap']).sum() / total_market_cap_negative * -1  # Negate for short

    # Value-Weighted Long-Short Log Returns
    value_long_short_log_return = value_long_log_return + value_short_log_return

    # Market Log Return (equal-weighted average of all assets in the group)
    market_log_return = bert_df.loc[bert_df['week_date'] == week, 'log_return'].mean()

    # update cumulative log returns
    cum_EL_return += equal_long_log_return
    cum_ES_return += equal_short_log_return
    cum_ELS_return += equal_long_short_log_return
    cum_VL_return += value_long_log_return
    cum_VS_return += value_short_log_return
    cum_VLS_return += value_long_short_log_return
    cum_market_return += market_log_return

    # append results for this date
    cumulative_log_returns_by_date['date'].append(week)
    cumulative_log_returns_by_date['cum_EL_return'].append(cum_EL_return)
    cumulative_log_returns_by_date['cum_ES_return'].append(cum_ES_return)
    cumulative_log_returns_by_date['cum_ELS_return'].append(cum_ELS_return)
    cumulative_log_returns_by_date['cum_VL_return'].append(cum_VL_return)
    cumulative_log_returns_by_date['cum_VS_return'].append(cum_VS_return)
    cumulative_log_returns_by_date['cum_VLS_return'].append(cum_VLS_return)
    cumulative_log_returns_by_date['cum_market_return'].append(cum_market_return)

# convert to dataframe
cumulative_log_returns_bert = pd.DataFrame(cumulative_log_returns_by_date)

In [None]:
# define path to save results
path_bert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune/bert_portfolio.csv'

# save to csv
cumulative_log_returns_bert.to_csv(path_bert, index=False)

## **Sharpe Ratio**

In [None]:
# calculate weekly log returns from cumulative log returns
log_returns = cumulative_log_returns_bert.set_index('date').diff().dropna()

# define function to calculate Sharpe Ratio
def calculate_sharpe_ratio(return_series, risk_free_rate=0):
    mean_return = return_series.mean()
    std_return = return_series.std()
    excess_return = mean_return - risk_free_rate
    sharpe_ratio = (excess_return / std_return) * np.sqrt(52)
    return mean_return, std_return, sharpe_ratio

sharpe_ratios = []

for column in log_returns.columns:
    mean_return, std_return, sharpe_ratio = calculate_sharpe_ratio(log_returns[column])
    sharpe_ratios.append({
        'Portfolio': column,
        'Mean Return': mean_return,
        'Standard Deviation': std_return,
        'Sharpe Ratio': sharpe_ratio
    })

# convert to dataframe
sharpe_ratios_df = pd.DataFrame(sharpe_ratios)

In [None]:
sharpe_ratios_df

Unnamed: 0,Portfolio,Mean Return,Standard Deviation,Sharpe Ratio
0,cum_EL_return,0.003318,0.025703,0.931024
1,cum_ES_return,-0.003587,0.02651,-0.975805
2,cum_ELS_return,-0.000269,0.017809,-0.108858
3,cum_VL_return,0.002836,0.024365,0.839324
4,cum_VS_return,-0.003499,0.025969,-0.971606
5,cum_VLS_return,-0.000663,0.016811,-0.284459
6,cum_market_return,0.002172,0.023843,0.656927


In [None]:
# disconnect run time
runtime.unassign()

# **RoBERTa**

## **Portfolio**

In [None]:
# load roberta's accuracy of rolling window prediction
roberta_path_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_accuracy.csv'
roberta_accuracy = pd.read_csv(roberta_path_accuracy)

# load roberta's prediction of rolling window prediction
roberta_path_pred = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_prediction.csv'
roberta_pred = pd.read_csv(roberta_path_pred)

# change to datetime
roberta_pred['week_date'] = pd.to_datetime(roberta_pred['week_date'])

In [None]:
# encode price_direction
label_mapping = {0: 'negative', 1: 'positive'}
roberta_pred['prediction'] = roberta_pred['prediction'].map(label_mapping)
roberta_pred['actual'] = roberta_pred['actual'].map(label_mapping)

In [None]:
# merge to get return data
roberta_ret = pd.merge(roberta_pred, test_df_weekly, how='left', left_on=['company', 'week_date'], right_on=['permco', 'start_date'])
roberta_ret.drop(['permco', 'start_date'], axis=1, inplace=True)

In [None]:
# merge to get market cap data
roberta_ret = pd.merge(roberta_ret, marketcap_df, how='left', left_on=['company', 'week_date'], right_on=['permco', 'week_start_date'])
roberta_ret.drop(['permco', 'week_start_date'], axis=1, inplace=True)

In [None]:
# merge to get market (S&P 500) return
roberta_df = pd.merge(roberta_ret, spx500_df, left_on='week_date', right_on='caldt')
roberta_df.drop('caldt', axis=1, inplace=True)

# sort
roberta_df.sort_values(by=['company', 'week_date'], inplace=True)
roberta_df.reset_index(drop=True, inplace=True)

In [None]:
cumulative_log_returns_by_date = {
    'date': [],
    'cum_EL_return': [],
    'cum_ES_return': [],
    'cum_ELS_return': [],
    'cum_VL_return': [],
    'cum_VS_return': [],
    'cum_VLS_return': [],
    'cum_market_return': []
}

# group by week_date and company to get the maximum probabilities for each company in a week
grouped_max_neg = roberta_df.loc[roberta_df.groupby(['week_date', 'company'])['probability_neg'].idxmax()]
grouped_max_pos = roberta_df.loc[roberta_df.groupby(['week_date', 'company'])['probability_pos'].idxmax()]

# group by week_date
grouped_neg = grouped_max_neg.groupby('week_date')
grouped_pos = grouped_max_pos.groupby('week_date')

# initialise cumulative log returns
cum_EL_return = 0
cum_ES_return = 0
cum_ELS_return = 0
cum_VL_return = 0
cum_VS_return = 0
cum_VLS_return = 0
cum_market_return = 0

# iterate over week
for week in roberta_df['week_date'].unique():
    group_neg = grouped_neg.get_group(week)
    group_pos = grouped_pos.get_group(week)

    # sort by probability for positive and negative predictions
    top_positive = group_pos.sort_values(by='probability_pos', ascending=False).head(5)
    top_negative = group_neg.sort_values(by='probability_neg', ascending=False).head(5)

    # calculate log returns
    top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])
    top_negative['log_return'] = np.log1p(top_negative['weekly_ret'])
    roberta_df.loc[roberta_df['week_date'] == week, 'log_return'] = np.log1p(roberta_df.loc[roberta_df['week_date'] == week, 'weekly_ret'])

    # Equal-Weighted Long Log Returns
    equal_long_log_return = top_positive['log_return'].mean()

    # Equal-Weighted Short Log Returns
    equal_short_log_return = top_negative['log_return'].mean() * -1  # Negate for short

    # Equal-Weighted Long-Short Log Returns
    equal_long_short_log_return = equal_long_log_return + equal_short_log_return

    # Value-Weighted Long Log Returns based on market cap
    total_market_cap_positive = top_positive['market_cap'].sum()
    value_long_log_return = (top_positive['log_return'] * top_positive['market_cap']).sum() / total_market_cap_positive

    # Value-Weighted Short Log Returns based on market cap
    total_market_cap_negative = top_negative['market_cap'].sum()
    value_short_log_return = (top_negative['log_return'] * top_negative['market_cap']).sum() / total_market_cap_negative * -1  # Negate for short

    # Value-Weighted Long-Short Log Returns
    value_long_short_log_return = value_long_log_return + value_short_log_return

    # Market Log Return (equal-weighted average of all assets in the group)
    market_log_return = roberta_df.loc[roberta_df['week_date'] == week, 'log_return'].mean()

    # update cumulative log returns
    cum_EL_return += equal_long_log_return
    cum_ES_return += equal_short_log_return
    cum_ELS_return += equal_long_short_log_return
    cum_VL_return += value_long_log_return
    cum_VS_return += value_short_log_return
    cum_VLS_return += value_long_short_log_return
    cum_market_return += market_log_return

    # append results for this date
    cumulative_log_returns_by_date['date'].append(week)
    cumulative_log_returns_by_date['cum_EL_return'].append(cum_EL_return)
    cumulative_log_returns_by_date['cum_ES_return'].append(cum_ES_return)
    cumulative_log_returns_by_date['cum_ELS_return'].append(cum_ELS_return)
    cumulative_log_returns_by_date['cum_VL_return'].append(cum_VL_return)
    cumulative_log_returns_by_date['cum_VS_return'].append(cum_VS_return)
    cumulative_log_returns_by_date['cum_VLS_return'].append(cum_VLS_return)
    cumulative_log_returns_by_date['cum_market_return'].append(cum_market_return)

# convert to dataframe
cumulative_log_returns_roberta = pd.DataFrame(cumulative_log_returns_by_date)

In [None]:
# define path to save results
path_roberta = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune/roberta_portfolio.csv'

# save to csv
cumulative_log_returns_roberta.to_csv(path_roberta, index=False)

## **Sharpe Ratio**

In [None]:
# calculate weekly log returns from cumulative log returns
log_returns = cumulative_log_returns_roberta.set_index('date').diff().dropna()

# define function to calculate Sharpe Ratio
def calculate_sharpe_ratio(return_series, risk_free_rate=0):
    mean_return = return_series.mean()
    std_return = return_series.std()
    excess_return = mean_return - risk_free_rate
    sharpe_ratio = (excess_return / std_return) * np.sqrt(52)
    return mean_return, std_return, sharpe_ratio

sharpe_ratios = []

for column in log_returns.columns:
    mean_return, std_return, sharpe_ratio = calculate_sharpe_ratio(log_returns[column])
    sharpe_ratios.append({
        'Portfolio': column,
        'Mean Return': mean_return,
        'Standard Deviation': std_return,
        'Sharpe Ratio': sharpe_ratio
    })

# convert to dataframe
sharpe_ratios_df = pd.DataFrame(sharpe_ratios)

In [None]:
sharpe_ratios_df

Unnamed: 0,Portfolio,Mean Return,Standard Deviation,Sharpe Ratio
0,cum_EL_return,0.001766,0.026691,0.477135
1,cum_ES_return,-0.002725,0.024995,-0.786106
2,cum_ELS_return,-0.000959,0.018877,-0.366239
3,cum_VL_return,0.001691,0.026219,0.465172
4,cum_VS_return,-0.002469,0.024926,-0.714219
5,cum_VLS_return,-0.000777,0.018503,-0.302966
6,cum_market_return,0.002172,0.023843,0.656927


In [None]:
# disconnect run time
runtime.unassign()

# **DistilBERT**

## **Portfolio**

In [None]:
# load distilbert's accuracy of rolling window prediction
distilbert_path_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_accuracy.csv'
distilbert_accuracy = pd.read_csv(distilbert_path_accuracy)

# load distilbert's prediction of rolling window prediction
distilbert_path_pred = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_prediction.csv'
distilbert_pred = pd.read_csv(distilbert_path_pred)

# change to datetime
distilbert_pred['week_date'] = pd.to_datetime(distilbert_pred['week_date'])

In [None]:
# encode price_direction
label_mapping = {0: 'negative', 1: 'positive'}
distilbert_pred['prediction'] = distilbert_pred['prediction'].map(label_mapping)
distilbert_pred['actual'] = distilbert_pred['actual'].map(label_mapping)

In [None]:
# merge to get return data
distilbert_ret = pd.merge(distilbert_pred, test_df_weekly, how='left', left_on=['company', 'week_date'], right_on=['permco', 'start_date'])
distilbert_ret.drop(['permco', 'start_date'], axis=1, inplace=True)

In [None]:
# merge to get market cap data
distilbert_ret = pd.merge(distilbert_ret, marketcap_df, how='left', left_on=['company', 'week_date'], right_on=['permco', 'week_start_date'])
distilbert_ret.drop(['permco', 'week_start_date'], axis=1, inplace=True)

In [None]:
# merge to get market (S&P 500) return
distilbert_df = pd.merge(distilbert_ret, spx500_df, left_on='week_date', right_on='caldt')
distilbert_df.drop('caldt', axis=1, inplace=True)

# sort
distilbert_df.sort_values(by=['company', 'week_date'], inplace=True)
distilbert_df.reset_index(drop=True, inplace=True)

In [None]:
cumulative_log_returns_by_date = {
    'date': [],
    'cum_EL_return': [],
    'cum_ES_return': [],
    'cum_ELS_return': [],
    'cum_VL_return': [],
    'cum_VS_return': [],
    'cum_VLS_return': [],
    'cum_market_return': []
}

# group by week_date and company to get the maximum probabilities for each company in a week
grouped_max_neg = distilbert_df.loc[distilbert_df.groupby(['week_date', 'company'])['probability_neg'].idxmax()]
grouped_max_pos = distilbert_df.loc[distilbert_df.groupby(['week_date', 'company'])['probability_pos'].idxmax()]

# group by week_date
grouped_neg = grouped_max_neg.groupby('week_date')
grouped_pos = grouped_max_pos.groupby('week_date')

# initialise cumulative log returns
cum_EL_return = 0
cum_ES_return = 0
cum_ELS_return = 0
cum_VL_return = 0
cum_VS_return = 0
cum_VLS_return = 0
cum_market_return = 0

# iterate over week
for week in distilbert_df['week_date'].unique():
    group_neg = grouped_neg.get_group(week)
    group_pos = grouped_pos.get_group(week)

    # sort by probability for positive and negative predictions
    top_positive = group_pos.sort_values(by='probability_pos', ascending=False).head(5)
    top_negative = group_neg.sort_values(by='probability_neg', ascending=False).head(5)

    # calculate log returns
    top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])
    top_negative['log_return'] = np.log1p(top_negative['weekly_ret'])
    distilbert_df.loc[distilbert_df['week_date'] == week, 'log_return'] = np.log1p(distilbert_df.loc[distilbert_df['week_date'] == week, 'weekly_ret'])

    # Equal-Weighted Long Log Returns
    equal_long_log_return = top_positive['log_return'].mean()

    # Equal-Weighted Short Log Returns
    equal_short_log_return = top_negative['log_return'].mean() * -1  # Negate for short

    # Equal-Weighted Long-Short Log Returns
    equal_long_short_log_return = equal_long_log_return + equal_short_log_return

    # Value-Weighted Long Log Returns based on market cap
    total_market_cap_positive = top_positive['market_cap'].sum()
    value_long_log_return = (top_positive['log_return'] * top_positive['market_cap']).sum() / total_market_cap_positive

    # Value-Weighted Short Log Returns based on market cap
    total_market_cap_negative = top_negative['market_cap'].sum()
    value_short_log_return = (top_negative['log_return'] * top_negative['market_cap']).sum() / total_market_cap_negative * -1  # Negate for short

    # Value-Weighted Long-Short Log Returns
    value_long_short_log_return = value_long_log_return + value_short_log_return

    # Market Log Return (equal-weighted average of all assets in the group)
    market_log_return = distilbert_df.loc[distilbert_df['week_date'] == week, 'log_return'].mean()

    # update cumulative log returns
    cum_EL_return += equal_long_log_return
    cum_ES_return += equal_short_log_return
    cum_ELS_return += equal_long_short_log_return
    cum_VL_return += value_long_log_return
    cum_VS_return += value_short_log_return
    cum_VLS_return += value_long_short_log_return
    cum_market_return += market_log_return

    # append results for this date
    cumulative_log_returns_by_date['date'].append(week)
    cumulative_log_returns_by_date['cum_EL_return'].append(cum_EL_return)
    cumulative_log_returns_by_date['cum_ES_return'].append(cum_ES_return)
    cumulative_log_returns_by_date['cum_ELS_return'].append(cum_ELS_return)
    cumulative_log_returns_by_date['cum_VL_return'].append(cum_VL_return)
    cumulative_log_returns_by_date['cum_VS_return'].append(cum_VS_return)
    cumulative_log_returns_by_date['cum_VLS_return'].append(cum_VLS_return)
    cumulative_log_returns_by_date['cum_market_return'].append(cum_market_return)

# convert to dataframe
cumulative_log_returns_distilbert = pd.DataFrame(cumulative_log_returns_by_date)

In [None]:
# define path to save results
path_distilbert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune/distilbert_portfolio.csv'

# save to csv
cumulative_log_returns_distilbert.to_csv(path_distilbert, index=False)

## **Sharpe Ratio**

In [None]:
# calculate weekly log returns from cumulative log returns
log_returns = cumulative_log_returns_distilbert.set_index('date').diff().dropna()

# define function to calculate Sharpe Ratio
def calculate_sharpe_ratio(return_series, risk_free_rate=0):
    mean_return = return_series.mean()
    std_return = return_series.std()
    excess_return = mean_return - risk_free_rate
    sharpe_ratio = (excess_return / std_return) * np.sqrt(52)
    return mean_return, std_return, sharpe_ratio

sharpe_ratios = []

for column in log_returns.columns:
    mean_return, std_return, sharpe_ratio = calculate_sharpe_ratio(log_returns[column])
    sharpe_ratios.append({
        'Portfolio': column,
        'Mean Return': mean_return,
        'Standard Deviation': std_return,
        'Sharpe Ratio': sharpe_ratio
    })

# convert to dataframe
sharpe_ratios_df = pd.DataFrame(sharpe_ratios)

In [None]:
sharpe_ratios_df

Unnamed: 0,Portfolio,Mean Return,Standard Deviation,Sharpe Ratio
0,cum_EL_return,0.002667,0.026866,0.715905
1,cum_ES_return,-0.003086,0.025827,-0.861717
2,cum_ELS_return,-0.000419,0.019346,-0.156218
3,cum_VL_return,0.003154,0.026637,0.853725
4,cum_VS_return,-0.002748,0.025663,-0.772161
5,cum_VLS_return,0.000406,0.018681,0.156547
6,cum_market_return,0.002172,0.023843,0.656927


In [None]:
# disconnect run time
runtime.unassign()

# **DistilRoBERTa**

## **Portfolio**

In [None]:
# load distilroberta's accuracy of rolling window prediction
distilroberta_path_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_accuracy.csv'
distilroberta_accuracy = pd.read_csv(distilroberta_path_accuracy)

# load distilroberta's prediction of rolling window prediction
distilroberta_path_pred = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_prediction.csv'
distilroberta_pred = pd.read_csv(distilroberta_path_pred)

# change to datetime
distilroberta_pred['week_date'] = pd.to_datetime(distilroberta_pred['week_date'])

In [None]:
# encode price_direction
label_mapping = {0: 'negative', 1: 'positive'}
distilroberta_pred['prediction'] = distilroberta_pred['prediction'].map(label_mapping)
distilroberta_pred['actual'] = distilroberta_pred['actual'].map(label_mapping)

In [None]:
# merge to get return data
distilroberta_ret = pd.merge(distilroberta_pred, test_df_weekly, how='left', left_on=['company', 'week_date'], right_on=['permco', 'start_date'])
distilroberta_ret.drop(['permco', 'start_date'], axis=1, inplace=True)

In [None]:
# merge to get market cap data
distilroberta_ret = pd.merge(distilroberta_ret, marketcap_df, how='left', left_on=['company', 'week_date'], right_on=['permco', 'week_start_date'])
distilroberta_ret.drop(['permco', 'week_start_date'], axis=1, inplace=True)

In [None]:
# merge to get market (S&P 500) return
distilroberta_df = pd.merge(distilroberta_ret, spx500_df, left_on='week_date', right_on='caldt')
distilroberta_df.drop('caldt', axis=1, inplace=True)

# sort
distilroberta_df.sort_values(by=['company', 'week_date'], inplace=True)
distilroberta_df.reset_index(drop=True, inplace=True)

In [None]:
cumulative_log_returns_by_date = {
    'date': [],
    'cum_EL_return': [],
    'cum_ES_return': [],
    'cum_ELS_return': [],
    'cum_VL_return': [],
    'cum_VS_return': [],
    'cum_VLS_return': [],
    'cum_market_return': []
}

# group by week_date and company to get the maximum probabilities for each company in a week
grouped_max_neg = distilroberta_df.loc[distilroberta_df.groupby(['week_date', 'company'])['probability_neg'].idxmax()]
grouped_max_pos = distilroberta_df.loc[distilroberta_df.groupby(['week_date', 'company'])['probability_pos'].idxmax()]

# group by week_date
grouped_neg = grouped_max_neg.groupby('week_date')
grouped_pos = grouped_max_pos.groupby('week_date')

# initialise cumulative log returns
cum_EL_return = 0
cum_ES_return = 0
cum_ELS_return = 0
cum_VL_return = 0
cum_VS_return = 0
cum_VLS_return = 0
cum_market_return = 0

# iterate over week
for week in distilroberta_df['week_date'].unique():
    group_neg = grouped_neg.get_group(week)
    group_pos = grouped_pos.get_group(week)

    # sort by probability for positive and negative predictions
    top_positive = group_pos.sort_values(by='probability_pos', ascending=False).head(5)
    top_negative = group_neg.sort_values(by='probability_neg', ascending=False).head(5)

    # calculate log returns
    top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])
    top_negative['log_return'] = np.log1p(top_negative['weekly_ret'])
    distilroberta_df.loc[distilroberta_df['week_date'] == week, 'log_return'] = np.log1p(distilroberta_df.loc[distilroberta_df['week_date'] == week, 'weekly_ret'])

    # Equal-Weighted Long Log Returns
    equal_long_log_return = top_positive['log_return'].mean()

    # Equal-Weighted Short Log Returns
    equal_short_log_return = top_negative['log_return'].mean() * -1  # Negate for short

    # Equal-Weighted Long-Short Log Returns
    equal_long_short_log_return = equal_long_log_return + equal_short_log_return

    # Value-Weighted Long Log Returns based on market cap
    total_market_cap_positive = top_positive['market_cap'].sum()
    value_long_log_return = (top_positive['log_return'] * top_positive['market_cap']).sum() / total_market_cap_positive

    # Value-Weighted Short Log Returns based on market cap
    total_market_cap_negative = top_negative['market_cap'].sum()
    value_short_log_return = (top_negative['log_return'] * top_negative['market_cap']).sum() / total_market_cap_negative * -1  # Negate for short

    # Value-Weighted Long-Short Log Returns
    value_long_short_log_return = value_long_log_return + value_short_log_return

    # Market Log Return (equal-weighted average of all assets in the group)
    market_log_return = distilroberta_df.loc[distilroberta_df['week_date'] == week, 'log_return'].mean()

    # update cumulative log returns
    cum_EL_return += equal_long_log_return
    cum_ES_return += equal_short_log_return
    cum_ELS_return += equal_long_short_log_return
    cum_VL_return += value_long_log_return
    cum_VS_return += value_short_log_return
    cum_VLS_return += value_long_short_log_return
    cum_market_return += market_log_return

    # append results for this date
    cumulative_log_returns_by_date['date'].append(week)
    cumulative_log_returns_by_date['cum_EL_return'].append(cum_EL_return)
    cumulative_log_returns_by_date['cum_ES_return'].append(cum_ES_return)
    cumulative_log_returns_by_date['cum_ELS_return'].append(cum_ELS_return)
    cumulative_log_returns_by_date['cum_VL_return'].append(cum_VL_return)
    cumulative_log_returns_by_date['cum_VS_return'].append(cum_VS_return)
    cumulative_log_returns_by_date['cum_VLS_return'].append(cum_VLS_return)
    cumulative_log_returns_by_date['cum_market_return'].append(cum_market_return)

# convert to dataframe
cumulative_log_returns_distilroberta = pd.DataFrame(cumulative_log_returns_by_date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_negative['log_return'] = np.log1p(top_negative['weekly_ret'])


In [None]:
# define path to save results
path_distilroberta = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune/distilroberta_portfolio.csv'

# save to csv
cumulative_log_returns_distilroberta.to_csv(path_distilroberta, index=False)

## **Sharpe Ratio**

In [None]:
# calculate weekly log returns from cumulative log returns
log_returns = cumulative_log_returns_distilroberta.set_index('date').diff().dropna()

# define function to calculate Sharpe Ratio
def calculate_sharpe_ratio(return_series, risk_free_rate=0):
    mean_return = return_series.mean()
    std_return = return_series.std()
    excess_return = mean_return - risk_free_rate
    sharpe_ratio = (excess_return / std_return) * np.sqrt(52)
    return mean_return, std_return, sharpe_ratio

sharpe_ratios = []

for column in log_returns.columns:
    mean_return, std_return, sharpe_ratio = calculate_sharpe_ratio(log_returns[column])
    sharpe_ratios.append({
        'Portfolio': column,
        'Mean Return': mean_return,
        'Standard Deviation': std_return,
        'Sharpe Ratio': sharpe_ratio
    })

# convert to dataframe
sharpe_ratios_df = pd.DataFrame(sharpe_ratios)

In [None]:
sharpe_ratios_df

Unnamed: 0,Portfolio,Mean Return,Standard Deviation,Sharpe Ratio
0,cum_EL_return,0.002538,0.025772,0.710122
1,cum_ES_return,-0.003254,0.024675,-0.951084
2,cum_ELS_return,-0.000716,0.016094,-0.321005
3,cum_VL_return,0.002642,0.025827,0.737718
4,cum_VS_return,-0.003188,0.025178,-0.913091
5,cum_VLS_return,-0.000546,0.015402,-0.255575
6,cum_market_return,0.002172,0.023843,0.656927


In [None]:
# disconnect run time
runtime.unassign()

# **FinBERT**

## **Portfolio**

In [None]:
# load finbert's accuracy of rolling window prediction
finbert_path_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_accuracy.csv'
finbert_accuracy = pd.read_csv(finbert_path_accuracy)

# load finbert's prediction of rolling window prediction
finbert_path_pred = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_prediction.csv'
finbert_pred = pd.read_csv(finbert_path_pred)

#change to datetime
finbert_pred['week_date'] = pd.to_datetime(finbert_pred['week_date'])

In [None]:
# encode price_direction
label_mapping = {0: 'negative', 1: 'positive'}
finbert_pred['prediction'] = finbert_pred['prediction'].map(label_mapping)
finbert_pred['actual'] = finbert_pred['actual'].map(label_mapping)

In [None]:
# merge to get return data
finbert_ret = pd.merge(finbert_pred, test_df_weekly, how='left', left_on=['company', 'week_date'], right_on=['permco', 'start_date'])
finbert_ret.drop(['permco', 'start_date'], axis=1, inplace=True)

In [None]:
# merge to get market cap data
finbert_ret = pd.merge(finbert_ret, marketcap_df, how='left', left_on=['company', 'week_date'], right_on=['permco', 'week_start_date'])
finbert_ret.drop(['permco', 'week_start_date'], axis=1, inplace=True)

In [None]:
# merge to get market (S&P 500) return
finbert_df = pd.merge(finbert_ret, spx500_df, left_on='week_date', right_on='caldt')
finbert_df.drop('caldt', axis=1, inplace=True)

# sort
finbert_df.sort_values(by=['company', 'week_date'], inplace=True)
finbert_df.reset_index(drop=True, inplace=True)

In [None]:
cumulative_log_returns_by_date = {
    'date': [],
    'cum_EL_return': [],
    'cum_ES_return': [],
    'cum_ELS_return': [],
    'cum_VL_return': [],
    'cum_VS_return': [],
    'cum_VLS_return': [],
    'cum_market_return': []
}

# group by week_date and company to get the maximum probabilities for each company in a week
grouped_max_neg = finbert_df.loc[finbert_df.groupby(['week_date', 'company'])['probability_neg'].idxmax()]
grouped_max_pos = finbert_df.loc[finbert_df.groupby(['week_date', 'company'])['probability_pos'].idxmax()]

# group by week_date
grouped_neg = grouped_max_neg.groupby('week_date')
grouped_pos = grouped_max_pos.groupby('week_date')

# initialise cumulative log returns
cum_EL_return = 0
cum_ES_return = 0
cum_ELS_return = 0
cum_VL_return = 0
cum_VS_return = 0
cum_VLS_return = 0
cum_market_return = 0

# iterate over week
for week in finbert_df['week_date'].unique():
    group_neg = grouped_neg.get_group(week)
    group_pos = grouped_pos.get_group(week)

    # sort by probability for positive and negative predictions
    top_positive = group_pos.sort_values(by='probability_pos', ascending=False).head(5)
    top_negative = group_neg.sort_values(by='probability_neg', ascending=False).head(5)

    # calculate log returns
    top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])
    top_negative['log_return'] = np.log1p(top_negative['weekly_ret'])
    finbert_df.loc[finbert_df['week_date'] == week, 'log_return'] = np.log1p(finbert_df.loc[finbert_df['week_date'] == week, 'weekly_ret'])

    # Equal-Weighted Long Log Returns
    equal_long_log_return = top_positive['log_return'].mean()

    # Equal-Weighted Short Log Returns
    equal_short_log_return = top_negative['log_return'].mean() * -1  # Negate for short

    # Equal-Weighted Long-Short Log Returns
    equal_long_short_log_return = equal_long_log_return + equal_short_log_return

    # Value-Weighted Long Log Returns based on market cap
    total_market_cap_positive = top_positive['market_cap'].sum()
    value_long_log_return = (top_positive['log_return'] * top_positive['market_cap']).sum() / total_market_cap_positive

    # Value-Weighted Short Log Returns based on market cap
    total_market_cap_negative = top_negative['market_cap'].sum()
    value_short_log_return = (top_negative['log_return'] * top_negative['market_cap']).sum() / total_market_cap_negative * -1  # Negate for short

    # Value-Weighted Long-Short Log Returns
    value_long_short_log_return = value_long_log_return + value_short_log_return

    # Market Log Return (equal-weighted average of all assets in the group)
    market_log_return = finbert_df.loc[finbert_df['week_date'] == week, 'log_return'].mean()

    # update cumulative log returns
    cum_EL_return += equal_long_log_return
    cum_ES_return += equal_short_log_return
    cum_ELS_return += equal_long_short_log_return
    cum_VL_return += value_long_log_return
    cum_VS_return += value_short_log_return
    cum_VLS_return += value_long_short_log_return
    cum_market_return += market_log_return

    # append results for this date
    cumulative_log_returns_by_date['date'].append(week)
    cumulative_log_returns_by_date['cum_EL_return'].append(cum_EL_return)
    cumulative_log_returns_by_date['cum_ES_return'].append(cum_ES_return)
    cumulative_log_returns_by_date['cum_ELS_return'].append(cum_ELS_return)
    cumulative_log_returns_by_date['cum_VL_return'].append(cum_VL_return)
    cumulative_log_returns_by_date['cum_VS_return'].append(cum_VS_return)
    cumulative_log_returns_by_date['cum_VLS_return'].append(cum_VLS_return)
    cumulative_log_returns_by_date['cum_market_return'].append(cum_market_return)

# convert to dataframe
cumulative_log_returns_finbert = pd.DataFrame(cumulative_log_returns_by_date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_positive['log_return'] = np.log1p(top_positive['weekly_ret'])


In [None]:
# define path to save results
path_finbert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune/finbert_portfolio.csv'

# save to csv
cumulative_log_returns_finbert.to_csv(path_finbert, index=False)

## **Sharpe Ratio**

In [None]:
# calculate weekly log returns from cumulative log returns
log_returns = cumulative_log_returns_finbert.set_index('date').diff().dropna()

# define function to calculate Sharpe Ratio
def calculate_sharpe_ratio(return_series, risk_free_rate=0):
    mean_return = return_series.mean()
    std_return = return_series.std()
    excess_return = mean_return - risk_free_rate
    sharpe_ratio = (excess_return / std_return) * np.sqrt(52)
    return mean_return, std_return, sharpe_ratio

sharpe_ratios = []

for column in log_returns.columns:
    mean_return, std_return, sharpe_ratio = calculate_sharpe_ratio(log_returns[column])
    sharpe_ratios.append({
        'Portfolio': column,
        'Mean Return': mean_return,
        'Standard Deviation': std_return,
        'Sharpe Ratio': sharpe_ratio
    })

# convert to dataframe
sharpe_ratios_df = pd.DataFrame(sharpe_ratios)

In [None]:
sharpe_ratios_df

Unnamed: 0,Portfolio,Mean Return,Standard Deviation,Sharpe Ratio
0,cum_EL_return,0.003132,0.027305,0.827072
1,cum_ES_return,-0.002908,0.026717,-0.784765
2,cum_ELS_return,0.000224,0.017776,0.090934
3,cum_VL_return,0.001942,0.025227,0.555131
4,cum_VS_return,-0.0028,0.025764,-0.783778
5,cum_VLS_return,-0.000858,0.016972,-0.364668
6,cum_market_return,0.002172,0.023843,0.656927


In [None]:
# disconnect run time
runtime.unassign()