# **IMPORT LIBRARIES**

In [None]:
#import libraries
import pandas as pd
import numpy as np
import spacy
import json
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import plotly.express as px
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from collections import Counter
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots
# initialise plotly's offline notebook mode (connected=False)
init_notebook_mode(connected=False)
# notebook-wide plotly defaults: Colab renderer and the white template
pio.renderers.default = 'colab'
pio.templates.default = 'plotly_white'
from google.colab import files
from wordcloud import WordCloud

In [None]:
# mount Google Drive at /content/drive; every CSV path used below lives
# under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

# **ACCURACY**

## **Pre-Trained vs Fine-Tuned**

In [None]:
# load the rolling-window accuracy CSV of each pre-trained model
# (the cells below read the 'test_period' and 'test_accuracy' columns)
bert_pretrained_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_accuracy.csv')
roberta_pretrained_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_accuracy.csv')
distilbert_pretrained_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_accuracy.csv')
distilroberta_pretrained_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_accuracy.csv')
finbert_pretrained_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_accuracy.csv')

In [None]:
# load the rolling-window accuracy CSV of each fine-tuned model
# (the cells below read the 'test_period' and 'test_accuracy' columns)
bert_finetuned_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_accuracy.csv')
roberta_finetuned_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_accuracy.csv')
distilbert_finetuned_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_accuracy.csv')
distilroberta_finetuned_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_accuracy.csv')
finbert_finetuned_acc = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_accuracy.csv')

In [None]:
# (model name, tuning variant, accuracy dataframe) triples to summarise.
# The list has its own name so the loop below no longer rebinds the very
# variable it is iterating over.
model_frames = [
    ('BERT', 'Pre-trained', bert_pretrained_acc),
    ('BERT', 'Fine-Tuned', bert_finetuned_acc),
    ('RoBERTa', 'Pre-trained', roberta_pretrained_acc),
    ('RoBERTa', 'Fine-Tuned', roberta_finetuned_acc),
    ('DistilBERT', 'Pre-trained', distilbert_pretrained_acc),
    ('DistilBERT', 'Fine-Tuned', distilbert_finetuned_acc),
    ('DistilRoBERTa', 'Pre-trained', distilroberta_pretrained_acc),
    ('DistilRoBERTa', 'Fine-Tuned', distilroberta_finetuned_acc),
    ('FinBERT', 'Pre-trained', finbert_pretrained_acc),
    ('FinBERT', 'Fine-Tuned', finbert_finetuned_acc),
]

# list to store results
results = []

# one result row per model / tuning / test period, holding the mean
# 'test_accuracy' of that period
for model, tuning, acc_df in model_frames:
    period_means = acc_df.groupby('test_period')['test_accuracy'].mean()
    results.extend(
        {'Model': model, 'Tuning': tuning, 'Accuracy': accuracy}
        for accuracy in period_means
    )

# convert results to a dataframe
results_df = pd.DataFrame(results)

In [None]:
# (model name, tuning variant, accuracy dataframe) triples to summarise.
# The list has its own name so the loop below no longer rebinds the very
# variable it is iterating over.
model_frames = [
    ('BERT', 'Pre-trained', bert_pretrained_acc),
    ('BERT', 'Fine-Tuned', bert_finetuned_acc),
    ('RoBERTa', 'Pre-trained', roberta_pretrained_acc),
    ('RoBERTa', 'Fine-Tuned', roberta_finetuned_acc),
    ('DistilBERT', 'Pre-trained', distilbert_pretrained_acc),
    ('DistilBERT', 'Fine-Tuned', distilbert_finetuned_acc),
    ('DistilRoBERTa', 'Pre-trained', distilroberta_pretrained_acc),
    ('DistilRoBERTa', 'Fine-Tuned', distilroberta_finetuned_acc),
    ('FinBERT', 'Pre-trained', finbert_pretrained_acc),
    ('FinBERT', 'Fine-Tuned', finbert_finetuned_acc),
]

# one result row per model / tuning pair: the mean of every 'test_accuracy'
# value in that frame
results = [
    {'Model': model, 'Tuning': tuning, 'Accuracy': acc_df['test_accuracy'].mean()}
    for model, tuning, acc_df in model_frames
]

# convert results to dataframe
results_df = pd.DataFrame(results)

In [None]:
# palette used for the two tuning variants
custom_colors = ['#ACCBF9', '#4A66AC']

# grouped bars: one group per model, one bar per tuning variant, with the
# accuracy value attached to each bar as text
bar_options = dict(
    x='Model',
    y='Accuracy',
    color='Tuning',
    barmode='group',
    title='Model Accuracy: Pre-Trained vs Fine-Tuned',
    labels={'Accuracy': 'Accuracy', 'Model': 'Model'},
    color_discrete_sequence=custom_colors,
    height=600,
    width=1000,
    text='Accuracy',
)
fig = px.bar(results_df, **bar_options)

# show the bar labels above the bars with three decimals
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside', textfont_size=12)

# restrict the y-axis to 0.45-0.55 with a tick every 0.01
fig.update_yaxes(
    title_text='Accuracy',
    range=[0.45, 0.55],
    ticks='outside',
    tick0=0.52,
    dtick=0.01,
    showgrid=True
)

# dashed red reference line at accuracy 0.5, spanning all model groups
model_count = len(results_df['Model'].unique())
reference_line = dict(color='Red', width=2, dash='dash')
fig.add_shape(
    type='line',
    x0=-0.5, x1=model_count-0.5,
    y0=0.5, y1=0.5,
    line=reference_line
)

# show the figure
fig.show()

# **YEARLY ACCURACY**

## **Pre-Trained**

In [None]:
# load the pre-trained rolling-window accuracy CSVs again (the same files
# that were read into the *_pretrained_acc frames earlier) under new names
bert_rolling_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_accuracy.csv')
roberta_rolling_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_accuracy.csv')
distilbert_rolling_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_accuracy.csv')
distilroberta_rolling_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_accuracy.csv')
finbert_rolling_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_accuracy.csv')

In [None]:
# tag every frame with a 'model' column so its rows stay identifiable once
# the five frames are stacked together
(
    bert_rolling_pretrained,
    roberta_rolling_pretrained,
    distilbert_rolling_pretrained,
    distilroberta_rolling_pretrained,
    finbert_rolling_pretrained,
) = (
    frame.assign(model=name)
    for name, frame in (
        ('bert', bert_rolling_pretrained),
        ('roberta', roberta_rolling_pretrained),
        ('distilbert', distilbert_rolling_pretrained),
        ('distilroberta', distilroberta_rolling_pretrained),
        ('finbert', finbert_rolling_pretrained),
    )
)

In [None]:
# stack the five tagged frames row-wise into one long frame
pretrained_frames = [
    bert_rolling_pretrained,
    roberta_rolling_pretrained,
    distilbert_rolling_pretrained,
    distilroberta_rolling_pretrained,
    finbert_rolling_pretrained,
]
total_rolling_pretrained = pd.concat(pretrained_frames, axis=0)

In [None]:
# mean and standard deviation of 'test_accuracy' for every
# (model, test_period) pair; named aggregation yields the flat column names
# 'model', 'test_period', 'accuracy_mean', 'accuracy_std' directly
accuracy_by_model_period = total_rolling_pretrained.groupby(['model', 'test_period'])['test_accuracy']
df_grouped = accuracy_by_model_period.agg(
    accuracy_mean='mean',
    accuracy_std='std',
).reset_index()

In [None]:
# plotly's default qualitative palette
colors = px.colors.qualitative.Plotly

# assign each model a palette colour by order of first appearance,
# wrapping around if there are more models than colours
color_dict = {}
for position, model_name in enumerate(df_grouped['model'].unique()):
    color_dict[model_name] = colors[position % len(colors)]

# helper used to draw the shaded bands in a faded version of the line colour
def get_transparent_color(color, alpha=0.2):
    """Return a hex colour as an ``rgba(r, g, b, alpha)`` string.

    ``color`` is read as a hex string: leading ``#`` characters are stripped
    and the first three pairs of hex digits become the red, green and blue
    components. ``alpha`` is inserted unchanged as the fourth component.
    """
    digits = color.lstrip('#')
    red = int(digits[0:2], 16)
    green = int(digits[2:4], 16)
    blue = int(digits[4:6], 16)
    return f'rgba({red}, {green}, {blue}, {alpha})'

# empty figure; two traces (mean line + std band) are added per model
fig = go.Figure()

# models to draw, in order of first appearance
models = df_grouped['model'].unique()

for model in models:
    # rows and colour belonging to this model
    df_model = df_grouped[df_grouped['model'] == model]
    color = color_dict[model]

    periods = df_model['test_period']
    mean_accuracy = df_model['accuracy_mean']
    std_accuracy = df_model['accuracy_std']

    # mean accuracy per test period
    mean_trace = go.Scatter(
        x=periods,
        y=mean_accuracy,
        mode='lines',
        name=f'{model}',
        line=dict(color=color, width=2)
    )
    fig.add_trace(mean_trace)

    # +/- one standard deviation band: the upper edge followed by the lower
    # edge in reverse order, so that 'toself' fills the area between them
    band_x = pd.concat([periods, periods[::-1]])
    band_y = pd.concat([mean_accuracy + std_accuracy,
                        (mean_accuracy - std_accuracy)[::-1]])
    band_trace = go.Scatter(
        x=band_x,
        y=band_y,
        fill='toself',
        fillcolor=get_transparent_color(color, alpha=0.2),  # same colour, mostly transparent
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False,
        name=f'{model} std dev'
    )
    fig.add_trace(band_trace)

# titles, size and legend
fig.update_layout(
    title='Rolling Window Accuracy (Average)',
    xaxis_title='Test Period (Year)',
    yaxis_title='Accuracy',
    template='plotly_white',
    width=900,
    height=700,
    showlegend=True
)

# fixed y-axis window
fig.update_yaxes(range=[.35, .7])

# dashed red reference line at accuracy 0.5 from x=2016 to x=2023
reference_line = dict(color='Red', width=2, dash='dash')
fig.add_shape(
    type='line',
    x0=2016, x1=2023,
    y0=0.5, y1=0.5,
    line=reference_line
)

# show the plot
fig.show()

## **Fine-Tuned**

In [None]:
# load the fine-tuned rolling-window accuracy CSVs again (the same files
# that were read into the *_finetuned_acc frames earlier) under new names
bert_rolling_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_accuracy.csv')
roberta_rolling_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_accuracy.csv')
distilbert_rolling_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_accuracy.csv')
distilroberta_rolling_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_accuracy.csv')
finbert_rolling_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_accuracy.csv')

In [None]:
# tag every frame with a 'model' column so its rows stay identifiable once
# the five frames are stacked together
(
    bert_rolling_finetuned,
    roberta_rolling_finetuned,
    distilbert_rolling_finetuned,
    distilroberta_rolling_finetuned,
    finbert_rolling_finetuned,
) = (
    frame.assign(model=name)
    for name, frame in (
        ('bert', bert_rolling_finetuned),
        ('roberta', roberta_rolling_finetuned),
        ('distilbert', distilbert_rolling_finetuned),
        ('distilroberta', distilroberta_rolling_finetuned),
        ('finbert', finbert_rolling_finetuned),
    )
)

In [None]:
# stack the five tagged frames row-wise into one long frame
finetuned_frames = [
    bert_rolling_finetuned,
    roberta_rolling_finetuned,
    distilbert_rolling_finetuned,
    distilroberta_rolling_finetuned,
    finbert_rolling_finetuned,
]
total_rolling_finetuned = pd.concat(finetuned_frames, axis=0)

In [None]:
# mean and standard deviation of 'test_accuracy' for every
# (model, test_period) pair; named aggregation yields the flat column names
# 'model', 'test_period', 'accuracy_mean', 'accuracy_std' directly
accuracy_by_model_period = total_rolling_finetuned.groupby(['model', 'test_period'])['test_accuracy']
df_grouped = accuracy_by_model_period.agg(
    accuracy_mean='mean',
    accuracy_std='std',
).reset_index()

In [None]:
# plotly's default qualitative palette
colors = px.colors.qualitative.Plotly

# assign each model a palette colour by order of first appearance,
# wrapping around if there are more models than colours
color_dict = {}
for position, model_name in enumerate(df_grouped['model'].unique()):
    color_dict[model_name] = colors[position % len(colors)]

# helper used to draw the shaded bands in a faded version of the line colour
def get_transparent_color(color, alpha=0.2):
    """Return a hex colour as an ``rgba(r, g, b, alpha)`` string.

    ``color`` is read as a hex string: leading ``#`` characters are stripped
    and the first three pairs of hex digits become the red, green and blue
    components. ``alpha`` is inserted unchanged as the fourth component.
    """
    digits = color.lstrip('#')
    channels = [int(digits[start:start + 2], 16) for start in (0, 2, 4)]
    red, green, blue = channels
    return f'rgba({red}, {green}, {blue}, {alpha})'

# empty figure; two traces (mean line + std band) are added per model
fig = go.Figure()

# models to draw, in order of first appearance
models = df_grouped['model'].unique()

for model in models:
    # rows and colour belonging to this model
    df_model = df_grouped[df_grouped['model'] == model]
    color = color_dict[model]

    periods = df_model['test_period']
    mean_accuracy = df_model['accuracy_mean']
    std_accuracy = df_model['accuracy_std']

    # mean accuracy per test period
    mean_trace = go.Scatter(
        x=periods,
        y=mean_accuracy,
        mode='lines',
        name=f'{model}',
        line=dict(color=color, width=2)
    )
    fig.add_trace(mean_trace)

    # +/- one standard deviation band: the upper edge followed by the lower
    # edge in reverse order, so that 'toself' fills the area between them
    band_x = pd.concat([periods, periods[::-1]])
    band_y = pd.concat([mean_accuracy + std_accuracy,
                        (mean_accuracy - std_accuracy)[::-1]])
    band_trace = go.Scatter(
        x=band_x,
        y=band_y,
        fill='toself',
        fillcolor=get_transparent_color(color, alpha=0.2),  # same colour, mostly transparent
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False,
        name=f'{model} std dev'
    )
    fig.add_trace(band_trace)

# titles, size and legend
fig.update_layout(
    title='Rolling Window Accuracy (Average)',
    xaxis_title='Test Period (Year)',
    yaxis_title='Accuracy',
    template='plotly_white',
    width=900,
    height=700,
    showlegend=True
)

# fixed y-axis window
fig.update_yaxes(range=[.35, .7])

# dashed red reference line at accuracy 0.5 from x=2016 to x=2023
reference_line = dict(color='Red', width=2, dash='dash')
fig.add_shape(
    type='line',
    x0=2016, x1=2023,
    y0=0.5, y1=0.5,
    line=reference_line
)

# show the plot
fig.show()

# **CONFUSION MATRIX**

## **Pre-Trained**

In [None]:
# load the rolling-window prediction CSV of each pre-trained model
# (the cells below read the 'prediction' and 'actual' columns)
bert_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_prediction.csv')
roberta_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_prediction.csv')
distilbert_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_prediction.csv')
distilroberta_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_prediction.csv')
finbert_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_prediction.csv')

### **BERT**

In [None]:
# extract predictions and actual values
predictions = bert_pred_pretrained['prediction']
actuals = bert_pred_pretrained['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_bert_pred_pretrained = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_bert_pred_pretrained, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **RoBERTa**

In [None]:
# extract predictions and actual values
predictions = roberta_pred_pretrained['prediction']
actuals = roberta_pred_pretrained['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_roberta_pred_pretrained = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_roberta_pred_pretrained, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **DistilBERT**

In [None]:
# extract predictions and actual values
predictions = distilbert_pred_pretrained['prediction']
actuals = distilbert_pred_pretrained['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_distilbert_pred_pretrained = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_distilbert_pred_pretrained, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **DistilRoBERTa**

In [None]:
# extract predictions and actual values
predictions = distilroberta_pred_pretrained['prediction']
actuals = distilroberta_pred_pretrained['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_distilroberta_pred_pretrained = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_distilroberta_pred_pretrained, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **FinBERT**

In [None]:
# extract predictions and actual values
predictions = finbert_pred_pretrained['prediction']
actuals = finbert_pred_pretrained['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_finbert_pred_pretrained = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_finbert_pred_pretrained, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

## **Fine-Tuned**

In [None]:
# load the rolling-window prediction CSV of each fine-tuned model
# (the cells below read the 'prediction' and 'actual' columns)
bert_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_prediction.csv')
roberta_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_prediction.csv')
distilbert_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_prediction.csv')
distilroberta_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_prediction.csv')
finbert_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_prediction.csv')

In [None]:
# translate the numeric labels into the 'positive' / 'negative' strings that
# the confusion-matrix cells below look for
label_mapping = {1: 'positive', 0: 'negative'}

finetuned_prediction_frames = (
    bert_pred_finetuned,
    roberta_pred_finetuned,
    distilbert_pred_finetuned,
    distilroberta_pred_finetuned,
    finbert_pred_finetuned,
)

# values that are not keys of label_mapping become NaN under Series.map,
# so this cell should only be run once per load
for prediction_frame in finetuned_prediction_frames:
    for label_column in ('prediction', 'actual'):
        prediction_frame[label_column] = prediction_frame[label_column].map(label_mapping)

### **BERT**

In [None]:
# extract predictions and actual values
predictions = bert_pred_finetuned['prediction']
actuals = bert_pred_finetuned['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_bert_pred_finetuned = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_bert_pred_finetuned, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **RoBERTa**

In [None]:
# extract predictions and actual values
predictions = roberta_pred_finetuned['prediction']
actuals = roberta_pred_finetuned['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_roberta_pred_finetuned = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_roberta_pred_finetuned, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **DistilBERT**

In [None]:
# extract predictions and actual values
predictions = distilbert_pred_finetuned['prediction']
actuals = distilbert_pred_finetuned['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_distilbert_pred_finetuned = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_distilbert_pred_finetuned, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **DistilRoBERTa**

In [None]:
# extract predictions and actual values
predictions = distilroberta_pred_finetuned['prediction']
actuals = distilroberta_pred_finetuned['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_distilroberta_pred_finetuned = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_distilroberta_pred_finetuned, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **FinBERT**

In [None]:
# extract predictions and actual values
predictions = finbert_pred_finetuned['prediction']
actuals = finbert_pred_finetuned['actual']

# create the confusion matrix (rows = actual, columns = predicted).
# `confusion_matrix` is never imported in this notebook, so the counts are
# built with pandas instead: crosstab counts every (actual, predicted) pair,
# and reindex pins the positive/negative order and fills pairs that never
# occur with 0.
labels = ['positive', 'negative']
cm = (
    pd.crosstab(actuals, predictions)
    .reindex(index=labels, columns=labels, fill_value=0)
    .to_numpy()
)

# convert to dataframe
cm_finbert_pred_finetuned = pd.DataFrame(cm, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])

# plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_finbert_pred_finetuned, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# **EDA**

## **Market Cap**

In [None]:
# read the market-cap table
marketcap_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/market_cap.csv'
marketcap_df = pd.read_csv(marketcap_path)

# parse the week start dates so they can be grouped and plotted as dates
week_starts = pd.to_datetime(marketcap_df['week_start_date'])
marketcap_df['week_start_date'] = week_starts

# total market cap per week across all rows sharing a week start date
weekly_totals = marketcap_df.groupby('week_start_date')['market_cap'].sum()
marketcap_sum = weekly_totals.reset_index()

In [None]:
# line chart of the weekly market-cap totals
line_options = dict(
    x='week_start_date',
    y='market_cap',
    title='Top 25 Healthcare Sector Market Capitalisation',
    height=500,
    width=1000,
)
fig = px.line(marketcap_sum, **line_options)

# axis titles; x ticks show the year only
fig.update_yaxes(title_text='Market Capitalisation')
fig.update_xaxes(title_text='Date', tickformat='%Y')

fig.show()

## **Missing Values**

In [None]:
# load the CRSP train and test tables
crsp_train = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/crsp_train.csv')
crsp_test = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/crsp_test.csv')

In [None]:
# show every training row where 'prc' or 'ret' (or both) is missing
has_missing_price_or_return = crsp_train[['prc', 'ret']].isna().any(axis=1)
crsp_train[has_missing_price_or_return]

## **Number of News by Ticker**

In [None]:
# load the news (kd) train/test tables, the CRSP test table (re-read from
# the same file as in the Missing Values section) and the gvkey <-> lpermco
# linking table
kd_train = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/kd_train.csv')
kd_test = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/kd_test.csv')
crsp_test = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/crsp_test.csv')
linking = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/linking_table.csv')

In [None]:
# stack the train and test news rows into one frame with a fresh index
news_frames = [kd_train, kd_test]
final_kd = pd.concat(news_frames, axis=0, ignore_index=True)

# parse the announcement dates into datetimes
announce_dates = pd.to_datetime(final_kd['announcedate'])
final_kd['announcedate'] = announce_dates

In [None]:
# permco -> ticker pairs taken from the rows dated 2023-12-29
on_snapshot_date = crsp_test['date'] == '2023-12-29'
last_ticker = crsp_test.loc[on_snapshot_date, ['permco', 'ticker']]

In [None]:
# step 1: attach 'lpermco' to every news row via its gvkey
gvkey_to_permco = linking[['lpermco', 'gvkey']]
final_kd = final_kd.merge(gvkey_to_permco, on='gvkey', how='left')

# step 2: attach the ticker by matching lpermco against permco
final_kd = final_kd.merge(last_ticker, left_on='lpermco', right_on='permco', how='left')

In [None]:
# calendar year of each announcement
final_kd['year'] = final_kd['announcedate'].dt.year

# number of non-missing headlines per ticker and year
headlines_by_ticker_year = final_kd.groupby(['ticker', 'year'])['headline']
number_headline = headlines_by_ticker_year.count().reset_index()

In [None]:
# one line per ticker: yearly headline count
ticker_palette = px.colors.qualitative.Alphabet
fig = px.line(
    number_headline,
    x='year',
    y='headline',
    title='Number of News by Ticker',
    color='ticker',
    color_discrete_sequence=ticker_palette,
)

# horizontal legend anchored by its bottom-right corner at (1, 1)
legend_style = dict(
    x=1,
    y=1,
    xanchor='right',
    yanchor='bottom',
    orientation="h"
)

# size, axis titles, grid lines and a fixed 0-1450 y-range
fig.update_layout(
    height=700,
    width=700,
    xaxis_title='Date',
    yaxis_title= 'Number of News',
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=True, range=[0,1450]),
    legend=legend_style,
)
fig.show()

## **News Interval**

In [None]:
# collapse the news to one row per company (gvkey) and announcement date,
# keeping the last ticker seen in each group
news_by_company_date = final_kd.groupby(['gvkey', 'announcedate'])['ticker']
kd_group = news_by_company_date.last().reset_index()

In [None]:
# gap between consecutive announcement dates of the same company
gaps_within_company = kd_group.groupby('gvkey')['announcedate'].diff()
kd_group['date_diff'] = gaps_within_company

# longest and mean gap per ticker; `.dt.days` keeps only the whole-day
# component of each timedelta
gaps_by_ticker = kd_group.groupby('ticker')['date_diff']
max_interval_train = gaps_by_ticker.max().dt.days.reset_index()
avg_interval_train = gaps_by_ticker.mean().dt.days.reset_index()

In [None]:
# mean, across tickers, of the per-ticker average news interval
# (the 'date_diff' column of avg_interval_train, in days)
average_date_diff = avg_interval_train['date_diff'].mean()

In [None]:
# one bar per ticker showing its average news interval, labelled with the value
ticker_palette = px.colors.qualitative.Alphabet
fig = px.bar(
    avg_interval_train,
    x='ticker',
    y='date_diff',
    title='Average News Interval',
    text='date_diff',
    color='ticker',
    color_discrete_sequence=ticker_palette,
)
fig.update_traces(textposition='outside')

# horizontal legend anchored by its bottom-right corner at (1, 0.8)
legend_style = dict(
    x=1,
    y=.8,
    xanchor='right',
    yanchor='bottom',
    orientation="h"
)
fig.update_layout(
    height=700,
    width=700,
    xaxis_title='Ticker',
    yaxis_title='News Interval (days)',
    yaxis=dict(range=[0, 20]),
    legend=legend_style,
)

# dashed red line at the overall average, running from just before the
# first bar to just after the last one
ticker_count = len(avg_interval_train)
fig.add_shape(
    type='line',
    x0=-0.5,
    x1=ticker_count - 0.5,
    y0=average_date_diff,
    y1=average_date_diff,
    line=dict(color='red', width=2, dash='dash'),
    xref='x',
    yref='y'
)

# label for the average line, placed to the right of the last bar
average_label = f'<b>Average:<br>{average_date_diff:.2f} days'
fig.add_annotation(
    x=ticker_count+1.5,
    y=average_date_diff,
    text=average_label,
    showarrow=False,
    yshift=10
)

fig.show()

## **Summary Statistics**

In [None]:
# load the prepared train and test datasets
train_df = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/train_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv')

In [None]:
# stack train and test rows into one frame with a fresh index
dataset_frames = [train_df, test_df]
final_df = pd.concat(dataset_frames, axis=0, ignore_index=True)

# one row per company (permco) and week start date, keeping the first
# weekly return found in each group
weekly_returns = final_df.groupby(['permco', 'start_date'])['weekly_ret']
final_grouped = weekly_returns.first().reset_index()

In [None]:
# build one text field per row, with [SEP] tokens between the source columns
def concatenate_columns(row):
    """Join a row's headline, situation and eventtype with ``[SEP]`` separators.

    ``row`` is indexed by the keys 'headline', 'situation' and 'eventtype';
    the three values are formatted into a single string in that order.
    """
    headline = row['headline']
    situation = row['situation']
    eventtype = row['eventtype']
    return f"{headline} [SEP] {situation} [SEP] {eventtype}"

# build 'combined_text' row by row: headline, situation and eventtype
# joined by concatenate_columns
final_df['combined_text'] = final_df.apply(concatenate_columns, axis=1)

In [None]:
# attach the ticker to each (permco, week) row; the left join keeps rows
# whose permco has no entry in last_ticker
final_grouped = final_grouped.merge(last_ticker, on='permco', how='left')

In [None]:
# display the describe() summary statistics of the weekly return,
# one row per ticker
final_grouped.groupby('ticker')['weekly_ret'].describe().reset_index()

## **Word Cloud**

In [None]:
# reload the train/test datasets (same files as in the Summary Statistics
# section) for the word-cloud analysis
train_df = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/train_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv')

# CRSP training table; its 'ticker' and 'comnam' columns are used below to
# build the list of company-related tokens to drop
crsp_train = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/crsp_train.csv')

In [None]:
# stack train and test rows into one frame with a fresh index
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [None]:
# load the spaCy pipeline "en_core_web_sm"; `nlp` is used by the
# tokenisation cells below
nlp = spacy.load("en_core_web_sm")

In [None]:
# remove the rows whose headline is the 'no_headlines' placeholder.
# .copy() makes the filtered frame independent of the original, so the
# 'tokens' column assigned to it later does not raise SettingWithCopyWarning
all_df = all_df[all_df['headline'] != 'no_headlines'].copy()

In [None]:
# tokenizer applied to every headline
def tokenize(text):
    """Return the lower-cased lemmas of ``text`` without stop words or punctuation.

    ``text`` is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words or punctuation are skipped and the remaining
    tokens contribute ``lemma_.lower()``, in document order.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return kept_lemmas

# tokenize every headline; each 'tokens' entry holds the list returned by
# tokenize for that row
all_df['tokens'] = all_df['headline'].apply(tokenize)

In [None]:
# remove company related tokens (company name and ticker).
# missing tickers/names are dropped first so that only real strings are
# handed to the spaCy pipeline
coy_text = list(np.concatenate((crsp_train['ticker'].dropna().unique(),
                                crsp_train['comnam'].dropna().unique())))

# tokenize unwanted tokens
tokens_to_remove = [token.text.lower() for s in coy_text for token in nlp(s)]
tokens_to_remove.extend(['company', 'incorporation', 'corporation', 'incorporated', 'inc.', 'corp.', 'co.', 'cvs', 'hboc', 'idexx']) #add more tokens to be excluded

# frozen snapshot of the list for constant-time membership tests; rebuild
# it if tokens_to_remove is changed after this point
_tokens_to_remove_lookup = frozenset(tokens_to_remove)

# function to filter out unwanted tokens
def filter_tokens(tokens):
    """Return ``tokens`` without the entries listed in ``tokens_to_remove``.

    Order and duplicates of the remaining tokens are preserved.
    """
    return [token for token in tokens if token not in _tokens_to_remove_lookup]

# apply the function to the 'tokens' column
all_df['tokens'] = all_df['tokens'].apply(filter_tokens)

In [None]:
# flatten a group of token lists into one string for the word cloud
def join_tokens(token_list):
    """Join an iterable of token lists into one space-separated string.

    Each inner list is joined with spaces first, and the resulting pieces
    are then joined with spaces again.
    """
    return ' '.join(map(' '.join, token_list))

# one space-separated token string per ticker
# (despite the "_train" suffix this is built from all_df, i.e. train and test rows together)
stock_headline_train = all_df.groupby('ticker')['tokens'].agg(join_tokens)

In [None]:
# set up the matplotlib figure and axes
# (13 rows x 2 columns = 26 panels, one panel per ticker)
fig, axs = plt.subplots(figsize=(50, 80), nrows=13, ncols=2)
axs = axs.flatten()  #flatten the array of axes

# generate word cloud for each company's headlines
# (zip stops at the shorter input, so tickers beyond the 26th panel are not drawn)
for ax, (ticker, headline) in zip(axs, stock_headline_train.items()):
    wordcloud = WordCloud(width=500, height=400, background_color='white').generate(headline)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Ticker: {ticker}', fontsize=50)

# hide any unused axes (if stock_headline_train has fewer than 26 items)
for ax in axs[len(stock_headline_train):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# flatten list of tokens
# (each row of all_df['tokens'] is a list; this concatenates them into one flat list)
all_tokens_train = [token for sublist in all_df['tokens'] for token in sublist]

# convert list of tokens to a single string
# (WordCloud.generate() below is given one text string)
all_tokens_train_str = ' '.join(all_tokens_train)

# generate word cloud
wordcloud = WordCloud(width=500, height=250, background_color='white').generate(all_tokens_train_str)

# plot word cloud
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Word Cloud for All Ticker', fontsize=20)

plt.show()

## **Top Tokens**

In [None]:
#  flatten list of tokens (all rows of all_df, in row order)
all_tokens_train = []
for row_tokens in all_df['tokens']:
    all_tokens_train.extend(row_tokens)

#top 50 tokens as (token, count) pairs, most frequent first
token_counts_train = Counter(all_tokens_train)
top_50_tokens_train = token_counts_train.most_common(50)

In [None]:
# print out Top 50 tokens of all tickers, one "token: count" line each
print("Top 50 Tokens:")
for word, freq in top_50_tokens_train:
    print(word, freq, sep=": ")

In [None]:
# grouping the tokens for each ticker
# (summing a column of lists concatenates them, giving one combined token list per ticker)
# NOTE(review): list '+' copies the growing list on every addition, so this may be slow for large groups
stock_token_train = all_df.groupby('ticker')['tokens'].sum()

In [None]:
# top15 tokens for each ticker: {ticker: [(token, count), ...]} with the most frequent token first
top_tokens_per_company_train = {
    ticker: Counter(tokens).most_common(15)
    for ticker, tokens in stock_token_train.items()
}

In [None]:
# print out: a "Ticker X:" header, its "token: count" lines, then blank lines as separator
for symbol in top_tokens_per_company_train:
    print(f"Ticker {symbol}:")
    for word, freq in top_tokens_per_company_train[symbol]:
        print(word, freq, sep=": ")
    print("\n")

## **Data Noise**

In [None]:
# load data
# (re-reads the test set from disk; this replaces the test_df variable loaded for the word clouds)
test_df = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv')

In [None]:
# format to datetime
# (required for the .dt.year filter in the next cell)
test_df['start_date'] = pd.to_datetime(test_df['start_date'])

In [None]:
# display noisy data in 2022
# (rows from 2022 with a negative price direction whose event type is one of the two listed below)
in_2022 = test_df['start_date'].dt.year == 2022
price_negative = test_df['price_direction'] == 'negative'
event_selected = test_df['eventtype'].isin(['Business Expansions', 'Announcements of Earnings'])
test_df[in_2022 & price_negative & event_selected]

## **Headline Analysis Data Inconsistency (FinBERT)**

In [None]:
# load data
# (prediction files of the five pre-trained models, one CSV per model)
bert_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_prediction.csv')
roberta_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_prediction.csv')
distilbert_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_prediction.csv')
distilroberta_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_prediction.csv')
finbert_pred_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_prediction.csv')

In [None]:
# load data
# (prediction files of the five fine-tuned models; their 'prediction' and 'actual'
#  columns are mapped from 1/0 to 'positive'/'negative' in the next cell)
bert_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_prediction.csv')
roberta_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_prediction.csv')
distilbert_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_prediction.csv')
distilroberta_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_prediction.csv')
finbert_pred_finetuned = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_prediction.csv')

In [None]:
# encode: replace the numeric class ids by their text labels in every fine-tuned prediction frame
# NOTE: Series.map() yields NaN for values that are not keys of the mapping, so running
# this cell a second time on already-mapped columns turns them into NaN
label_mapping = {1: 'positive', 0: 'negative'}
for finetuned_df in (bert_pred_finetuned,
                     roberta_pred_finetuned,
                     distilbert_pred_finetuned,
                     distilroberta_pred_finetuned,
                     finbert_pred_finetuned):
    for label_col in ('prediction', 'actual'):
        finetuned_df[label_col] = finetuned_df[label_col].map(label_mapping)

In [None]:
# format to datetime (both FinBERT prediction frames, so 'week_date' can be matched against test_df dates)
for finbert_df in (finbert_pred_pretrained, finbert_pred_finetuned):
    finbert_df['week_date'] = pd.to_datetime(finbert_df['week_date'])

In [None]:
# rename column: give the two 'prediction' columns distinct names so both can sit in one merged frame
# (done in place - from here on finbert_pred_finetuned no longer has a 'prediction' column)
finbert_pred_pretrained.rename(columns={'prediction': 'pred_pretrained'}, inplace=True)
finbert_pred_finetuned.rename(columns={'prediction': 'pred_finetuned'}, inplace=True)

In [None]:
# create a new column 'news_number' by grouping by 'company' and 'week_date' and using cumcount
# (cumcount() numbers the rows of each group from 0 in their current order; +1 makes it 1-based)
finbert_pred_pretrained['news_number'] = finbert_pred_pretrained.groupby(['company', 'week_date']).cumcount() + 1
finbert_pred_finetuned['news_number'] = finbert_pred_finetuned.groupby(['company', 'week_date']).cumcount() + 1

In [None]:
# load test data
# (fresh read from disk: replaces any test_df left over from the previous sections)
file_path_test = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv'
test_df = pd.read_csv(file_path_test)

In [None]:
# convert to date format
test_df['start_date'] = pd.to_datetime(test_df['start_date'], format='%Y-%m-%d')

# filter out no change
# .copy() makes the filtered test_df an independent frame, so the columns added
# to it in the next cells ('combined_text', 'news_number') do not trigger
# pandas' SettingWithCopyWarning about writing to a slice of another DataFrame
test_df = test_df[test_df['price_direction'] != 'no change'].copy()

In [None]:
# define function to concatenate the text columns of one row, separated by single spaces
def concatenate_columns(row):
    """Return "<headline> <situation> <eventtype>" for one row.

    `row` is anything indexable by those three keys (e.g. a DataFrame row
    passed by `apply(..., axis=1)`); values are formatted as in an f-string.
    """
    return "{} {} {}".format(row['headline'], row['situation'], row['eventtype'])

# apply function to concatenate columns headline, situation, and eventtype for dataset
# (axis=1 passes one row at a time to concatenate_columns)
test_df['combined_text'] = test_df.apply(concatenate_columns, axis=1)

In [None]:
# create a new column 'news_number' by grouping by 'permco' and 'start_date' and using cumcount
# (1-based position of each row within its permco / start_date group, in current row order;
#  the merges below pair permco/start_date/news_number with company/week_date/news_number
#  of the prediction frames)
# NOTE(review): the pairing relies on test_df and the prediction files listing the news of a
# company-week in the same order and over the same rows - confirm
test_df['news_number'] = test_df.groupby(['permco', 'start_date']).cumcount() + 1

In [None]:
# merge dataframe
# step 1: attach the fine-tuned FinBERT output to every test news item (left join keeps all test rows)
combined_df_finbert = test_df[['permco', 'start_date', 'news_number', 'combined_text', 'eventtype']].merge(
    finbert_pred_finetuned,
    how='left',
    left_on=['permco', 'start_date', 'news_number'],
    right_on=['company', 'week_date', 'news_number'],
)
# the test-side key columns duplicate 'company' / 'week_date', so drop them
combined_df_finbert = combined_df_finbert.drop(columns=['permco', 'start_date'])

# step 2: add the pre-trained prediction of the same company / week / news item
combined_df_finbert = combined_df_finbert.merge(
    finbert_pred_pretrained[['company', 'week_date', 'news_number', 'pred_pretrained']],
    how='left',
    on=['company', 'week_date', 'news_number'],
)

In [None]:
# keep and order the columns needed for the comparison
combined_df_finbert = combined_df_finbert[[
    'company', 'week_date', 'combined_text', 'eventtype',
    'pred_pretrained', 'pred_finetuned', 'probability_neg', 'probability_pos', 'actual',
]]

# rows where the fine-tuned model says negative while both the pre-trained model and the actual label are positive
finetuned_only_negative = (
    (combined_df_finbert['pred_finetuned'] == 'negative')
    & (combined_df_finbert['actual'] == 'positive')
    & (combined_df_finbert['pred_pretrained'] == 'positive')
)
# ... restricted to the event types listed here
event_selected = combined_df_finbert['eventtype'].isin([
    'Debt Financing Related',
    'Discontinued Operations/Downsizings',
    'Bankruptcy - Other',
    'Delistings',
    'Seeking to Sell/Divest',
])
combined_df_finbert[finetuned_only_negative & event_selected]

## **Fine-tuned News Analysis Wordcloud**

### **BERT**

In [None]:
# format to datetime
bert_pred_finetuned['week_date'] = pd.to_datetime(bert_pred_finetuned['week_date'])

# add news number to differentiate multiple news items in a week
bert_pred_finetuned['news_number'] = bert_pred_finetuned.groupby(['company', 'week_date']).cumcount() + 1

# merge test dataset with fine-tuned prediction (left join: every test row is kept)
combined_df_bert = test_df[['permco', 'start_date', 'news_number', 'combined_text', 'eventtype']].merge(
    bert_pred_finetuned,
    how='left',
    left_on=['permco', 'start_date', 'news_number'],
    right_on=['company', 'week_date', 'news_number'],
)
combined_df_bert = combined_df_bert.drop(columns=['permco', 'start_date'])

# order columns
combined_df_bert = combined_df_bert[[
    'company', 'week_date', 'combined_text', 'eventtype',
    'probability_neg', 'probability_pos', 'prediction', 'actual',
]]

# remove rows without news (all three concatenated text fields are the 'no_headlines' placeholder)
combined_df_bert = combined_df_bert[combined_df_bert['combined_text'] != 'no_headlines no_headlines no_headlines']

# number of rows in a 10% slice of the remaining rows (rounded down)
n_top_bert = int(len(combined_df_bert) * 0.1)

# the 10% of rows with the highest positive probability
top_10percent_pos_bert = combined_df_bert.nlargest(n_top_bert, 'probability_pos')

# the 10% of rows with the highest negative probability
top_10percent_neg_bert = combined_df_bert.nlargest(n_top_bert, 'probability_neg')

**Wordcloud**

In [None]:
#set up nlp environment
# (same pipeline as loaded for the first word clouds; repeated so this section can be run on its own)
nlp = spacy.load("en_core_web_sm")

In [None]:
# load crsp train to get company names and tickers to be excluded
# (only the 'ticker' and 'comnam' columns are read from it below)
crsp_train = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/crsp_train.csv')

In [None]:
#define a function to tokenize
def tokenize(text):
    """Return the cleaned tokens of `text`.

    Uses the module-level spaCy pipeline `nlp`; stop words and punctuation
    are left out and each kept token is its lower-cased lemma.
    """
    kept = (tok for tok in nlp(text) if not (tok.is_stop or tok.is_punct))
    return [tok.lemma_.lower() for tok in kept]

#apply function to the news for top10% positive
top_10percent_pos_bert['tokens'] = top_10percent_pos_bert['combined_text'].apply(tokenize)

#apply function to the news for top10% negative
top_10percent_neg_bert['tokens'] = top_10percent_neg_bert['combined_text'].apply(tokenize)

In [None]:
# remove company related tokens (company name and ticker)
coy_text = list(np.concatenate((crsp_train['ticker'].unique(), crsp_train['comnam'].unique())))

# tokenize unwanted tokens
# (raw lower-cased token text of every ticker / company name, not lemmas)
tokens_to_remove = [token.text.lower() for s in coy_text for token in nlp(s)]
# every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python ('idexx' 'year' would become 'idexxyear',
# so neither 'idexx' nor 'year' would be filtered out)
tokens_to_remove.extend(['company', 'incorporation', 'corporation', 'incorporated',
                         'inc.', 'corp.', 'co.', 'cvs', 'hboc', 'idexx',
                         'year', 'ago', 'million', 'usd', 'united', 'states', 'us']) #add more tokens to be excluded

# set snapshot of the list above: a membership test against a set is O(1),
# whereas `token not in <list>` rescans the whole list for every single token.
# Rebuild this set if tokens_to_remove is edited after this point.
tokens_to_remove_set = set(tokens_to_remove)

# function to filter out unwanted tokens
def filter_tokens(tokens):
    """Return the entries of `tokens` that are not in the exclusion set.

    Order and duplicates of the kept tokens are preserved; the comparison is
    an exact string match against `tokens_to_remove_set`.
    """
    return [token for token in tokens if token not in tokens_to_remove_set]

# apply the function to the 'tokens' column
top_10percent_pos_bert['tokens'] = top_10percent_pos_bert['tokens'].apply(filter_tokens)
top_10percent_neg_bert['tokens'] = top_10percent_neg_bert['tokens'].apply(filter_tokens)

In [None]:
# flatten list of tokens
# (one flat list per group: all tokens of the top-10% positive rows, and of the top-10% negative rows)
all_tokens_pos_bert = [token for sublist in top_10percent_pos_bert['tokens'] for token in sublist]
all_tokens_neg_bert = [token for sublist in top_10percent_neg_bert['tokens'] for token in sublist]

# convert list of tokens to a single string
# (WordCloud.generate() below is given one text string)
all_tokens_pos_bert_str = ' '.join(all_tokens_pos_bert)
all_tokens_neg_bert_str = ' '.join(all_tokens_neg_bert)

In [None]:
# generate word cloud
wordcloud_bert_pos = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_pos_bert_str)
wordcloud_bert_neg = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_neg_bert_str)

In [None]:
# plot word cloud for top10% positive
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_bert_pos, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned BERT Top 10% Positive', fontsize=20)

plt.show()

In [None]:
# plot word cloud for top10% negative
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_bert_neg, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned BERT Top 10% Negative', fontsize=20)

plt.show()

### **RoBERTa**

In [None]:
# format to datetime
roberta_pred_finetuned['week_date'] = pd.to_datetime(roberta_pred_finetuned['week_date'])

# add news number to differentiate multiple news items in a week
roberta_pred_finetuned['news_number'] = roberta_pred_finetuned.groupby(['company', 'week_date']).cumcount() + 1

# merge test dataset with fine-tuned prediction (left join: every test row is kept)
combined_df_roberta = test_df[['permco', 'start_date', 'news_number', 'combined_text', 'eventtype']].merge(
    roberta_pred_finetuned,
    how='left',
    left_on=['permco', 'start_date', 'news_number'],
    right_on=['company', 'week_date', 'news_number'],
)
combined_df_roberta = combined_df_roberta.drop(columns=['permco', 'start_date'])

# order columns
combined_df_roberta = combined_df_roberta[[
    'company', 'week_date', 'combined_text', 'eventtype',
    'probability_neg', 'probability_pos', 'prediction', 'actual',
]]

# remove rows without news (all three concatenated text fields are the 'no_headlines' placeholder)
combined_df_roberta = combined_df_roberta[combined_df_roberta['combined_text'] != 'no_headlines no_headlines no_headlines']

# number of rows in a 10% slice of the remaining rows (rounded down)
n_top_roberta = int(len(combined_df_roberta) * 0.1)

# the 10% of rows with the highest positive probability
top_10percent_pos_roberta = combined_df_roberta.nlargest(n_top_roberta, 'probability_pos')

# the 10% of rows with the highest negative probability
top_10percent_neg_roberta = combined_df_roberta.nlargest(n_top_roberta, 'probability_neg')

**Wordcloud**

In [None]:
# tokenize() and filter_tokens() used below are the helpers defined in the BERT
# section of this notebook; those cells must have been run first

#apply function to the news for top10% positive
top_10percent_pos_roberta['tokens'] = top_10percent_pos_roberta['combined_text'].apply(tokenize)

#apply function to the news for top10% negative
top_10percent_neg_roberta['tokens'] = top_10percent_neg_roberta['combined_text'].apply(tokenize)

In [None]:
# apply the function to the 'tokens' column
# (removes the tokens listed in tokens_to_remove)
top_10percent_pos_roberta['tokens'] = top_10percent_pos_roberta['tokens'].apply(filter_tokens)
top_10percent_neg_roberta['tokens'] = top_10percent_neg_roberta['tokens'].apply(filter_tokens)

In [None]:
# flatten list of tokens
all_tokens_pos_roberta = [token for sublist in top_10percent_pos_roberta['tokens'] for token in sublist]
all_tokens_neg_roberta = [token for sublist in top_10percent_neg_roberta['tokens'] for token in sublist]

# convert list of tokens to a single string
# (WordCloud.generate() below is given one text string)
all_tokens_pos_roberta_str = ' '.join(all_tokens_pos_roberta)
all_tokens_neg_roberta_str = ' '.join(all_tokens_neg_roberta)

In [None]:
# generate word cloud
wordcloud_roberta_pos = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_pos_roberta_str)
wordcloud_roberta_neg = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_neg_roberta_str)

In [None]:
# plot word cloud for top10% positive
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_roberta_pos, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned RoBERTa Top 10% Positive', fontsize=20)

plt.show()

In [None]:
# plot word cloud for top10% negative
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_roberta_neg, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned RoBERTa Top 10% Negative', fontsize=20)

plt.show()

### **DistilBERT**

In [None]:
# format to datetime
distilbert_pred_finetuned['week_date'] = pd.to_datetime(distilbert_pred_finetuned['week_date'])

# add news number to differentiate multiple news items in a week
distilbert_pred_finetuned['news_number'] = distilbert_pred_finetuned.groupby(['company', 'week_date']).cumcount() + 1

# merge test dataset with fine-tuned prediction (left join: every test row is kept)
combined_df_distilbert = test_df[['permco', 'start_date', 'news_number', 'combined_text', 'eventtype']].merge(
    distilbert_pred_finetuned,
    how='left',
    left_on=['permco', 'start_date', 'news_number'],
    right_on=['company', 'week_date', 'news_number'],
)
combined_df_distilbert = combined_df_distilbert.drop(columns=['permco', 'start_date'])

# order columns
combined_df_distilbert = combined_df_distilbert[[
    'company', 'week_date', 'combined_text', 'eventtype',
    'probability_neg', 'probability_pos', 'prediction', 'actual',
]]

# remove rows without news (all three concatenated text fields are the 'no_headlines' placeholder)
combined_df_distilbert = combined_df_distilbert[combined_df_distilbert['combined_text'] != 'no_headlines no_headlines no_headlines']

# number of rows in a 10% slice of the remaining rows (rounded down)
n_top_distilbert = int(len(combined_df_distilbert) * 0.1)

# the 10% of rows with the highest positive probability
top_10percent_pos_distilbert = combined_df_distilbert.nlargest(n_top_distilbert, 'probability_pos')

# the 10% of rows with the highest negative probability
top_10percent_neg_distilbert = combined_df_distilbert.nlargest(n_top_distilbert, 'probability_neg')

**Wordcloud**

In [None]:
# tokenize() and filter_tokens() used below are the helpers defined in the BERT
# section of this notebook; those cells must have been run first

#apply function to the news for top10% positive
top_10percent_pos_distilbert['tokens'] = top_10percent_pos_distilbert['combined_text'].apply(tokenize)

#apply function to the news for top10% negative
top_10percent_neg_distilbert['tokens'] = top_10percent_neg_distilbert['combined_text'].apply(tokenize)

In [None]:
# apply the function to the 'tokens' column
# (removes the tokens listed in tokens_to_remove)
top_10percent_pos_distilbert['tokens'] = top_10percent_pos_distilbert['tokens'].apply(filter_tokens)
top_10percent_neg_distilbert['tokens'] = top_10percent_neg_distilbert['tokens'].apply(filter_tokens)

In [None]:
# flatten list of tokens
all_tokens_pos_distilbert = [token for sublist in top_10percent_pos_distilbert['tokens'] for token in sublist]
all_tokens_neg_distilbert = [token for sublist in top_10percent_neg_distilbert['tokens'] for token in sublist]

# convert list of tokens to a single string
# (WordCloud.generate() below is given one text string)
all_tokens_pos_distilbert_str = ' '.join(all_tokens_pos_distilbert)
all_tokens_neg_distilbert_str = ' '.join(all_tokens_neg_distilbert)

In [None]:
# generate word cloud
wordcloud_distilbert_pos = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_pos_distilbert_str)
wordcloud_distilbert_neg = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_neg_distilbert_str)

In [None]:
# plot word cloud for top10% positive
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_distilbert_pos, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned DistilBERT Top 10% Positive', fontsize=20)

plt.show()

In [None]:
# plot word cloud for top10% negative
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_distilbert_neg, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned DistilBERT Top 10% Negative', fontsize=20)

plt.show()

### **DistilRoBERTa**

In [None]:
# format to datetime
distilroberta_pred_finetuned['week_date'] = pd.to_datetime(distilroberta_pred_finetuned['week_date'])

# add news number to differentiate multiple news items in a week
distilroberta_pred_finetuned['news_number'] = distilroberta_pred_finetuned.groupby(['company', 'week_date']).cumcount() + 1

# merge test dataset with fine-tuned prediction (left join: every test row is kept)
combined_df_distilroberta = test_df[['permco', 'start_date', 'news_number', 'combined_text', 'eventtype']].merge(
    distilroberta_pred_finetuned,
    how='left',
    left_on=['permco', 'start_date', 'news_number'],
    right_on=['company', 'week_date', 'news_number'],
)
combined_df_distilroberta = combined_df_distilroberta.drop(columns=['permco', 'start_date'])

# order columns
combined_df_distilroberta = combined_df_distilroberta[[
    'company', 'week_date', 'combined_text', 'eventtype',
    'probability_neg', 'probability_pos', 'prediction', 'actual',
]]

# remove rows without news (all three concatenated text fields are the 'no_headlines' placeholder)
combined_df_distilroberta = combined_df_distilroberta[combined_df_distilroberta['combined_text'] != 'no_headlines no_headlines no_headlines']

# number of rows in a 10% slice of the remaining rows (rounded down)
n_top_distilroberta = int(len(combined_df_distilroberta) * 0.1)

# the 10% of rows with the highest positive probability
top_10percent_pos_distilroberta = combined_df_distilroberta.nlargest(n_top_distilroberta, 'probability_pos')

# the 10% of rows with the highest negative probability
top_10percent_neg_distilroberta = combined_df_distilroberta.nlargest(n_top_distilroberta, 'probability_neg')

**Wordcloud**

In [None]:
# tokenize() and filter_tokens() used below are the helpers defined in the BERT
# section of this notebook; those cells must have been run first

#apply function to the news for top10% positive
top_10percent_pos_distilroberta['tokens'] = top_10percent_pos_distilroberta['combined_text'].apply(tokenize)

#apply function to the news for top10% negative
top_10percent_neg_distilroberta['tokens'] = top_10percent_neg_distilroberta['combined_text'].apply(tokenize)

In [None]:
# apply the function to the 'tokens' column
# (removes the tokens listed in tokens_to_remove)
top_10percent_pos_distilroberta['tokens'] = top_10percent_pos_distilroberta['tokens'].apply(filter_tokens)
top_10percent_neg_distilroberta['tokens'] = top_10percent_neg_distilroberta['tokens'].apply(filter_tokens)

In [None]:
# flatten list of tokens
all_tokens_pos_distilroberta = [token for sublist in top_10percent_pos_distilroberta['tokens'] for token in sublist]
all_tokens_neg_distilroberta = [token for sublist in top_10percent_neg_distilroberta['tokens'] for token in sublist]

# convert list of tokens to a single string
# (WordCloud.generate() below is given one text string)
all_tokens_pos_distilroberta_str = ' '.join(all_tokens_pos_distilroberta)
all_tokens_neg_distilroberta_str = ' '.join(all_tokens_neg_distilroberta)

In [None]:
# generate word cloud
wordcloud_distilroberta_pos = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_pos_distilroberta_str)
wordcloud_distilroberta_neg = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_neg_distilroberta_str)

In [None]:
# plot word cloud for top10% positive
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_distilroberta_pos, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned DistilRoBERTa Top 10% Positive', fontsize=20)

plt.show()

In [None]:
# plot word cloud for top10% negative
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_distilroberta_neg, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned DistilRoBERTa Top 10% Negative', fontsize=20)

plt.show()

### **FinBERT**

In [None]:
# format to datetime
finbert_pred_finetuned['week_date'] = pd.to_datetime(finbert_pred_finetuned['week_date'])

# add news number to differentiate multiple news items in a week
finbert_pred_finetuned['news_number'] = finbert_pred_finetuned.groupby(['company', 'week_date']).cumcount() + 1

# The "Headline Analysis Data Inconsistency (FinBERT)" section renames this
# frame's 'prediction' column to 'pred_finetuned' in place, so after running
# the notebook top to bottom there is no 'prediction' column left and the
# column selection below would raise a KeyError. Work on a renamed copy instead:
# rename() ignores labels that are absent, so this is a no-op when the frame
# was freshly loaded, and finbert_pred_finetuned itself is left untouched.
finbert_pred_for_merge = finbert_pred_finetuned.rename(columns={'pred_finetuned': 'prediction'})

# merge test dataset with fine-tuned prediction (left join: every test row is kept)
combined_df_finbert = pd.merge(test_df[['permco', 'start_date', 'news_number', 'combined_text', 'eventtype']], finbert_pred_for_merge, how='left', left_on=['permco', 'start_date', 'news_number'], right_on=['company', 'week_date', 'news_number'])
combined_df_finbert.drop(['permco', 'start_date'], axis=1, inplace=True)

# order columns
combined_df_finbert = combined_df_finbert[['company', 'week_date', 'combined_text', 'eventtype', 'probability_neg', 'probability_pos', 'prediction','actual']]

# remove rows without news (all three concatenated text fields are the 'no_headlines' placeholder)
combined_df_finbert = combined_df_finbert[combined_df_finbert['combined_text'] != 'no_headlines no_headlines no_headlines']

# the 10% of rows (count rounded down) with the highest positive probability
top_10percent_pos_finbert = combined_df_finbert.nlargest(int(len(combined_df_finbert) * 0.1), 'probability_pos')

# the 10% of rows (count rounded down) with the highest negative probability
top_10percent_neg_finbert = combined_df_finbert.nlargest(int(len(combined_df_finbert) * 0.1), 'probability_neg')

**Wordcloud**

In [None]:
# tokenize() and filter_tokens() used below are the helpers defined in the BERT
# section of this notebook; those cells must have been run first

#apply function to the news for top10% positive
top_10percent_pos_finbert['tokens'] = top_10percent_pos_finbert['combined_text'].apply(tokenize)

#apply function to the news for top10% negative
top_10percent_neg_finbert['tokens'] = top_10percent_neg_finbert['combined_text'].apply(tokenize)

In [None]:
# apply the function to the 'tokens' column
# (removes the tokens listed in tokens_to_remove)
top_10percent_pos_finbert['tokens'] = top_10percent_pos_finbert['tokens'].apply(filter_tokens)
top_10percent_neg_finbert['tokens'] = top_10percent_neg_finbert['tokens'].apply(filter_tokens)

In [None]:
# flatten list of tokens
all_tokens_pos_finbert = [token for sublist in top_10percent_pos_finbert['tokens'] for token in sublist]
all_tokens_neg_finbert = [token for sublist in top_10percent_neg_finbert['tokens'] for token in sublist]

# convert list of tokens to a single string
# (WordCloud.generate() below is given one text string)
all_tokens_pos_finbert_str = ' '.join(all_tokens_pos_finbert)
all_tokens_neg_finbert_str = ' '.join(all_tokens_neg_finbert)

In [None]:
# generate word cloud
wordcloud_finbert_pos = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_pos_finbert_str)
wordcloud_finbert_neg = WordCloud(width=400, height=250, background_color='white').generate(all_tokens_neg_finbert_str)

In [None]:
# plot word cloud for top10% positive
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_finbert_pos, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned FinBERT Top 10% Positive', fontsize=20)

plt.show()

In [None]:
# plot word cloud for top10% negative
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_finbert_neg, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame, the image is not a data plot
ax.set_title(f'Fine-Tuned FinBERT Top 10% Negative', fontsize=20)

plt.show()

# **PORTFOLIO ANALYSIS WITHOUT TRANSACTION COST**

## **Pre-Trained**

In [None]:
# load data
# (portfolio result files of the five pre-trained models, one CSV per model; the cells
#  below read their 'date', cum_market_return and cum_ELS / cum_VLS / cum_EL return columns)
bert_portfolio_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Pre Trained/bert_portfolio_pretrained.csv')
roberta_portfolio_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Pre Trained/roberta_portfolio_pretrained.csv')
distilbert_portfolio_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Pre Trained/distilbert_portfolio_pretrained.csv')
distilroberta_portfolio_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Pre Trained/distilroberta_portfolio_pretrained.csv')
finbert_portfolio_pretrained = pd.read_csv('/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Pre Trained/finbert_portfolio_pretrained.csv')

In [None]:
# format to datetime: parse the 'date' column of every pre-trained portfolio frame
for portfolio_df in (bert_portfolio_pretrained,
                     roberta_portfolio_pretrained,
                     distilbert_portfolio_pretrained,
                     distilroberta_portfolio_pretrained,
                     finbert_portfolio_pretrained):
    portfolio_df['date'] = pd.to_datetime(portfolio_df['date'])

In [None]:
# add suffix to column name
# (every column gets the model name appended, e.g. 'date' -> 'date_bert', so the
#  frames can be placed side by side without column-name clashes)
# NOTE: add_suffix applies to all columns, so re-running this cell appends the suffix again
bert_portfolio_pretrained = bert_portfolio_pretrained.add_suffix('_bert')
roberta_portfolio_pretrained = roberta_portfolio_pretrained.add_suffix('_roberta')
distilbert_portfolio_pretrained = distilbert_portfolio_pretrained.add_suffix('_distilbert')
distilroberta_portfolio_pretrained = distilroberta_portfolio_pretrained.add_suffix('_distilroberta')
finbert_portfolio_pretrained = finbert_portfolio_pretrained.add_suffix('_finbert')

### **EW L-S**

In [None]:
# concat all models' EW-LS portfolio
# (axis=1 places the columns side by side, aligned on the row index; the date and the
#  market return are taken from the BERT frame only)
# NOTE(review): assumes all five frames list the same dates in the same row order - confirm
ew_ls = pd.concat([bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert','cum_ELS_return_bert']],
                   roberta_portfolio_pretrained['cum_ELS_return_roberta'],
                   distilbert_portfolio_pretrained['cum_ELS_return_distilbert'],
                   distilroberta_portfolio_pretrained['cum_ELS_return_distilroberta'],
                   finbert_portfolio_pretrained['cum_ELS_return_finbert']], axis=1)

In [None]:
# rename column
# (the new names are the series labels shown in the plot legend below)
ew_ls.rename(columns={'date_bert': 'date',
                      'cum_ELS_return_bert': 'BERT',
                      'cum_ELS_return_distilbert':'DistilBERT',
                      'cum_ELS_return_distilroberta':'DistilRoBERTa',
                      'cum_ELS_return_finbert':'FinBERT',
                      'cum_ELS_return_roberta':'RoBERTa',
                      'cum_market_return_bert':'Market'}, inplace=True)

In [None]:
# reshape the DataFrame from wide to long format
# (one row per date and model, so plotly can draw one coloured line per model)
df_long = ew_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# create the line plot
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Long-Short Strategy',
    height=800,
    width=900,
)

# customize the x-axis to display date labels correctly (tick labels show the year only)
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},  # font size for the x-axis title
    tickfont={'size': 14},    # font size for the x-axis ticks
)

# customize the y-axis (fixed range)
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1, 1.2],
    title_font={'size': 18},  # font size for the y-axis title
    tickfont={'size': 14},    # font size for the y-axis ticks
)

# set different line styles for different models
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot'
}


def _style_trace(trace):
    """Apply the dash style of `trace`; the 'Market' line is additionally drawn in black."""
    if trace.name not in line_styles:
        return
    line_colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line={'dash': line_styles[trace.name], 'color': line_colour})


# apply the line styles and set 'Market' color to black
fig.for_each_trace(_style_trace)

# show the plot
fig.show()

### **VW L-S**

In [None]:
# concat all models' VW-LS (cum_VLS_return) portfolio columns
# (axis=1 places the columns side by side, aligned on the row index; the date and the
#  market return are taken from the BERT frame only)
# NOTE(review): assumes all five frames list the same dates in the same row order - confirm
vw_ls = pd.concat([bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert','cum_VLS_return_bert']],
                   roberta_portfolio_pretrained['cum_VLS_return_roberta'],
                   distilbert_portfolio_pretrained['cum_VLS_return_distilbert'],
                   distilroberta_portfolio_pretrained['cum_VLS_return_distilroberta'],
                   finbert_portfolio_pretrained['cum_VLS_return_finbert']], axis=1)

In [None]:
# rename column
# (the new names are the series labels shown in the plot legend below)
vw_ls.rename(columns={'date_bert': 'date',
                      'cum_market_return_bert':'Market',
                      'cum_VLS_return_bert': 'BERT',
                      'cum_VLS_return_roberta':'RoBERTa',
                      'cum_VLS_return_distilbert':'DistilBERT',
                      'cum_VLS_return_distilroberta':'DistilRoBERTa',
                      'cum_VLS_return_finbert':'FinBERT'}, inplace=True)

In [None]:
# Reshape the DataFrame from wide to long format
# (one row per date and model, so plotly can draw one coloured line per model)
df_long = pd.melt(vw_ls, id_vars=['date'], value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
                  var_name='model', value_name='log_return')

# Create the line plot
# (vw_ls holds the cum_VLS_return_* columns, i.e. the value-weighted long-short
#  portfolios, so the title must say "Long-Short", matching the EW L-S chart)
fig = px.line(df_long,
              x='date',
              y='log_return',
              color='model',
              title='Value Weighted Long-Short Strategy',
              height=800,
              width=900)

# Customize the x-axis to display date labels correctly (tick labels show the year only)
fig.update_xaxes(title_text='Date', tickformat='%Y',
                 title_font=dict(size=18),  # Font size for the x-axis title
                 tickfont=dict(size=14))    # Font size for the x-axis ticks


# Customize the y-axis (fixed range)
fig.update_yaxes(title_text='Cum. Log Return',
                 range=[-1, 1.2],
                 title_font=dict(size=18),  # Font size for the y-axis title
                 tickfont=dict(size=14))    # Font size for the y-axis ticks

# Set different line styles for different models
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot'
}

# Apply the line styles and set 'Market' color to black
# (traces whose name is not in line_styles are left unchanged; every other
#  trace keeps the colour plotly assigned to it)
fig.for_each_trace(lambda trace: trace.update(
    line=dict(dash=line_styles[trace.name], color='black' if trace.name == 'Market' else trace.line.color)
) if trace.name in line_styles else ())

# Show the plot
fig.show()

### **EW L**

In [None]:
# Equal-weighted long (EL) cumulative returns of the pre-trained models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
ew_l = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_EL_return_bert']],
        roberta_portfolio_pretrained['cum_EL_return_roberta'],
        distilbert_portfolio_pretrained['cum_EL_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_EL_return_distilroberta'],
        finbert_portfolio_pretrained['cum_EL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_l.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_EL_return_bert': 'BERT',
        'cum_EL_return_roberta': 'RoBERTa',
        'cum_EL_return_distilbert': 'DistilBERT',
        'cum_EL_return_distilroberta': 'DistilRoBERTa',
        'cum_EL_return_finbert': 'FinBERT',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the equal-weighted long portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = ew_l.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Long Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-0.5, 1.6])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **VW L**

In [None]:
# Value-weighted long (VL) cumulative returns of the pre-trained models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
vw_l = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_VL_return_bert']],
        roberta_portfolio_pretrained['cum_VL_return_roberta'],
        distilbert_portfolio_pretrained['cum_VL_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_VL_return_distilroberta'],
        finbert_portfolio_pretrained['cum_VL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_l.rename(
    {
        'date_bert': 'date',
        'cum_VL_return_bert': 'BERT',
        'cum_VL_return_distilbert': 'DistilBERT',
        'cum_VL_return_distilroberta': 'DistilRoBERTa',
        'cum_VL_return_finbert': 'FinBERT',
        'cum_VL_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the value-weighted long portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = vw_l.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Long Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-0.5, 1.6])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **EW S**

In [None]:
# Equal-weighted short (ES) cumulative returns of the pre-trained models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
ew_s = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_ES_return_bert']],
        roberta_portfolio_pretrained['cum_ES_return_roberta'],
        distilbert_portfolio_pretrained['cum_ES_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_ES_return_distilroberta'],
        finbert_portfolio_pretrained['cum_ES_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_s.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_ES_return_bert': 'BERT',
        'cum_ES_return_roberta': 'RoBERTa',
        'cum_ES_return_distilbert': 'DistilBERT',
        'cum_ES_return_distilroberta': 'DistilRoBERTa',
        'cum_ES_return_finbert': 'FinBERT',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the equal-weighted short portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = ew_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Short Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1.2])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **VW S**

In [None]:
# Value-weighted short (VS) cumulative returns of the pre-trained models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
vw_s = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_VS_return_bert']],
        roberta_portfolio_pretrained['cum_VS_return_roberta'],
        distilbert_portfolio_pretrained['cum_VS_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_VS_return_distilroberta'],
        finbert_portfolio_pretrained['cum_VS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_s.rename(
    {
        'date_bert': 'date',
        'cum_VS_return_bert': 'BERT',
        'cum_VS_return_distilbert': 'DistilBERT',
        'cum_VS_return_distilroberta': 'DistilRoBERTa',
        'cum_VS_return_finbert': 'FinBERT',
        'cum_VS_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the value-weighted short portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = vw_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Short Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1.2])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

## **Fine-Tuned**

In [None]:
# Load the fine-tuned portfolio results: one CSV per model, all in the same Drive folder.
finetuned_dir = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune'
bert_portfolio_finetuned = pd.read_csv(f'{finetuned_dir}/bert_portfolio.csv')
roberta_portfolio_finetuned = pd.read_csv(f'{finetuned_dir}/roberta_portfolio.csv')
distilbert_portfolio_finetuned = pd.read_csv(f'{finetuned_dir}/distilbert_portfolio.csv')
distilroberta_portfolio_finetuned = pd.read_csv(f'{finetuned_dir}/distilroberta_portfolio.csv')
finbert_portfolio_finetuned = pd.read_csv(f'{finetuned_dir}/finbert_portfolio.csv')

In [None]:
# Parse the 'date' column of every fine-tuned frame into datetimes (assigned in place).
for frame in (
    bert_portfolio_finetuned,
    roberta_portfolio_finetuned,
    distilbert_portfolio_finetuned,
    distilroberta_portfolio_finetuned,
    finbert_portfolio_finetuned,
):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Tag every column with its model name so the frames can be concatenated without clashes.
# Running this cell twice appends the suffix twice.
bert_portfolio_finetuned = bert_portfolio_finetuned.rename(columns=lambda col: f'{col}_bert')
roberta_portfolio_finetuned = roberta_portfolio_finetuned.rename(columns=lambda col: f'{col}_roberta')
distilbert_portfolio_finetuned = distilbert_portfolio_finetuned.rename(columns=lambda col: f'{col}_distilbert')
distilroberta_portfolio_finetuned = distilroberta_portfolio_finetuned.rename(columns=lambda col: f'{col}_distilroberta')
finbert_portfolio_finetuned = finbert_portfolio_finetuned.rename(columns=lambda col: f'{col}_finbert')

### **EW L-S**

In [None]:
# Equal-weighted long-short (ELS) cumulative returns of the fine-tuned models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
ew_ls = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_ELS_return_bert']],
        roberta_portfolio_finetuned['cum_ELS_return_roberta'],
        distilbert_portfolio_finetuned['cum_ELS_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_ELS_return_distilroberta'],
        finbert_portfolio_finetuned['cum_ELS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_ls.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_ELS_return_bert': 'BERT',
        'cum_ELS_return_roberta': 'RoBERTa',
        'cum_ELS_return_distilbert': 'DistilBERT',
        'cum_ELS_return_distilroberta': 'DistilRoBERTa',
        'cum_ELS_return_finbert': 'FinBERT',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, model).
# NOTE: the plotting cell that follows recomputes this same df_long, so this cell is redundant.
df_long = ew_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

In [None]:
# Plot cumulative log returns of the fine-tuned equal-weighted long-short portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = ew_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Long-Short Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range, same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1, 1.2],
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **VW L-S**

In [None]:
# Value-weighted long-short (VLS) cumulative returns of the fine-tuned models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
vw_ls = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_VLS_return_bert']],
        roberta_portfolio_finetuned['cum_VLS_return_roberta'],
        distilbert_portfolio_finetuned['cum_VLS_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_VLS_return_distilroberta'],
        finbert_portfolio_finetuned['cum_VLS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_ls.rename(
    {
        'date_bert': 'date',
        'cum_VLS_return_bert': 'BERT',
        'cum_VLS_return_distilbert': 'DistilBERT',
        'cum_VLS_return_distilroberta': 'DistilRoBERTa',
        'cum_VLS_return_finbert': 'FinBERT',
        'cum_VLS_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the fine-tuned value-weighted long-short portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = vw_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Long-Short Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-1, 1.2])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **EW L**

In [None]:
# Equal-weighted long (EL) cumulative returns of the fine-tuned models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
ew_l = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_EL_return_bert']],
        roberta_portfolio_finetuned['cum_EL_return_roberta'],
        distilbert_portfolio_finetuned['cum_EL_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_EL_return_distilroberta'],
        finbert_portfolio_finetuned['cum_EL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_l.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_EL_return_bert': 'BERT',
        'cum_EL_return_roberta': 'RoBERTa',
        'cum_EL_return_distilbert': 'DistilBERT',
        'cum_EL_return_distilroberta': 'DistilRoBERTa',
        'cum_EL_return_finbert': 'FinBERT',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the fine-tuned equal-weighted long portfolios,
# one line per model plus the market benchmark.

# Reshape the DataFrame from wide to long format: one row per (date, model)
df_long = pd.melt(ew_l, id_vars=['date'], value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
                  var_name='model', value_name='log_return')

# Create the line plot.
# Title fixed: it was missing the 'Strategy' suffix that every other
# strategy plot in this notebook carries.
fig = px.line(df_long,
              x='date',
              y='log_return',
              color='model',
              title='Equal Weighted Long Strategy',
              height=800,
              width=900)

# Customize the x-axis: year-only tick labels
fig.update_xaxes(title_text='Date', tickformat='%Y',
                 title_font=dict(size=18),  # Font size for the x-axis title
                 tickfont=dict(size=14))    # Font size for the x-axis ticks

# Customize the y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return',
                 range=[-.5, 1.5])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot'
}

# Apply the line styles; 'Market' is forced to black, every other trace keeps
# the colour plotly express assigned to it. Traces not in line_styles are left alone.
fig.for_each_trace(lambda trace: trace.update(
    line=dict(dash=line_styles[trace.name], color='black' if trace.name == 'Market' else trace.line.color)
) if trace.name in line_styles else ())

# Show the plot
fig.show()

### **VW L**

In [None]:
# Value-weighted long (VL) cumulative returns of the fine-tuned models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
vw_l = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_VL_return_bert']],
        roberta_portfolio_finetuned['cum_VL_return_roberta'],
        distilbert_portfolio_finetuned['cum_VL_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_VL_return_distilroberta'],
        finbert_portfolio_finetuned['cum_VL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_l.rename(
    {
        'date_bert': 'date',
        'cum_VL_return_bert': 'BERT',
        'cum_VL_return_distilbert': 'DistilBERT',
        'cum_VL_return_distilroberta': 'DistilRoBERTa',
        'cum_VL_return_finbert': 'FinBERT',
        'cum_VL_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the fine-tuned value-weighted long portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = vw_l.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Long Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-0.5, 1.5])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **EW S**

In [None]:
# Equal-weighted short (ES) cumulative returns of the fine-tuned models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
ew_s = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_ES_return_bert']],
        roberta_portfolio_finetuned['cum_ES_return_roberta'],
        distilbert_portfolio_finetuned['cum_ES_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_ES_return_distilroberta'],
        finbert_portfolio_finetuned['cum_ES_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_s.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_ES_return_bert': 'BERT',
        'cum_ES_return_roberta': 'RoBERTa',
        'cum_ES_return_distilbert': 'DistilBERT',
        'cum_ES_return_distilroberta': 'DistilRoBERTa',
        'cum_ES_return_finbert': 'FinBERT',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the fine-tuned equal-weighted short portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = ew_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Short Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

### **VW S**

In [None]:
# Value-weighted short (VS) cumulative returns of the fine-tuned models, side by side.
# The BERT frame also supplies the date and market columns.
# NOTE(review): column-wise concat aligns on the index, so this presumably relies on
# all five frames sharing the same row order - confirm.
vw_s = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_VS_return_bert']],
        roberta_portfolio_finetuned['cum_VS_return_roberta'],
        distilbert_portfolio_finetuned['cum_VS_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_VS_return_distilroberta'],
        finbert_portfolio_finetuned['cum_VS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_s.rename(
    {
        'date_bert': 'date',
        'cum_VS_return_bert': 'BERT',
        'cum_VS_return_distilbert': 'DistilBERT',
        'cum_VS_return_distilroberta': 'DistilRoBERTa',
        'cum_VS_return_finbert': 'FinBERT',
        'cum_VS_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot cumulative log returns of the fine-tuned value-weighted short portfolios,
# one line per model plus the market benchmark.

# Wide -> long: one row per (date, model)
df_long = vw_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per value of the 'model' column
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Short Strategy',
    height=800,
    width=900,
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1.2])

# Dash pattern per legend entry: models solid, market benchmark dotted
line_styles = dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid')
line_styles['Market'] = 'dot'

# Restyle only the traces listed above; 'Market' is forced to black,
# the others keep the colour plotly express gave them.
for trace in fig.data:
    if trace.name in line_styles:
        trace.update(line=dict(
            dash=line_styles[trace.name],
            color='black' if trace.name == 'Market' else trace.line.color,
        ))

fig.show()

## **by Model**

### **BERT Pre-Trained**

In [None]:
# Relabel the BERT pre-trained columns with short strategy names for the legend.
# NOTE: this renames in place, so earlier cells that select the '_bert'-suffixed
# columns from this frame will raise KeyError if re-run after this cell.
bert_portfolio_pretrained.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_EL_return_bert': 'EW-L',
        'cum_ES_return_bert': 'EW-S',
        'cum_ELS_return_bert': 'EW-LS',
        'cum_VL_return_bert': 'VW-L',
        'cum_VS_return_bert': 'VW-S',
        'cum_VLS_return_bert': 'VW-LS',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot every strategy of the pre-trained BERT portfolio against the market benchmark.

# Wide -> long: one row per (date, strategy)
bert_long = bert_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per value of the 'strategy' column
fig = px.line(
    bert_long,
    x='date',
    y='log_return',
    color='strategy',
    height=800,
    width=900,
    title='BERT Portfolio Pre-Trained',
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range, same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Colour encodes the weighting scheme (EW vs VW); dash encodes the leg
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    if trace.name in strategy_styles:
        trace.update(line=dict(strategy_styles[trace.name]))

fig.show()

### **BERT Fine-Tuned**

In [None]:
# Relabel the BERT fine-tuned columns with short strategy names for the legend.
# NOTE: this renames in place, so earlier cells that select the '_bert'-suffixed
# columns from this frame will raise KeyError if re-run after this cell.
bert_portfolio_finetuned.rename(
    {
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_EL_return_bert': 'EW-L',
        'cum_ES_return_bert': 'EW-S',
        'cum_ELS_return_bert': 'EW-LS',
        'cum_VL_return_bert': 'VW-L',
        'cum_VS_return_bert': 'VW-S',
        'cum_VLS_return_bert': 'VW-LS',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot every strategy of the fine-tuned BERT portfolio against the market benchmark.

# Wide -> long: one row per (date, strategy)
bert_long = bert_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per value of the 'strategy' column
fig = px.line(
    bert_long,
    x='date',
    y='log_return',
    color='strategy',
    height=800,
    width=900,
    title='BERT Portfolio Fine-Tuned',
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range, same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Colour encodes the weighting scheme (EW vs VW); dash encodes the leg
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    if trace.name in strategy_styles:
        trace.update(line=dict(strategy_styles[trace.name]))

fig.show()

### **RoBERTa Pre-Trained**

In [None]:
# Relabel the RoBERTa pre-trained columns with short strategy names for the legend.
# NOTE: this renames in place, so earlier cells that select the '_roberta'-suffixed
# columns from this frame will raise KeyError if re-run after this cell.
roberta_portfolio_pretrained.rename(
    {
        'date_roberta': 'date',
        'cum_market_return_roberta': 'Market',
        'cum_EL_return_roberta': 'EW-L',
        'cum_ES_return_roberta': 'EW-S',
        'cum_ELS_return_roberta': 'EW-LS',
        'cum_VL_return_roberta': 'VW-L',
        'cum_VS_return_roberta': 'VW-S',
        'cum_VLS_return_roberta': 'VW-LS',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot every strategy of the pre-trained RoBERTa portfolio against the market benchmark.

# Wide -> long: one row per (date, strategy)
roberta_long = roberta_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per value of the 'strategy' column
fig = px.line(
    roberta_long,
    x='date',
    y='log_return',
    color='strategy',
    height=800,
    width=900,
    title='RoBERTa Portfolio Pre-Trained',
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range, same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Colour encodes the weighting scheme (EW vs VW); dash encodes the leg
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    if trace.name in strategy_styles:
        trace.update(line=dict(strategy_styles[trace.name]))

fig.show()

### **RoBERTa Fine-Tuned**

In [None]:
# Relabel the RoBERTa fine-tuned columns with short strategy names for the legend.
# NOTE: this renames in place, so earlier cells that select the '_roberta'-suffixed
# columns from this frame will raise KeyError if re-run after this cell.
roberta_portfolio_finetuned.rename(
    {
        'date_roberta': 'date',
        'cum_market_return_roberta': 'Market',
        'cum_EL_return_roberta': 'EW-L',
        'cum_ES_return_roberta': 'EW-S',
        'cum_ELS_return_roberta': 'EW-LS',
        'cum_VL_return_roberta': 'VW-L',
        'cum_VS_return_roberta': 'VW-S',
        'cum_VLS_return_roberta': 'VW-LS',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Plot every strategy of the fine-tuned RoBERTa portfolio against the market benchmark.

# Wide -> long: one row per (date, strategy)
roberta_long = roberta_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per value of the 'strategy' column
fig = px.line(
    roberta_long,
    x='date',
    y='log_return',
    color='strategy',
    height=800,
    width=900,
    title='RoBERTa Portfolio Fine-Tuned',
)

# X-axis: year-only tick labels
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Y-axis: fixed range, same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font={'size': 18},
    tickfont={'size': 14},
)

# Colour encodes the weighting scheme (EW vs VW); dash encodes the leg
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    if trace.name in strategy_styles:
        trace.update(line=dict(strategy_styles[trace.name]))

fig.show()

### **DistilBERT Pre-Trained**

In [None]:
# Swap the '_distilbert'-suffixed column names for the short strategy labels
# that the following cell melts and plots.
distilbert_portfolio_pretrained.rename(
    columns={
        'date_distilbert': 'date',
        'cum_market_return_distilbert': 'Market',
        # equal-weighted portfolios
        'cum_EL_return_distilbert': 'EW-L',
        'cum_ES_return_distilbert': 'EW-S',
        'cum_ELS_return_distilbert': 'EW-LS',
        # value-weighted portfolios
        'cum_VL_return_distilbert': 'VW-L',
        'cum_VS_return_distilbert': 'VW-S',
        'cum_VLS_return_distilbert': 'VW-LS',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, strategy) so Plotly can colour lines by strategy
distilbert_long = distilbert_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy
fig = px.line(
    distilbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilBERT Portfolio Pre-Trained',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range with the same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# Line colour and dash pattern for each strategy trace
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}


def _style_trace(trace):
    """Apply the colour/dash from strategy_styles; traces not listed there are left untouched."""
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **DistilBERT Fine-Tuned**

In [None]:
# Swap the '_distilbert'-suffixed column names for the short strategy labels
# that the following cell melts and plots.
distilbert_portfolio_finetuned.rename(
    columns={
        'date_distilbert': 'date',
        'cum_market_return_distilbert': 'Market',
        # equal-weighted portfolios
        'cum_EL_return_distilbert': 'EW-L',
        'cum_ES_return_distilbert': 'EW-S',
        'cum_ELS_return_distilbert': 'EW-LS',
        # value-weighted portfolios
        'cum_VL_return_distilbert': 'VW-L',
        'cum_VS_return_distilbert': 'VW-S',
        'cum_VLS_return_distilbert': 'VW-LS',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, strategy) so Plotly can colour lines by strategy
distilbert_long = distilbert_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy
fig = px.line(
    distilbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilBERT Portfolio Fine-Tuned',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range with the same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# Line colour and dash pattern for each strategy trace
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}


def _style_trace(trace):
    """Apply the colour/dash from strategy_styles; traces not listed there are left untouched."""
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **DistilRoBERTa Pre-Trained**

In [None]:
# Swap the '_distilroberta'-suffixed column names for the short strategy labels
# that the following cell melts and plots.
distilroberta_portfolio_pretrained.rename(
    columns={
        'date_distilroberta': 'date',
        'cum_market_return_distilroberta': 'Market',
        # equal-weighted portfolios
        'cum_EL_return_distilroberta': 'EW-L',
        'cum_ES_return_distilroberta': 'EW-S',
        'cum_ELS_return_distilroberta': 'EW-LS',
        # value-weighted portfolios
        'cum_VL_return_distilroberta': 'VW-L',
        'cum_VS_return_distilroberta': 'VW-S',
        'cum_VLS_return_distilroberta': 'VW-LS',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, strategy) so Plotly can colour lines by strategy
distilroberta_long = distilroberta_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy
fig = px.line(
    distilroberta_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilRoBERTa Portfolio Pre-Trained',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range with the same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# Line colour and dash pattern for each strategy trace
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}


def _style_trace(trace):
    """Apply the colour/dash from strategy_styles; traces not listed there are left untouched."""
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **DistilRoBERTa Fine-Tuned**

In [None]:
# Swap the '_distilroberta'-suffixed column names for the short strategy labels
# that the following cell melts and plots.
distilroberta_portfolio_finetuned.rename(
    columns={
        'date_distilroberta': 'date',
        'cum_market_return_distilroberta': 'Market',
        # equal-weighted portfolios
        'cum_EL_return_distilroberta': 'EW-L',
        'cum_ES_return_distilroberta': 'EW-S',
        'cum_ELS_return_distilroberta': 'EW-LS',
        # value-weighted portfolios
        'cum_VL_return_distilroberta': 'VW-L',
        'cum_VS_return_distilroberta': 'VW-S',
        'cum_VLS_return_distilroberta': 'VW-LS',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, strategy) so Plotly can colour lines by strategy
distilroberta_long = distilroberta_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy
fig = px.line(
    distilroberta_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilRoBERTa Portfolio Fine-Tuned',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range with the same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# Line colour and dash pattern for each strategy trace
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}


def _style_trace(trace):
    """Apply the colour/dash from strategy_styles; traces not listed there are left untouched."""
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **FinBERT Pre-Trained**

In [None]:
# Swap the '_finbert'-suffixed column names for the short strategy labels
# that the following cell melts and plots.
finbert_portfolio_pretrained.rename(
    columns={
        'date_finbert': 'date',
        'cum_market_return_finbert': 'Market',
        # equal-weighted portfolios
        'cum_EL_return_finbert': 'EW-L',
        'cum_ES_return_finbert': 'EW-S',
        'cum_ELS_return_finbert': 'EW-LS',
        # value-weighted portfolios
        'cum_VL_return_finbert': 'VW-L',
        'cum_VS_return_finbert': 'VW-S',
        'cum_VLS_return_finbert': 'VW-LS',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, strategy) so Plotly can colour lines by strategy
finbert_long = finbert_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy
fig = px.line(
    finbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='FinBERT Portfolio Pre-Trained',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range with the same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# Line colour and dash pattern for each strategy trace
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}


def _style_trace(trace):
    """Apply the colour/dash from strategy_styles; traces not listed there are left untouched."""
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **FinBERT Fine-Tuned**

In [None]:
# Swap the '_finbert'-suffixed column names for the short strategy labels
# that the following cell melts and plots.
finbert_portfolio_finetuned.rename(
    columns={
        'date_finbert': 'date',
        'cum_market_return_finbert': 'Market',
        # equal-weighted portfolios
        'cum_EL_return_finbert': 'EW-L',
        'cum_ES_return_finbert': 'EW-S',
        'cum_ELS_return_finbert': 'EW-LS',
        # value-weighted portfolios
        'cum_VL_return_finbert': 'VW-L',
        'cum_VS_return_finbert': 'VW-S',
        'cum_VLS_return_finbert': 'VW-LS',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, strategy) so Plotly can colour lines by strategy
finbert_long = finbert_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy
fig = px.line(
    finbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='FinBERT Portfolio Fine-Tuned',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range with the same font sizes as the x-axis
fig.update_yaxes(
    title_text='Cum. Log Return',
    range=[-1.6, 1.5],
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# Line colour and dash pattern for each strategy trace
strategy_styles = {
    'Market': dict(color='orange', dash='dot'),
    'EW-L': dict(color='midnightblue', dash='solid'),
    'EW-S': dict(color='midnightblue', dash='longdash'),
    'EW-LS': dict(color='midnightblue', dash='dashdot'),
    'VW-L': dict(color='skyblue', dash='solid'),
    'VW-S': dict(color='skyblue', dash='longdashdot'),
    'VW-LS': dict(color='skyblue', dash='dashdot'),
}


def _style_trace(trace):
    """Apply the colour/dash from strategy_styles; traces not listed there are left untouched."""
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

# **PORTFOLIO ANALYSIS WITH TRANSACTION COST**

## **Pre-Trained**

In [None]:
# Load the pre-trained portfolio results that include transaction costs.
# All five CSVs live in the same Drive folder, so the folder is named once.
pretrained_transcost_dir = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Pre Trained'

bert_portfolio_pretrained = pd.read_csv(f'{pretrained_transcost_dir}/bert_portfolio_pretrained_transcost.csv')
roberta_portfolio_pretrained = pd.read_csv(f'{pretrained_transcost_dir}/roberta_portfolio_pretrained_transcost.csv')
distilbert_portfolio_pretrained = pd.read_csv(f'{pretrained_transcost_dir}/distilbert_portfolio_pretrained_transcost.csv')
distilroberta_portfolio_pretrained = pd.read_csv(f'{pretrained_transcost_dir}/distilroberta_portfolio_pretrained_transcost.csv')
finbert_portfolio_pretrained = pd.read_csv(f'{pretrained_transcost_dir}/finbert_portfolio_pretrained_transcost.csv')

In [None]:
# Parse the 'date' column of every pre-trained portfolio frame into datetimes (in place)
for frame in (
    bert_portfolio_pretrained,
    roberta_portfolio_pretrained,
    distilbert_portfolio_pretrained,
    distilroberta_portfolio_pretrained,
    finbert_portfolio_pretrained,
):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Append a model-specific suffix to every column name (including 'date'),
# so each model's columns stay distinguishable once the frames are
# concatenated side by side in the strategy cells below.
bert_portfolio_pretrained = bert_portfolio_pretrained.add_suffix('_bert')
roberta_portfolio_pretrained = roberta_portfolio_pretrained.add_suffix('_roberta')
distilbert_portfolio_pretrained = distilbert_portfolio_pretrained.add_suffix('_distilbert')
distilroberta_portfolio_pretrained = distilroberta_portfolio_pretrained.add_suffix('_distilroberta')
finbert_portfolio_pretrained = finbert_portfolio_pretrained.add_suffix('_finbert')

### **EW L-S**

In [None]:
# Side-by-side table of the equal-weighted long-short series: the BERT frame
# supplies the date and market columns, each other model adds its own series.
# NOTE(review): column-wise concat aligns on the row index, so this assumes all
# five frames share the same row order / dates — confirm.
ew_ls = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_ELS_return_bert']],
        roberta_portfolio_pretrained['cum_ELS_return_roberta'],
        distilbert_portfolio_pretrained['cum_ELS_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_ELS_return_distilroberta'],
        finbert_portfolio_pretrained['cum_ELS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Relabel the columns with the model names shown in the plot legend
ew_ls.rename(
    columns={
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_ELS_return_bert': 'BERT',
        'cum_ELS_return_roberta': 'RoBERTa',
        'cum_ELS_return_distilbert': 'DistilBERT',
        'cum_ELS_return_distilroberta': 'DistilRoBERTa',
        'cum_ELS_return_finbert': 'FinBERT',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, model) so Plotly can colour lines by model
df_long = ew_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model (plus the market benchmark)
fig = px.line(df_long, x='date', y='log_return', color='model',
              title='Equal Weighted Long-Short Strategy', height=800, width=900)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range, default fonts
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1.2])

# Dash pattern per trace: models solid, market benchmark dotted
line_styles = {
    **dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid'),
    'Market': 'dot',
}


def _style_trace(trace):
    """Set the dash pattern; draw 'Market' in black and keep every other trace's colour."""
    if trace.name not in line_styles:
        return
    colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=colour))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **VW L-S**

In [None]:
# Side-by-side table of the value-weighted long-short series: the BERT frame
# supplies the date and market columns, each other model adds its own series.
# NOTE(review): column-wise concat aligns on the row index, so this assumes all
# five frames share the same row order / dates — confirm.
vw_ls = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_VLS_return_bert']],
        roberta_portfolio_pretrained['cum_VLS_return_roberta'],
        distilbert_portfolio_pretrained['cum_VLS_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_VLS_return_distilroberta'],
        finbert_portfolio_pretrained['cum_VLS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Relabel the columns with the model names shown in the plot legend
vw_ls.rename(
    columns={
        'date_bert': 'date',
        'cum_VLS_return_bert': 'BERT',
        'cum_VLS_return_distilbert': 'DistilBERT',
        'cum_VLS_return_distilroberta': 'DistilRoBERTa',
        'cum_VLS_return_finbert': 'FinBERT',
        'cum_VLS_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    inplace=True,
)

In [None]:
# Reshape the value-weighted long-short table from wide to long format
# (one row per date/model pair) so Plotly can colour the lines by model.
df_long = pd.melt(vw_ls, id_vars=['date'], value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
                  var_name='model', value_name='log_return')

# Create the line plot.
# The title previously read 'Value Weighted Long Strategy' (copied from the
# VW L cell); this figure plots vw_ls, i.e. the long-short series.
fig = px.line(df_long,
              x='date',
              y='log_return',
              color='model',
              title='Value Weighted Long-Short Strategy',
              height=800,
              width=900)

# Customize the x-axis to display year-only date labels
fig.update_xaxes(title_text='Date', tickformat='%Y',
                 title_font=dict(size=18),  # Font size for the x-axis title
                 tickfont=dict(size=14))    # Font size for the x-axis ticks

# Customize the y-axis (fixed range)
fig.update_yaxes(title_text='Cum. Log Return',
                 range=[-2, 1])

# Dash pattern per trace: models solid, market benchmark dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot'
}

# Apply the line styles and set 'Market' color to black; every other trace
# keeps the colour Plotly assigned to it.
fig.for_each_trace(lambda trace: trace.update(
    line=dict(dash=line_styles[trace.name], color='black' if trace.name == 'Market' else trace.line.color)
) if trace.name in line_styles else ())

# Show the plot
fig.show()

### **EW L**

In [None]:
# Side-by-side table of the equal-weighted long series: the BERT frame
# supplies the date and market columns, each other model adds its own series.
# NOTE(review): column-wise concat aligns on the row index, so this assumes all
# five frames share the same row order / dates — confirm.
ew_l = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_EL_return_bert']],
        roberta_portfolio_pretrained['cum_EL_return_roberta'],
        distilbert_portfolio_pretrained['cum_EL_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_EL_return_distilroberta'],
        finbert_portfolio_pretrained['cum_EL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Relabel the columns with the model names shown in the plot legend
ew_l.rename(
    columns={
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_EL_return_bert': 'BERT',
        'cum_EL_return_roberta': 'RoBERTa',
        'cum_EL_return_distilbert': 'DistilBERT',
        'cum_EL_return_distilroberta': 'DistilRoBERTa',
        'cum_EL_return_finbert': 'FinBERT',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, model) so Plotly can colour lines by model
df_long = ew_l.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model (plus the market benchmark)
fig = px.line(df_long, x='date', y='log_return', color='model',
              title='Equal Weighted Long Strategy', height=800, width=900)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range, default fonts
fig.update_yaxes(title_text='Cum. Log Return', range=[-.5, 1.5])

# Dash pattern per trace: models solid, market benchmark dotted
line_styles = {
    **dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid'),
    'Market': 'dot',
}


def _style_trace(trace):
    """Set the dash pattern; draw 'Market' in black and keep every other trace's colour."""
    if trace.name not in line_styles:
        return
    colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=colour))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **VW L**

In [None]:
# Side-by-side table of the value-weighted long series: the BERT frame
# supplies the date and market columns, each other model adds its own series.
# NOTE(review): column-wise concat aligns on the row index, so this assumes all
# five frames share the same row order / dates — confirm.
vw_l = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_VL_return_bert']],
        roberta_portfolio_pretrained['cum_VL_return_roberta'],
        distilbert_portfolio_pretrained['cum_VL_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_VL_return_distilroberta'],
        finbert_portfolio_pretrained['cum_VL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Relabel the columns with the model names shown in the plot legend
vw_l.rename(
    columns={
        'date_bert': 'date',
        'cum_VL_return_bert': 'BERT',
        'cum_VL_return_distilbert': 'DistilBERT',
        'cum_VL_return_distilroberta': 'DistilRoBERTa',
        'cum_VL_return_finbert': 'FinBERT',
        'cum_VL_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, model) so Plotly can colour lines by model
df_long = vw_l.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model (plus the market benchmark)
fig = px.line(df_long, x='date', y='log_return', color='model',
              title='Value Weighted Long Strategy', height=800, width=900)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range, default fonts
fig.update_yaxes(title_text='Cum. Log Return', range=[-.5, 1.5])

# Dash pattern per trace: models solid, market benchmark dotted
line_styles = {
    **dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid'),
    'Market': 'dot',
}


def _style_trace(trace):
    """Set the dash pattern; draw 'Market' in black and keep every other trace's colour."""
    if trace.name not in line_styles:
        return
    colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=colour))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **EW S**

In [None]:
# Side-by-side table of the equal-weighted short series: the BERT frame
# supplies the date and market columns, each other model adds its own series.
# NOTE(review): column-wise concat aligns on the row index, so this assumes all
# five frames share the same row order / dates — confirm.
ew_s = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_ES_return_bert']],
        roberta_portfolio_pretrained['cum_ES_return_roberta'],
        distilbert_portfolio_pretrained['cum_ES_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_ES_return_distilroberta'],
        finbert_portfolio_pretrained['cum_ES_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Relabel the columns with the model names shown in the plot legend
ew_s.rename(
    columns={
        'date_bert': 'date',
        'cum_market_return_bert': 'Market',
        'cum_ES_return_bert': 'BERT',
        'cum_ES_return_roberta': 'RoBERTa',
        'cum_ES_return_distilbert': 'DistilBERT',
        'cum_ES_return_distilroberta': 'DistilRoBERTa',
        'cum_ES_return_finbert': 'FinBERT',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, model) so Plotly can colour lines by model
df_long = ew_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model (plus the market benchmark)
fig = px.line(df_long, x='date', y='log_return', color='model',
              title='Equal Weighted Short Strategy', height=800, width=900)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range, default fonts
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1])

# Dash pattern per trace: models solid, market benchmark dotted
line_styles = {
    **dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid'),
    'Market': 'dot',
}


def _style_trace(trace):
    """Set the dash pattern; draw 'Market' in black and keep every other trace's colour."""
    if trace.name not in line_styles:
        return
    colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=colour))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

### **VW S**

In [None]:
# Side-by-side table of the value-weighted short series: the BERT frame
# supplies the date and market columns, each other model adds its own series.
# NOTE(review): column-wise concat aligns on the row index, so this assumes all
# five frames share the same row order / dates — confirm.
vw_s = pd.concat(
    [
        bert_portfolio_pretrained[['date_bert', 'cum_market_return_bert', 'cum_VS_return_bert']],
        roberta_portfolio_pretrained['cum_VS_return_roberta'],
        distilbert_portfolio_pretrained['cum_VS_return_distilbert'],
        distilroberta_portfolio_pretrained['cum_VS_return_distilroberta'],
        finbert_portfolio_pretrained['cum_VS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Relabel the columns with the model names shown in the plot legend
vw_s.rename(
    columns={
        'date_bert': 'date',
        'cum_VS_return_bert': 'BERT',
        'cum_VS_return_distilbert': 'DistilBERT',
        'cum_VS_return_distilroberta': 'DistilRoBERTa',
        'cum_VS_return_finbert': 'FinBERT',
        'cum_VS_return_roberta': 'RoBERTa',
        'cum_market_return_bert': 'Market',
    },
    inplace=True,
)

In [None]:
# Wide -> long: one row per (date, model) so Plotly can colour lines by model
df_long = vw_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model (plus the market benchmark)
fig = px.line(df_long, x='date', y='log_return', color='model',
              title='Value Weighted Short Strategy', height=800, width=900)

# x-axis: year-only tick labels, title size 18, tick size 14
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed range, default fonts
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1])

# Dash pattern per trace: models solid, market benchmark dotted
line_styles = {
    **dict.fromkeys(['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa'], 'solid'),
    'Market': 'dot',
}


def _style_trace(trace):
    """Set the dash pattern; draw 'Market' in black and keep every other trace's colour."""
    if trace.name not in line_styles:
        return
    colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=colour))


fig.for_each_trace(_style_trace)

# Render the figure
fig.show()

## **Fine-Tuned**

In [None]:
# Load the fine-tuned portfolio results that include transaction costs.
# All five CSVs live in the same Drive folder, so the folder is named once.
finetuned_transcost_dir = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Portfolio Analysis - Fine Tune'

bert_portfolio_finetuned = pd.read_csv(f'{finetuned_transcost_dir}/bert_portfolio_transcost.csv')
roberta_portfolio_finetuned = pd.read_csv(f'{finetuned_transcost_dir}/roberta_portfolio_transcost.csv')
distilbert_portfolio_finetuned = pd.read_csv(f'{finetuned_transcost_dir}/distilbert_portfolio_transcost.csv')
distilroberta_portfolio_finetuned = pd.read_csv(f'{finetuned_transcost_dir}/distilroberta_portfolio_transcost.csv')
finbert_portfolio_finetuned = pd.read_csv(f'{finetuned_transcost_dir}/finbert_portfolio_transcost.csv')

In [None]:
# Parse the 'date' column of every fine-tuned portfolio frame into datetimes (in place)
for frame in (
    bert_portfolio_finetuned,
    roberta_portfolio_finetuned,
    distilbert_portfolio_finetuned,
    distilroberta_portfolio_finetuned,
    finbert_portfolio_finetuned,
):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Append a model-specific suffix to every column name (including 'date'),
# so each model's columns stay distinguishable once the frames are
# concatenated side by side in the strategy cells below.
bert_portfolio_finetuned = bert_portfolio_finetuned.add_suffix('_bert')
roberta_portfolio_finetuned = roberta_portfolio_finetuned.add_suffix('_roberta')
distilbert_portfolio_finetuned = distilbert_portfolio_finetuned.add_suffix('_distilbert')
distilroberta_portfolio_finetuned = distilroberta_portfolio_finetuned.add_suffix('_distilroberta')
finbert_portfolio_finetuned = finbert_portfolio_finetuned.add_suffix('_finbert')

### **EW L-S**

In [None]:
# Side-by-side table for the equal-weighted long-short comparison: the date and
# market columns come from the BERT frame, plus one cum_ELS_return column per model.
# NOTE(review): concat along columns aligns rows on the index, not on the date;
# this assumes all five frames hold the same dates in the same order - confirm.
ew_ls = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_ELS_return_bert']],
        roberta_portfolio_finetuned['cum_ELS_return_roberta'],
        distilbert_portfolio_finetuned['cum_ELS_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_ELS_return_distilroberta'],
        finbert_portfolio_finetuned['cum_ELS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_ls_labels = {
    'date_bert': 'date',
    'cum_market_return_bert': 'Market',
    'cum_ELS_return_bert': 'BERT',
    'cum_ELS_return_roberta': 'RoBERTa',
    'cum_ELS_return_distilbert': 'DistilBERT',
    'cum_ELS_return_distilroberta': 'DistilRoBERTa',
    'cum_ELS_return_finbert': 'FinBERT',
}
ew_ls.rename(columns=ew_ls_labels, inplace=True)

In [None]:
# Wide -> long: one row per (date, model) pair.
# The plotting cell that follows rebuilds df_long from ew_ls in the same way.
df_long = ew_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

In [None]:
# Long format: one row per (date, model) pair so Plotly can colour by model
df_long = ew_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model, drawn over time
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Long-Short Strategy',
    width=900,
    height=800,
)

# x-axis: year-only tick labels, with explicit title and tick font sizes
fig.update_xaxes(
    title_text='Date',
    title_font=dict(size=18),
    tickformat='%Y',
    tickfont=dict(size=14),
)

# y-axis: fixed, hand-picked range
fig.update_yaxes(title_text='Cum. Log Return', range=[-1.5, 1])

# Dash pattern per trace name: the models are solid, the 'Market' series is dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot',
}

# Restyle only the traces listed above; 'Market' is additionally forced to black,
# every other trace keeps the colour it already has
for trace in fig.data:
    if trace.name not in line_styles:
        continue
    line_colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=line_colour))

fig.show()

### **VW L-S**

In [None]:
# Side-by-side table for the value-weighted long-short comparison: the date and
# market columns come from the BERT frame, plus one cum_VLS_return column per model.
# NOTE(review): concat along columns aligns rows on the index, not on the date;
# this assumes all five frames hold the same dates in the same order - confirm.
vw_ls = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_VLS_return_bert']],
        roberta_portfolio_finetuned['cum_VLS_return_roberta'],
        distilbert_portfolio_finetuned['cum_VLS_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_VLS_return_distilroberta'],
        finbert_portfolio_finetuned['cum_VLS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_ls_labels = {
    'date_bert': 'date',
    'cum_VLS_return_bert': 'BERT',
    'cum_VLS_return_distilbert': 'DistilBERT',
    'cum_VLS_return_distilroberta': 'DistilRoBERTa',
    'cum_VLS_return_finbert': 'FinBERT',
    'cum_VLS_return_roberta': 'RoBERTa',
    'cum_market_return_bert': 'Market',
}
vw_ls.rename(columns=vw_ls_labels, inplace=True)

In [None]:
# Long format: one row per (date, model) pair so Plotly can colour by model
df_long = vw_ls.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model, drawn over time
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Long-Short Strategy',
    width=900,
    height=800,
)

# x-axis: year-only tick labels, with explicit title and tick font sizes
fig.update_xaxes(
    title_text='Date',
    title_font=dict(size=18),
    tickformat='%Y',
    tickfont=dict(size=14),
)

# y-axis: fixed, hand-picked range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1])

# Dash pattern per trace name: the models are solid, the 'Market' series is dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot',
}

# Restyle only the traces listed above; 'Market' is additionally forced to black,
# every other trace keeps the colour it already has
for trace in fig.data:
    if trace.name not in line_styles:
        continue
    line_colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=line_colour))

fig.show()

### **EW L**

In [None]:
# Side-by-side table for the equal-weighted long comparison: the date and
# market columns come from the BERT frame, plus one cum_EL_return column per model.
# NOTE(review): concat along columns aligns rows on the index, not on the date;
# this assumes all five frames hold the same dates in the same order - confirm.
ew_l = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_EL_return_bert']],
        roberta_portfolio_finetuned['cum_EL_return_roberta'],
        distilbert_portfolio_finetuned['cum_EL_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_EL_return_distilroberta'],
        finbert_portfolio_finetuned['cum_EL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_l_labels = {
    'date_bert': 'date',
    'cum_market_return_bert': 'Market',
    'cum_EL_return_bert': 'BERT',
    'cum_EL_return_roberta': 'RoBERTa',
    'cum_EL_return_distilbert': 'DistilBERT',
    'cum_EL_return_distilroberta': 'DistilRoBERTa',
    'cum_EL_return_finbert': 'FinBERT',
}
ew_l.rename(columns=ew_l_labels, inplace=True)

In [None]:
# Long format: one row per (date, model) pair so Plotly can colour by model
df_long = pd.melt(
    ew_l,
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model, drawn over time.
# The title now ends in "Strategy" like the other comparison plots in this section.
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Long Strategy',
    height=800,
    width=900,
)

# x-axis: year-only tick labels, with explicit title and tick font sizes
fig.update_xaxes(
    title_text='Date',
    tickformat='%Y',
    title_font=dict(size=18),
    tickfont=dict(size=14),
)

# y-axis: fixed, hand-picked range
fig.update_yaxes(title_text='Cum. Log Return', range=[-.5, 1.5])

# Dash pattern per trace name: the models are solid, the 'Market' series is dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot',
}

# Restyle only the traces listed above; 'Market' is additionally forced to black,
# every other trace keeps the colour it already has
for trace in fig.data:
    if trace.name in line_styles:
        line_colour = 'black' if trace.name == 'Market' else trace.line.color
        trace.update(line=dict(dash=line_styles[trace.name], color=line_colour))

# Show the plot
fig.show()

### **VW L**

In [None]:
# Side-by-side table for the value-weighted long comparison: the date and
# market columns come from the BERT frame, plus one cum_VL_return column per model.
# NOTE(review): concat along columns aligns rows on the index, not on the date;
# this assumes all five frames hold the same dates in the same order - confirm.
vw_l = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_VL_return_bert']],
        roberta_portfolio_finetuned['cum_VL_return_roberta'],
        distilbert_portfolio_finetuned['cum_VL_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_VL_return_distilroberta'],
        finbert_portfolio_finetuned['cum_VL_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_l_labels = {
    'date_bert': 'date',
    'cum_VL_return_bert': 'BERT',
    'cum_VL_return_distilbert': 'DistilBERT',
    'cum_VL_return_distilroberta': 'DistilRoBERTa',
    'cum_VL_return_finbert': 'FinBERT',
    'cum_VL_return_roberta': 'RoBERTa',
    'cum_market_return_bert': 'Market',
}
vw_l.rename(columns=vw_l_labels, inplace=True)

In [None]:
# Long format: one row per (date, model) pair so Plotly can colour by model
df_long = vw_l.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model, drawn over time
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Long Strategy',
    width=900,
    height=800,
)

# x-axis: year-only tick labels, with explicit title and tick font sizes
fig.update_xaxes(
    title_text='Date',
    title_font=dict(size=18),
    tickformat='%Y',
    tickfont=dict(size=14),
)

# y-axis: fixed, hand-picked range
fig.update_yaxes(title_text='Cum. Log Return', range=[-0.5, 1.5])

# Dash pattern per trace name: the models are solid, the 'Market' series is dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot',
}

# Restyle only the traces listed above; 'Market' is additionally forced to black,
# every other trace keeps the colour it already has
for trace in fig.data:
    if trace.name not in line_styles:
        continue
    line_colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=line_colour))

fig.show()

### **EW S**

In [None]:
# Side-by-side table for the equal-weighted short comparison: the date and
# market columns come from the BERT frame, plus one cum_ES_return column per model.
# NOTE(review): concat along columns aligns rows on the index, not on the date;
# this assumes all five frames hold the same dates in the same order - confirm.
ew_s = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_ES_return_bert']],
        roberta_portfolio_finetuned['cum_ES_return_roberta'],
        distilbert_portfolio_finetuned['cum_ES_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_ES_return_distilroberta'],
        finbert_portfolio_finetuned['cum_ES_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
ew_s_labels = {
    'date_bert': 'date',
    'cum_market_return_bert': 'Market',
    'cum_ES_return_bert': 'BERT',
    'cum_ES_return_roberta': 'RoBERTa',
    'cum_ES_return_distilbert': 'DistilBERT',
    'cum_ES_return_distilroberta': 'DistilRoBERTa',
    'cum_ES_return_finbert': 'FinBERT',
}
ew_s.rename(columns=ew_s_labels, inplace=True)

In [None]:
# Long format: one row per (date, model) pair so Plotly can colour by model
df_long = ew_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model, drawn over time
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Equal Weighted Short Strategy',
    width=900,
    height=800,
)

# x-axis: year-only tick labels, with explicit title and tick font sizes
fig.update_xaxes(
    title_text='Date',
    title_font=dict(size=18),
    tickformat='%Y',
    tickfont=dict(size=14),
)

# y-axis: fixed, hand-picked range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1])

# Dash pattern per trace name: the models are solid, the 'Market' series is dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot',
}

# Restyle only the traces listed above; 'Market' is additionally forced to black,
# every other trace keeps the colour it already has
for trace in fig.data:
    if trace.name not in line_styles:
        continue
    line_colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=line_colour))

fig.show()

### **VW S**

In [None]:
# Side-by-side table for the value-weighted short comparison: the date and
# market columns come from the BERT frame, plus one cum_VS_return column per model.
# NOTE(review): concat along columns aligns rows on the index, not on the date;
# this assumes all five frames hold the same dates in the same order - confirm.
vw_s = pd.concat(
    [
        bert_portfolio_finetuned[['date_bert', 'cum_market_return_bert', 'cum_VS_return_bert']],
        roberta_portfolio_finetuned['cum_VS_return_roberta'],
        distilbert_portfolio_finetuned['cum_VS_return_distilbert'],
        distilroberta_portfolio_finetuned['cum_VS_return_distilroberta'],
        finbert_portfolio_finetuned['cum_VS_return_finbert'],
    ],
    axis='columns',
)

In [None]:
# Swap the suffixed source column names for the labels shown in the plot legend.
vw_s_labels = {
    'date_bert': 'date',
    'cum_VS_return_bert': 'BERT',
    'cum_VS_return_distilbert': 'DistilBERT',
    'cum_VS_return_distilroberta': 'DistilRoBERTa',
    'cum_VS_return_finbert': 'FinBERT',
    'cum_VS_return_roberta': 'RoBERTa',
    'cum_market_return_bert': 'Market',
}
vw_s.rename(columns=vw_s_labels, inplace=True)

In [None]:
# Long format: one row per (date, model) pair so Plotly can colour by model
df_long = vw_s.melt(
    id_vars=['date'],
    value_vars=['BERT', 'DistilBERT', 'DistilRoBERTa', 'FinBERT', 'RoBERTa', 'Market'],
    var_name='model',
    value_name='log_return',
)

# One line per model, drawn over time
fig = px.line(
    df_long,
    x='date',
    y='log_return',
    color='model',
    title='Value Weighted Short Strategy',
    width=900,
    height=800,
)

# x-axis: year-only tick labels, with explicit title and tick font sizes
fig.update_xaxes(
    title_text='Date',
    title_font=dict(size=18),
    tickformat='%Y',
    tickfont=dict(size=14),
)

# y-axis: fixed, hand-picked range
fig.update_yaxes(title_text='Cum. Log Return', range=[-2, 1])

# Dash pattern per trace name: the models are solid, the 'Market' series is dotted
line_styles = {
    'BERT': 'solid',
    'DistilBERT': 'solid',
    'DistilRoBERTa': 'solid',
    'FinBERT': 'solid',
    'RoBERTa': 'solid',
    'Market': 'dot',
}

# Restyle only the traces listed above; 'Market' is additionally forced to black,
# every other trace keeps the colour it already has
for trace in fig.data:
    if trace.name not in line_styles:
        continue
    line_colour = 'black' if trace.name == 'Market' else trace.line.color
    trace.update(line=dict(dash=line_styles[trace.name], color=line_colour))

fig.show()

## **By Model**

### **BERT Pre-Trained**

In [None]:
# Replace the '_bert'-suffixed column names with short strategy labels for plotting.
# NOTE(review): the rename is in place; any earlier cell that selects the suffixed
# names from this frame would need the data reloaded before it can be re-run.
bert_pretrained_labels = {
    'date_bert': 'date',
    'cum_market_return_bert': 'Market',
    'cum_EL_return_bert': 'EW-L',
    'cum_ES_return_bert': 'EW-S',
    'cum_ELS_return_bert': 'EW-LS',
    'cum_VL_return_bert': 'VW-L',
    'cum_VS_return_bert': 'VW-S',
    'cum_VLS_return_bert': 'VW-LS',
}
bert_portfolio_pretrained.rename(columns=bert_pretrained_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
bert_long = bert_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    bert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='BERT Portfolio Pre-Trained',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **BERT Fine-Tuned**

In [None]:
# Replace the '_bert'-suffixed column names with short strategy labels for plotting.
# The rename is in place: afterwards the suffixed names that the model-comparison
# cells select from this frame no longer exist, so those cells cannot be re-run
# without reloading the data.
bert_finetuned_labels = {
    'date_bert': 'date',
    'cum_market_return_bert': 'Market',
    'cum_EL_return_bert': 'EW-L',
    'cum_ES_return_bert': 'EW-S',
    'cum_ELS_return_bert': 'EW-LS',
    'cum_VL_return_bert': 'VW-L',
    'cum_VS_return_bert': 'VW-S',
    'cum_VLS_return_bert': 'VW-LS',
}
bert_portfolio_finetuned.rename(columns=bert_finetuned_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
bert_long = bert_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    bert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='BERT Portfolio Fine-Tuned',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **RoBERTa Pre-Trained**

In [None]:
# Replace the '_roberta'-suffixed column names with short strategy labels for plotting.
# NOTE(review): the rename is in place; any earlier cell that selects the suffixed
# names from this frame would need the data reloaded before it can be re-run.
roberta_pretrained_labels = {
    'date_roberta': 'date',
    'cum_market_return_roberta': 'Market',
    'cum_EL_return_roberta': 'EW-L',
    'cum_ES_return_roberta': 'EW-S',
    'cum_ELS_return_roberta': 'EW-LS',
    'cum_VL_return_roberta': 'VW-L',
    'cum_VS_return_roberta': 'VW-S',
    'cum_VLS_return_roberta': 'VW-LS',
}
roberta_portfolio_pretrained.rename(columns=roberta_pretrained_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
roberta_long = roberta_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    roberta_long,
    x='date',
    y='log_return',
    color='strategy',
    title='RoBERTa Portfolio Pre-Trained',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **RoBERTa Fine-Tuned**

In [None]:
# Replace the '_roberta'-suffixed column names with short strategy labels for plotting.
# The rename is in place: afterwards the suffixed names that the model-comparison
# cells select from this frame no longer exist, so those cells cannot be re-run
# without reloading the data.
roberta_finetuned_labels = {
    'date_roberta': 'date',
    'cum_market_return_roberta': 'Market',
    'cum_EL_return_roberta': 'EW-L',
    'cum_ES_return_roberta': 'EW-S',
    'cum_ELS_return_roberta': 'EW-LS',
    'cum_VL_return_roberta': 'VW-L',
    'cum_VS_return_roberta': 'VW-S',
    'cum_VLS_return_roberta': 'VW-LS',
}
roberta_portfolio_finetuned.rename(columns=roberta_finetuned_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
roberta_long = roberta_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    roberta_long,
    x='date',
    y='log_return',
    color='strategy',
    title='RoBERTa Portfolio Fine-Tuned',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **DistilBERT Pre-Trained**

In [None]:
# Replace the '_distilbert'-suffixed column names with short strategy labels for plotting.
# NOTE(review): the rename is in place; any earlier cell that selects the suffixed
# names from this frame would need the data reloaded before it can be re-run.
distilbert_pretrained_labels = {
    'date_distilbert': 'date',
    'cum_market_return_distilbert': 'Market',
    'cum_EL_return_distilbert': 'EW-L',
    'cum_ES_return_distilbert': 'EW-S',
    'cum_ELS_return_distilbert': 'EW-LS',
    'cum_VL_return_distilbert': 'VW-L',
    'cum_VS_return_distilbert': 'VW-S',
    'cum_VLS_return_distilbert': 'VW-LS',
}
distilbert_portfolio_pretrained.rename(columns=distilbert_pretrained_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
distilbert_long = distilbert_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    distilbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilBERT Portfolio Pre-Trained',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **DistilBERT Fine-Tuned**

In [None]:
# Replace the '_distilbert'-suffixed column names with short strategy labels for plotting.
# The rename is in place: afterwards the suffixed names that the model-comparison
# cells select from this frame no longer exist, so those cells cannot be re-run
# without reloading the data.
distilbert_finetuned_labels = {
    'date_distilbert': 'date',
    'cum_market_return_distilbert': 'Market',
    'cum_EL_return_distilbert': 'EW-L',
    'cum_ES_return_distilbert': 'EW-S',
    'cum_ELS_return_distilbert': 'EW-LS',
    'cum_VL_return_distilbert': 'VW-L',
    'cum_VS_return_distilbert': 'VW-S',
    'cum_VLS_return_distilbert': 'VW-LS',
}
distilbert_portfolio_finetuned.rename(columns=distilbert_finetuned_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
distilbert_long = distilbert_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    distilbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilBERT Portfolio Fine-Tuned',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **DistilRoBERTa Pre-Trained**

In [None]:
# Replace the '_distilroberta'-suffixed column names with short strategy labels for plotting.
# NOTE(review): the rename is in place; any earlier cell that selects the suffixed
# names from this frame would need the data reloaded before it can be re-run.
distilroberta_pretrained_labels = {
    'date_distilroberta': 'date',
    'cum_market_return_distilroberta': 'Market',
    'cum_EL_return_distilroberta': 'EW-L',
    'cum_ES_return_distilroberta': 'EW-S',
    'cum_ELS_return_distilroberta': 'EW-LS',
    'cum_VL_return_distilroberta': 'VW-L',
    'cum_VS_return_distilroberta': 'VW-S',
    'cum_VLS_return_distilroberta': 'VW-LS',
}
distilroberta_portfolio_pretrained.rename(columns=distilroberta_pretrained_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
distilroberta_long = distilroberta_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    distilroberta_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilRoBERTa Portfolio Pre-Trained',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **DistilRoBERTa Fine-Tuned**

In [None]:
# Replace the '_distilroberta'-suffixed column names with short strategy labels for plotting.
# The rename is in place: afterwards the suffixed names that the model-comparison
# cells select from this frame no longer exist, so those cells cannot be re-run
# without reloading the data.
distilroberta_finetuned_labels = {
    'date_distilroberta': 'date',
    'cum_market_return_distilroberta': 'Market',
    'cum_EL_return_distilroberta': 'EW-L',
    'cum_ES_return_distilroberta': 'EW-S',
    'cum_ELS_return_distilroberta': 'EW-LS',
    'cum_VL_return_distilroberta': 'VW-L',
    'cum_VS_return_distilroberta': 'VW-S',
    'cum_VLS_return_distilroberta': 'VW-LS',
}
distilroberta_portfolio_finetuned.rename(columns=distilroberta_finetuned_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
distilroberta_long = distilroberta_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    distilroberta_long,
    x='date',
    y='log_return',
    color='strategy',
    title='DistilRoBERTa Portfolio Fine-Tuned',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **FinBERT Pre-Trained**

In [None]:
# Replace the '_finbert'-suffixed column names with short strategy labels for plotting.
# NOTE(review): the rename is in place; any earlier cell that selects the suffixed
# names from this frame would need the data reloaded before it can be re-run.
finbert_pretrained_labels = {
    'date_finbert': 'date',
    'cum_market_return_finbert': 'Market',
    'cum_EL_return_finbert': 'EW-L',
    'cum_ES_return_finbert': 'EW-S',
    'cum_ELS_return_finbert': 'EW-LS',
    'cum_VL_return_finbert': 'VW-L',
    'cum_VS_return_finbert': 'VW-S',
    'cum_VLS_return_finbert': 'VW-LS',
}
finbert_portfolio_pretrained.rename(columns=finbert_pretrained_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
finbert_long = finbert_portfolio_pretrained.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    finbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='FinBERT Portfolio Pre-Trained',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()

### **FinBERT Fine-Tuned**

In [None]:
# Replace the '_finbert'-suffixed column names with short strategy labels for plotting.
# The rename is in place: afterwards the suffixed names that the model-comparison
# cells select from this frame no longer exist, so those cells cannot be re-run
# without reloading the data.
finbert_finetuned_labels = {
    'date_finbert': 'date',
    'cum_market_return_finbert': 'Market',
    'cum_EL_return_finbert': 'EW-L',
    'cum_ES_return_finbert': 'EW-S',
    'cum_ELS_return_finbert': 'EW-LS',
    'cum_VL_return_finbert': 'VW-L',
    'cum_VS_return_finbert': 'VW-S',
    'cum_VLS_return_finbert': 'VW-LS',
}
finbert_portfolio_finetuned.rename(columns=finbert_finetuned_labels, inplace=True)

In [None]:
# Long format: one row per (date, strategy) pair so Plotly can colour by strategy
finbert_long = finbert_portfolio_finetuned.melt(
    id_vars=['date'],
    value_vars=['Market', 'EW-L', 'EW-S', 'EW-LS', 'VW-L', 'VW-S', 'VW-LS'],
    var_name='strategy',
    value_name='log_return',
)

# One line per strategy, drawn over time
fig = px.line(
    finbert_long,
    x='date',
    y='log_return',
    color='strategy',
    title='FinBERT Portfolio Fine-Tuned',
    width=800,
    height=600,
)

# x-axis: year-only tick labels with grid lines; y-axis: fixed, hand-picked range
fig.update_xaxes(title_text='Date', showgrid=True, tickformat='%Y')
fig.update_yaxes(title_text='Cum. Log Return', range=[-2.3, 1.5])

# Colour and dash pattern per trace name.
# NOTE(review): 'EW-S' uses 'longdash' while 'VW-S' uses 'longdashdot' - confirm
# the two short strategies are meant to differ in dash pattern.
strategy_styles = {
    'Market': {'color': 'orange', 'dash': 'dot'},
    'EW-L': {'color': 'midnightblue', 'dash': 'solid'},
    'EW-S': {'color': 'midnightblue', 'dash': 'longdash'},
    'EW-LS': {'color': 'midnightblue', 'dash': 'dashdot'},
    'VW-L': {'color': 'skyblue', 'dash': 'solid'},
    'VW-S': {'color': 'skyblue', 'dash': 'longdashdot'},
    'VW-LS': {'color': 'skyblue', 'dash': 'dashdot'},
}

# Restyle only the traces that have an entry above
for trace in fig.data:
    style = strategy_styles.get(trace.name)
    if style is not None:
        trace.update(line=dict(color=style['color'], dash=style['dash']))

fig.show()