# **IMPORT LIBRARIES**

In [None]:
# import libraries
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.model_selection import train_test_split, RandomizedSearchCV, PredefinedSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, accuracy_score

In [None]:
# mount google drive
# drive.mount attaches the notebook user's Google Drive at /content/drive;
# every data and result path used in this notebook lives under that mount point
from google.colab import drive
drive.mount('/content/drive')

# **DEFINE FUNCTIONS**

In [None]:
# split a dataframe into model inputs (embedding columns) and the target column
def prepare_data_for_logistic_regression(df):
    """Return the embedding feature columns and the label column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price_direction`` column. Every column whose name
        starts with ``embedding_`` is treated as a feature.

    Returns
    -------
    tuple
        ``(features, labels)``: a DataFrame holding only the ``embedding_*``
        columns, and the ``price_direction`` Series.

    Raises
    ------
    KeyError
        If ``df`` has no ``price_direction`` column.
    """
    # keep only the columns whose name begins with the embedding prefix
    embedding_columns = df.filter(regex='^embedding_')
    return embedding_columns, df['price_direction']

In [None]:
# custom F1 scorer with pos_label='positive'
f1_scorer = make_scorer(f1_score, pos_label='positive')

# define function to train with Randomized Search CV
def train_model_with_randomsearch(train_features, train_labels, eval_features, eval_labels,
                                  random_state=42):
    """Tune a logistic regression on a fixed train/validation split.

    The training and validation rows are stacked and passed to
    ``RandomizedSearchCV`` together with a ``PredefinedSplit`` so that every
    candidate is fitted on the training rows and scored (F1 on the
    ``'positive'`` class) on the validation rows only.

    Parameters
    ----------
    train_features, eval_features : array-like of shape (n_samples, n_features)
        Feature matrices for the training and validation periods.
    train_labels, eval_labels : array-like of shape (n_samples,)
        Matching label vectors.
    random_state : int or None, default 42
        Seed forwarded to ``RandomizedSearchCV``. The search samples only a
        subset of the 16 hyperparameter combinations, so without a seed the
        selected model differs from run to run. Pass ``None`` to restore
        unseeded sampling.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        ``best_estimator_`` of the search. With scikit-learn's default
        ``refit=True`` this estimator is refitted on all rows passed to
        ``fit``, i.e. the training and validation rows combined.
    """
    # candidate values sampled by the randomized search
    param_distributions = {
        'C': [0.5, 1, 5, 10],
        'max_iter': [10000, 15000, 20000, 25000],
    }

    # initiate model with lbfgs solver
    model = LogisticRegression(solver='lbfgs')

    # combine train and eval features and labels into single arrays
    combined_features = np.vstack((train_features, eval_features))
    combined_labels = np.hstack((train_labels, eval_labels))

    # PredefinedSplit convention: -1 = row is never used for testing (always
    # in the training part), 0 = row belongs to test fold 0 (the validation set)
    test_fold = np.hstack((
        np.full(train_features.shape[0], -1, dtype=int),
        np.full(eval_features.shape[0], 0, dtype=int)
    ))
    ps = PredefinedSplit(test_fold)

    # seeded randomized search so repeated runs pick the same candidates
    random_search = RandomizedSearchCV(
        model,
        param_distributions,
        scoring=f1_scorer,
        cv=ps,
        random_state=random_state,
    )
    random_search.fit(combined_features, combined_labels)

    return random_search.best_estimator_

In [None]:
# define rolling window training and prediction function
def rolling_window_predictions(df):
    """Train, validate and test one model per company and rolling window.

    For every distinct ``permco`` in ``df`` and every hard-coded rolling
    window, a logistic regression is tuned on the training/validation years
    via ``train_model_with_randomsearch`` and then used to predict each test
    year separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``permco``, ``start_date`` (datetime dtype, because the
        ``.dt`` accessor is used), ``price_direction`` and the
        ``embedding_*`` feature columns.

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``results_accuracy`` holds one record per company / window / test
        year with the test accuracy. ``results_prediction`` holds one record
        per predicted row (date, class probabilities, prediction, actual).

    Notes
    -----
    Windows with no training or validation rows, windows whose training
    labels contain fewer than two classes, and test years with no rows are
    skipped.
    """
    # each entry: (train_start, train_end, eval_start, eval_end, test_years);
    # all values are calendar years and the ranges are inclusive
    rolling_windows = [
        (2005, 2013, 2014, 2015, [2016, 2017]),
        (2007, 2015, 2016, 2017, [2018, 2019]),
        (2009, 2017, 2018, 2019, [2020, 2021]),
        (2011, 2019, 2020, 2021, [2022, 2023])
    ]

    # prepare results storage
    results_accuracy = []
    results_prediction = []

    # iterate through each company (permco)
    for company in df['permco'].unique():
        print(f"Processing company: {company}")
        company_df = df[df['permco'] == company]

        # the calendar year of each row is needed by every mask below, so
        # compute it once per company instead of once per comparison
        years = company_df['start_date'].dt.year

        # iterate through pre-set rolling window
        for train_start, train_end, eval_start, eval_end, test_years in rolling_windows:

            # select training and validation rows (inclusive year ranges)
            train_data = company_df[(years >= train_start) & (years <= train_end)]
            eval_data = company_df[(years >= eval_start) & (years <= eval_end)]

            # skip the iteration if there's no data for training or evaluation
            if train_data.empty or eval_data.empty:
                continue

            # prepare training and evaluation data for logistic regression
            train_features, train_labels = prepare_data_for_logistic_regression(train_data)
            eval_features, eval_labels = prepare_data_for_logistic_regression(eval_data)

            # a classifier cannot be fitted on a single class; skip such a
            # window instead of letting the fit error abort the whole run
            if train_labels.nunique() < 2:
                print(f"Company: {company}, Train: {train_start}-{train_end} "
                      f"skipped: training labels contain fewer than two classes")
                continue

            # train the logistic regression model with randomized search
            best_model = train_model_with_randomsearch(
                train_features.to_numpy(),
                train_labels.to_numpy(),
                eval_features.to_numpy(),
                eval_labels.to_numpy(),
            )

            # iterate over each test year defined in the rolling window
            for test_year in test_years:
                test_data = company_df[years == test_year]

                # skip the iteration if there's no data for testing
                if test_data.empty:
                    continue

                # prepare test data for logistic regression
                test_features, test_labels = prepare_data_for_logistic_regression(test_data)
                test_features = test_features.to_numpy()
                test_labels = test_labels.to_numpy()

                # make predictions
                predictions = best_model.predict(test_features)
                prob = best_model.predict_proba(test_features)

                # evaluate the model
                test_accuracy = accuracy_score(test_labels, predictions)

                # store accuracy results
                results_accuracy.append({
                    'company': company,
                    'train_period': f"{train_start}-{train_end}",
                    'eval_period': f"{eval_start}-{eval_end}",
                    'test_period': f"{test_year}",
                    'test_accuracy': test_accuracy,  # Overall accuracy for the test period
                })

                # store one record per predicted row; zipping the columns
                # avoids building a full row Series for every prediction.
                # NOTE(review): probability columns follow best_model.classes_
                # order; column 0 is assumed to be the negative class and
                # column 1 the positive class -- confirm against the labels.
                results_prediction.extend(
                    {
                        'company': company,
                        'week_date': week_date,  # The specific date of the prediction
                        'probability_neg': float(row_prob[0]),
                        'probability_pos': float(row_prob[1]),
                        'prediction': prediction,  # Specific prediction
                        'actual': actual,  # Actual value
                    }
                    for week_date, row_prob, prediction, actual
                    in zip(test_data['start_date'], prob, predictions, test_labels)
                )

                print(f"Company: {company}, Train: {train_start}-{train_end}, Eval: {eval_start}-{eval_end}, "
                      f"Test: {test_year}, Test Accuracy: {test_accuracy:.4f}")

    return results_accuracy, results_prediction

# **TRAINING AND PREDICTION**

## **BERT**

In [None]:
# locations of the pre-trained BERT embedding files (train and test splits)
file_path_train_bert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/bert_train_pretrained.csv'
file_path_test_bert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/bert_test_pretrained.csv'

# read both splits
train_df_bert = pd.read_csv(file_path_train_bert)
test_df_bert = pd.read_csv(file_path_test_bert)

# stack the two splits into one frame for the rolling window,
# ordered by permco and then by date
final_df_bert = pd.concat(
    [train_df_bert, test_df_bert], axis=0, ignore_index=True
).sort_values(by=['permco', 'start_date'])

# parse the date strings (YYYY-MM-DD) into datetimes
final_df_bert['start_date'] = pd.to_datetime(final_df_bert['start_date'], format='%Y-%m-%d')

In [None]:
# run the rolling window training and predictions on the BERT embeddings
rolling_accuracy_bert, rolling_prediction_bert = rolling_window_predictions(final_df_bert)

# turn both result lists into dataframes
rolling_accuracy_df_bert, rolling_prediction_df_bert = (
    pd.DataFrame(rolling_accuracy_bert),
    pd.DataFrame(rolling_prediction_bert),
)

In [None]:
# output locations for the BERT accuracy and prediction tables
path_bert_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_accuracy.csv'
path_bert_prediction = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_prediction.csv'

# write both tables to csv without the index column
rolling_accuracy_df_bert.to_csv(path_or_buf=path_bert_accuracy, index=False)
rolling_prediction_df_bert.to_csv(path_or_buf=path_bert_prediction, index=False)

## **RoBERTa**

In [None]:
# locations of the pre-trained RoBERTa embedding files (train and test splits)
file_path_train_roberta = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/roberta_train_pretrained.csv'
file_path_test_roberta = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/roberta_test_pretrained.csv'

# read both splits
train_df_roberta = pd.read_csv(file_path_train_roberta)
test_df_roberta = pd.read_csv(file_path_test_roberta)

# stack the two splits into one frame for the rolling window,
# ordered by permco and then by date
final_df_roberta = pd.concat(
    [train_df_roberta, test_df_roberta], axis=0, ignore_index=True
).sort_values(by=['permco', 'start_date'])

# parse the date strings (YYYY-MM-DD) into datetimes
final_df_roberta['start_date'] = pd.to_datetime(final_df_roberta['start_date'], format='%Y-%m-%d')

In [None]:
# run the rolling window training and predictions on the RoBERTa embeddings
rolling_accuracy_roberta, rolling_prediction_roberta = rolling_window_predictions(final_df_roberta)

# turn both result lists into dataframes
rolling_accuracy_df_roberta, rolling_prediction_df_roberta = (
    pd.DataFrame(rolling_accuracy_roberta),
    pd.DataFrame(rolling_prediction_roberta),
)


In [None]:
# output locations for the RoBERTa accuracy and prediction tables
path_roberta_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_accuracy.csv'
path_roberta_prediction = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_prediction.csv'

# write both tables to csv without the index column
rolling_accuracy_df_roberta.to_csv(path_or_buf=path_roberta_accuracy, index=False)
rolling_prediction_df_roberta.to_csv(path_or_buf=path_roberta_prediction, index=False)

## **DistilBERT**

In [None]:
# locations of the pre-trained DistilBERT embedding files (train and test splits)
file_path_train_distilbert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilbert_train_pretrained.csv'
file_path_test_distilbert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilbert_test_pretrained.csv'

# read both splits
train_df_distilbert = pd.read_csv(file_path_train_distilbert)
test_df_distilbert = pd.read_csv(file_path_test_distilbert)

# stack the two splits into one frame for the rolling window,
# ordered by permco and then by date
final_df_distilbert = pd.concat(
    [train_df_distilbert, test_df_distilbert], axis=0, ignore_index=True
).sort_values(by=['permco', 'start_date'])

# parse the date strings (YYYY-MM-DD) into datetimes
final_df_distilbert['start_date'] = pd.to_datetime(final_df_distilbert['start_date'], format='%Y-%m-%d')

In [None]:
# run the rolling window training and predictions on the DistilBERT embeddings
rolling_accuracy_distilbert, rolling_prediction_distilbert = rolling_window_predictions(final_df_distilbert)

# turn both result lists into dataframes
rolling_accuracy_df_distilbert, rolling_prediction_df_distilbert = (
    pd.DataFrame(rolling_accuracy_distilbert),
    pd.DataFrame(rolling_prediction_distilbert),
)


In [None]:
# output locations for the DistilBERT accuracy and prediction tables
path_distilbert_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_accuracy.csv'
path_distilbert_prediction = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_prediction.csv'

# write both tables to csv without the index column
rolling_accuracy_df_distilbert.to_csv(path_or_buf=path_distilbert_accuracy, index=False)
rolling_prediction_df_distilbert.to_csv(path_or_buf=path_distilbert_prediction, index=False)

## **DistilRoBERTa**

In [None]:
# locations of the pre-trained DistilRoBERTa embedding files (train and test splits)
file_path_train_distilroberta = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilroberta_train_pretrained.csv'
file_path_test_distilroberta = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilroberta_test_pretrained.csv'

# read both splits
train_df_distilroberta = pd.read_csv(file_path_train_distilroberta)
test_df_distilroberta = pd.read_csv(file_path_test_distilroberta)

# stack the two splits into one frame for the rolling window,
# ordered by permco and then by date
final_df_distilroberta = pd.concat(
    [train_df_distilroberta, test_df_distilroberta], axis=0, ignore_index=True
).sort_values(by=['permco', 'start_date'])

# parse the date strings (YYYY-MM-DD) into datetimes
final_df_distilroberta['start_date'] = pd.to_datetime(final_df_distilroberta['start_date'], format='%Y-%m-%d')

In [None]:
# run the rolling window training and predictions on the DistilRoBERTa embeddings
rolling_accuracy_distilroberta, rolling_prediction_distilroberta = rolling_window_predictions(final_df_distilroberta)

# turn both result lists into dataframes
rolling_accuracy_df_distilroberta, rolling_prediction_df_distilroberta = (
    pd.DataFrame(rolling_accuracy_distilroberta),
    pd.DataFrame(rolling_prediction_distilroberta),
)


In [None]:
# output locations for the DistilRoBERTa accuracy and prediction tables
path_distilroberta_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_accuracy.csv'
path_distilroberta_prediction = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_prediction.csv'

# write both tables to csv without the index column
rolling_accuracy_df_distilroberta.to_csv(path_or_buf=path_distilroberta_accuracy, index=False)
rolling_prediction_df_distilroberta.to_csv(path_or_buf=path_distilroberta_prediction, index=False)

## **FinBERT**

In [None]:
# locations of the pre-trained FinBERT embedding files (train and test splits)
file_path_train_finbert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/finbert_train_pretrained.csv'
file_path_test_finbert = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/finbert_test_pretrained.csv'

# read both splits
train_df_finbert = pd.read_csv(file_path_train_finbert)
test_df_finbert = pd.read_csv(file_path_test_finbert)

# stack the two splits into one frame for the rolling window,
# ordered by permco and then by date
final_df_finbert = pd.concat(
    [train_df_finbert, test_df_finbert], axis=0, ignore_index=True
).sort_values(by=['permco', 'start_date'])

# parse the date strings (YYYY-MM-DD) into datetimes
final_df_finbert['start_date'] = pd.to_datetime(final_df_finbert['start_date'], format='%Y-%m-%d')

In [None]:
# run the rolling window training and predictions on the FinBERT embeddings
rolling_accuracy_finbert, rolling_prediction_finbert = rolling_window_predictions(final_df_finbert)

# turn both result lists into dataframes
rolling_accuracy_df_finbert, rolling_prediction_df_finbert = (
    pd.DataFrame(rolling_accuracy_finbert),
    pd.DataFrame(rolling_prediction_finbert),
)


In [None]:
# output locations for the FinBERT accuracy and prediction tables
path_finbert_accuracy = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_accuracy.csv'
path_finbert_prediction = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_prediction.csv'

# write both tables to csv without the index column
rolling_accuracy_df_finbert.to_csv(path_or_buf=path_finbert_accuracy, index=False)
rolling_prediction_df_finbert.to_csv(path_or_buf=path_finbert_prediction, index=False)