In [None]:
import datetime as dt
import pickle
import re

import numpy as np
import pandas as pd
import torch
import transformers
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer

from helpFunctions import *

# Load the FinBERT tokenizer and encoder once, when this cell runs; encodeFinBERT()
# reads both names as module-level globals.
# The encoder is loaded as transformers.BertModel rather than through
# AutoModelForSequenceClassification (imported above but not used in the visible cells).
# NOTE(review): encodeFinBERT() is not called anywhere in the visible cells — confirm
# these two objects are still needed before paying the load cost.
finbert_tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
finbert_model = transformers.BertModel.from_pretrained('ProsusAI/finbert')

In [None]:
def parseBTCtime(d):
    """Move a timestamp to the start of the following full hour.

    Used to bucket sub-hourly timestamps onto an hourly grid so they can be
    aggregated per hour.

    Parameters
    ----------
    d : datetime.datetime or pandas.Timestamp
        Timestamp to bucket.

    Returns
    -------
    Same type as ``d``
        ``d`` with minutes, seconds and microseconds cleared, plus one hour.
        A timestamp that is already on the hour is still moved to the next
        hour (e.g. 10:00:00 -> 11:00:00).
    """
    # Truncate first, then add one hour: timedelta arithmetic takes care of the
    # day/month/year rollover at 23:xx, so both cases share one code path and
    # the sub-minute fields are always cleared.
    return d.replace(minute=0, second=0, microsecond=0) + dt.timedelta(hours=1)

def encodeFinBERT(titles):
    """Encode a text with the module-level FinBERT tokenizer and model.

    Parameters
    ----------
    titles : str
        Text to encode; it is truncated/padded to exactly 512 tokens.

    Returns
    -------
    torch.Tensor
        The first element of the model output for the first (only) batch
        item, detached from autograd.
        NOTE(review): despite the local name ``sentence_embedding`` this is
        presumably the per-token hidden-state matrix (512 x hidden size), not
        a pooled sentence vector — confirm against the consumers.
    """
    input_ids = finbert_tokenizer.encode(
        titles,
        return_tensors='pt',
        truncation=True,
        max_length=512,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
    )
    # The input is padded to 512 tokens, so the model must be told which
    # positions are padding; without a mask the pad tokens are attended to
    # and leak into the hidden states of the real tokens.
    attention_mask = (input_ids != finbert_tokenizer.pad_token_id).long()

    finbert_model.eval()
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        encoded_text = finbert_model(input_ids, attention_mask=attention_mask)[0]
    sentence_embedding = encoded_text[0]
    return sentence_embedding


def configIndex(df, ticker, type):
    """Return a copy of ``df`` indexed by a parsed ``Datetime`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. For ``type == "headlines"`` it must provide ``Date``
        (and ``Time`` for the general / markets feeds); for every other
        ``type`` it must already contain a ``Datetime`` column.
    ticker : str or None
        ``None`` or ``"marketsNews"`` selects the general / markets layout
        (separate ``Date`` and ``Time`` columns); any other value selects the
        per-ticker layout where ``Date`` already holds the full timestamp.
    type : str
        Kind of data held in ``df``; only ``"headlines"`` triggers the
        construction of the ``Datetime`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Datetime`` converted via ``pd.to_datetime`` and set
        as the index. The caller's frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain/convert columns.
    df = df.copy()
    if type == "headlines":
        if ticker is None or ticker == "marketsNews":
            df['Datetime'] = df['Date'] + " " + df['Time']
            # Drop the leftover CSV index column; the result has to be
            # re-assigned (drop is not in-place) and the column may be absent.
            df = df.drop(columns="Unnamed: 0", errors="ignore")
        else:
            df['Datetime'] = df['Date']
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    return df.set_index('Datetime')

def transformHeadlines1(df, s_d):
    """Shift headline timestamps back six hours and bucket them per hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Headlines indexed by ``Datetime``.
    s_d : object
        Not read by this function; kept so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by the shifted timestamps after each one has been
        passed through ``parseBTCtime``.
    """
    shifted = df.reset_index()
    # NOTE(review): the 6-hour offset is presumably a timezone correction — confirm.
    shifted['Datetime'] = (shifted['Datetime'] - dt.timedelta(hours=6)).apply(parseBTCtime)
    return shifted.set_index('Datetime')

import re


def remove_symbols(text):
    """Lower-case ``text`` and strip every character outside the allowed set.

    Characters that are kept: word characters, whitespace and the
    punctuation marks ``. ? ! ' , ; : -``. Everything else is deleted
    (not replaced by a space).
    """
    disallowed = re.compile(r'[^\w\s\.\?\!\',;:\-]')
    return disallowed.sub('', text.lower())



In [None]:
# Run configuration for the embedding export below.

# Model identifier forwarded to getLatestData() when the input files are loaded.
# NOTE(review): the casing differs from the 'ProsusAI/finbert' id used to load the
# tokenizer/model in the first cell — confirm which spelling the data files use.
sentiment_model = 'ProsusAI/FinBERT'
# Lower bound (exclusive) of the stock_data window.
start = '2021-06-01'
# valid_start, fixed_valid, test_start and re_train are not read anywhere in the
# visible cells; presumably they mirror the split settings of the training
# notebook — TODO confirm or remove.
valid_start = '2022-02-01' 
fixed_valid = True                                   
test_start = '2022-05-01'   
# Upper bound (exclusive) of the stock_data window.
end = '2023-04-30' 
re_train = False
# Ticker passed to getLatestData() and used as the prefix of the output pickle names.
ticker = "BTC-USD"
# Note: for a cryptocurrency ticker a different timestamp parsing has to be
# configured than for stocks (different exchange opening times).

In [None]:
# Load the three headline feeds (general, markets, ticker-specific) and the
# price data; configIndex puts each frame on a 'Datetime' index.
h_general, h_market, h_stock = (
    configIndex(getLatestData("sentiments", feed, sentiment_model), feed, "sentiments")
    for feed in (None, "marketsNews", ticker)
)
stock_data = configIndex(getLatestData("stock_data", ticker, sentiment_model), ticker, "stock_data")

# Restrict the price data to the training & prediction window (both bounds exclusive).
stock_data = stock_data.loc[(stock_data.index > start) & (stock_data.index < end)]

# Shift and hour-bucket the timestamps of every headline feed.
h_gen, h_mar, h_stock = (
    transformHeadlines1(feed_frame, stock_data)
    for feed_frame in (h_general, h_market, h_stock)
)

In [None]:

# Restrict stock_data to the (start, end) window, both bounds exclusive.
# NOTE(review): the data-loading cell already applies this exact filter, so this
# line only matters when this cell is run without re-running that one.
stock_data = stock_data[(stock_data.index > start) & (stock_data.index < end)]

def mean_of_lists(lists):
    """Element-wise mean of several equal-length numeric sequences.

    The sequences are stacked as the rows of a 2-D array and averaged down
    the rows, so the result has one entry per position.

    Returns a plain ``list`` whose items are the NumPy scalars of the mean.
    """
    stacked = np.vstack(lists)
    return list(stacked.mean(axis=0))


# Sentence encoder; sentences are encoded by calling model.encode().
model = SentenceTransformer('all-MiniLM-L6-v2')
# Width of one embedding vector; used to build the all-zero placeholder that
# stands in for "no headline in this hour".
length = 384

# Encoding variants to generate for every news source:
#   1  = last headline of the hour
#   10 = last 10 headlines of the hour
#   99 = element-wise mean of all headlines of the hour
cols = [1, 10, 99]

# News types to export; the order matches the frames selected in the loop below.
news = ['GeneralNews', 'MarketNews', 'StockNews']

for run in range(len(news)):
    print("Run")
    print(run)
    source = (h_gen, h_mar, h_stock)[run]
    headlines = [remove_symbols(str(headline)) for headline in source["Headline"].tolist()]
    timestamps = source.index.tolist()

    # Encode the whole source in one batched call instead of one call per headline.
    embeddings = model.encode(headlines).tolist()

    df = pd.DataFrame(index=timestamps)
    df['Headline'] = embeddings
    # Headline timestamps are localized to UTC before being compared with the
    # stock_data index below.
    df.index = df.index.tz_localize('UTC')

    for col in cols:
        print("Column")
        print(col)
        # The mean variant produces a single vector per hour, so it needs one
        # output column; the "last N" variants need N.
        n_slots = 1 if col == 99 else col
        df_new = pd.DataFrame(columns=range(n_slots), index=stock_data.index.unique())
        for timestamp in df_new.index:
            subset = df[df.index == timestamp]
            if col == 99:
                # Write explicitly to column 0 — the only column exported for
                # this variant. (.at stores the list as one cell value.)
                if len(subset) > 0:
                    df_new.at[timestamp, 0] = mean_of_lists(subset['Headline'])
                else:
                    df_new.at[timestamp, 0] = [0] * length
            else:
                # NOTE(review): every row in `subset` carries the same index
                # value, so sort_index cannot rank them by recency; which N
                # headlines survive the slice depends on row order — confirm
                # whether the original publication time should be kept.
                subset = subset.sort_index(ascending=False)[:col]
                for i, (_, row) in enumerate(subset.iterrows()):
                    df_new.at[timestamp, i] = row['Headline']

                # Fill the remaining slots with zero vectors.
                for i in range(len(subset), col):
                    df_new.at[timestamp, i] = [0] * length

        prefix = ticker + "_"

        # Left join keeps exactly the rows of stock_data; afterwards only the
        # embedding columns are kept for export.
        merged_data = pd.merge(stock_data, df_new, how='left', left_index=True, right_index=True)
        merged_data = merged_data.loc[:, list(range(n_slots))]

        # Write the sentence embeddings as a pickle file to the drive.
        suffix = '_Mean' if col == 99 else '_Last' + str(col)
        with open(prefix + str(run) + '_' + news[run] + suffix + '_encoded.pkl', 'wb') as f:
            pickle.dump(merged_data, f)
