In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments
import pandas as pd
import numpy as np
from datetime import date
from sklearn.preprocessing import StandardScaler

class RoBerta():
    """Load a ticker's cleaned price history and turn it into model-ready arrays.

    ``preprocess`` adds calendar features, builds a close-price target shifted
    ``self.days`` rows into the future, standardises the feature columns and
    returns NumPy arrays.

    Attributes populated by ``preprocess``:
        data: scaled training rows (features plus ``close_price_lagged``).
        data_orig_final: engineered frame before the split and before scaling.
        new_data_orig: unscaled rows that have no target yet (keeps ``Date``).
        X, y, new_data: the arrays returned by ``preprocess``.
        scaler: the ``StandardScaler`` fitted on the training rows.
    """

    def __init__(self, data, labels, tokenizer, max_length, ticker=None):
        """Store the inputs and work out where the price data comes from.

        Args:
            data: Either a ticker symbol (``str``), whose CSV is read lazily by
                ``get_data``, or an already loaded ``pandas.DataFrame`` (copied,
                so the caller's frame is never modified). Any other value
                leaves ``self.data`` as ``None``.
            labels: Stored unchanged; not used by the methods of this class.
            tokenizer: Stored unchanged; not used by the methods of this class.
            max_length: Stored unchanged; not used by the methods of this class.
            ticker: Optional ticker symbol. When omitted and ``data`` is a
                string, that string is used as the ticker.
        """
        if ticker is None and isinstance(data, str):
            ticker = data
        self.ticker = ticker

        self.data = data.copy() if isinstance(data, pd.DataFrame) else None
        # Pristine copy of the raw frame so preprocess() can be re-run safely.
        self.data_original = None if self.data is None else self.data.copy()

        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.days = 1  # number of rows the close price is shifted to build the target

    def get_data(self):
        """Read ``data/<ticker>_cleaned_data.csv`` into ``self.data``.

        A copy of the loaded frame is kept in ``self.data_original``.

        Raises:
            ValueError: if no ticker symbol is known.
            FileNotFoundError: if the CSV does not exist. The error line is
                still printed first, then the exception propagates instead of
                failing later on ``None``.
        """
        if not self.ticker:
            raise ValueError('No ticker set: pass a ticker symbol as `data` or `ticker`.')

        try:
            self.data = pd.read_csv(f'data/{self.ticker}_cleaned_data.csv')
        except FileNotFoundError:
            print(f'Error: File for {self.ticker} not found.')
            raise

        print(f'{self.ticker} data imported. Size: {self.data.shape}')
        print(f'{self.ticker} size: {self.data.shape}')
        self.data_original = self.data.copy()

    def preprocess(self):
        """Engineer features, split off rows without a target, and scale.

        Expects the raw frame to contain ``date``, ``day`` and ``close``
        columns. Loads the CSV first when no data is present yet. The method
        can be called repeatedly: once ``self.data`` holds the processed frame
        it restarts from ``self.data_original``.

        Returns:
            tuple: ``(X, y, new_data)`` where ``X`` has shape
            ``(rows, 1, features)``, ``y`` has shape ``(rows, 1)`` and
            ``new_data`` is the 2-D feature array of the rows whose target is
            still unknown.

        Raises:
            ValueError: if no row has a target after shifting.
        """
        if self.data is None:
            self.get_data()

        source = self.data
        pristine = getattr(self, 'data_original', None)
        if 'date' not in source.columns and pristine is not None:
            # Re-run: self.data already holds the processed training frame.
            source = pristine
        frame = source.copy()

        # Parse the date column and replace it with a proper datetime column.
        frame['Date'] = pd.to_datetime(frame['date'])
        frame = frame.drop(columns=['date'])

        # Cyclical encodings so the ends of each period sit next to each other.
        month = frame['Date'].dt.month
        day_of_month = frame['Date'].dt.day
        frame['month_sin'] = np.sin(2 * np.pi * month / 12)
        frame['month_cos'] = np.cos(2 * np.pi * month / 12)
        frame['day_of_month_sin'] = np.sin(2 * np.pi * day_of_month / 31)
        frame['day_of_month_cos'] = np.cos(2 * np.pi * day_of_month / 31)
        # 'day' is divided by 5 — presumably a 0-4 trading-day index; TODO confirm.
        frame['day_of_week_sin'] = np.sin(2 * np.pi * frame['day'] / 5)
        frame['day_of_week_cos'] = np.cos(2 * np.pi * frame['day'] / 5)
        frame = frame.drop(columns=['day'])

        frame['Year'] = frame['Date'].dt.year
        frame['Month'] = month
        frame['Day'] = day_of_month
        frame = pd.get_dummies(frame, columns=['Month'])  # one-hot encode the month

        # Target: the close price self.days rows ahead; the last rows get NaN.
        frame['close_price_lagged'] = frame['close'].shift(-self.days)

        # Put 'Date' first, which keeps the column order of data_orig_final stable.
        frame = frame.set_index('Date').reset_index()

        has_target = frame['close_price_lagged'].notna()
        self.data_orig_final = frame.copy()
        self.new_data_orig = (
            frame.loc[~has_target].drop(columns=['close_price_lagged']).copy()
        )

        train = frame.loc[has_target].drop(columns=['Date']).copy()
        new_rows = self.new_data_orig.drop(columns=['Date']).copy()
        if train.empty:
            raise ValueError('No rows with a target are left after shifting the close price.')

        # Scale every feature except the first column and the target, exactly
        # the span the training slice always covered.
        # NOTE(review): the first column is left unscaled — confirm it is an
        # index-like column rather than a real feature.
        feature_cols = list(train.columns[1:-1])
        as_float = {col: 'float64' for col in feature_cols}
        train = train.astype(as_float)
        new_rows = new_rows.astype(as_float)

        scaler = StandardScaler()
        train[feature_cols] = scaler.fit_transform(train[feature_cols])
        if not new_rows.empty:
            # Reuse the training statistics: refitting on the prediction rows
            # would zero out a single-row frame and scale it differently.
            new_rows[feature_cols] = scaler.transform(new_rows[feature_cols])
        self.scaler = scaler

        self.data = train
        features = train.drop(columns=['close_price_lagged']).values
        self.y = train['close_price_lagged'].values.reshape(-1, 1)
        self.new_data = new_rows.values

        # Add a length-1 middle axis: (rows, features) -> (rows, 1, features).
        self.X = features.reshape(features.shape[0], 1, features.shape[1])

        return self.X, self.y, self.new_data


In [4]:
import pandas as pd
# Raw MSFT price/indicator table. The path is relative to the notebook's
# working directory (the repository root), the same convention
# RoBerta.get_data uses for `data/<ticker>_cleaned_data.csv`, so the notebook
# is not tied to one machine's home directory.
msft = pd.read_csv('data/MSFT_stock.csv')
msft.head()

Unnamed: 0,date,open,high,low,close,volume,tic,day,macd,macds,...,rsi_14,rsi,close_50_sma,ma50,close_200_sma,ma200,vix,TLT,IEF,SHY
0,2018-05-01,93.21,95.29,92.79,89.659,31408900,MSFT,1,0.0,0.0,...,,,89.659,89.659,89.659,89.659,15.49,118.41,101.35,83.13
1,2018-05-02,94.99,95.17,93.19,88.252,27471000,MSFT,2,-0.032,-0.018,...,0.0,0.0,88.955,88.955,88.955,88.955,15.97,118.28,101.37,83.16
2,2018-05-03,92.96,94.93,92.45,88.781,31142500,MSFT,3,-0.024,-0.02,...,28.812,28.812,88.897,88.897,88.897,88.897,15.9,118.81,101.63,83.22
3,2018-05-04,93.32,95.37,92.92,89.81,22531300,MSFT,4,0.017,-0.007,...,55.618,55.618,89.125,89.125,89.125,89.125,14.77,118.99,101.64,83.19
4,2018-05-07,95.17,96.71,95.1,90.81,24242000,MSFT,0,0.079,0.018,...,68.17,68.17,89.462,89.462,89.462,89.462,14.75,118.8,101.59,83.19


In [10]:
# Prediction target: the following row's close price. Shifting 'close' up by
# one row leaves the final row without a value, so that row is dropped. The
# result is a standalone Series; no column is added to `msft`.
target = msft['close'].shift(periods=-1).iloc[:-1]
target

0        88.252
1        88.781
2        89.810
3        90.810
4        90.423
         ...   
1255    304.830
1256    307.260
1257    305.560
1258    305.410
1259    304.400
Name: close, Length: 1260, dtype: float64

In [None]:

# Prepare your data
# NOTE(review): `text_columns` is still empty, so the zip() below produces no
# examples until real text data is supplied.
text_columns = []  # Text inputs, paired element-wise with the rows of `numeric_columns`
numeric_columns = msft.values  # 2-D array holding one entry per row of `msft`
# NOTE(review): `target` holds continuous close prices while the model below is
# a 3-class classifier — confirm how prices are meant to map to class ids.
labels = target  # Next-row close prices built in the cell above

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 128  # Set your desired maximum sequence length

data = list(zip(text_columns, numeric_columns))
# NOTE(review): `CustomDataset` is not defined in the visible cells; it has to
# exist before this cell can run.
dataset = CustomDataset(data, labels, tokenizer, max_length)
# Trainer builds its own DataLoader from `train_dataset`; this one is only
# useful for a hand-written training loop.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Create the model
num_classes = 3  # Set the number of classes for classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_classes)

# Define training arguments and create Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,  # without a dataset trainer.train() cannot run
)

# Start training
trainer.train()


In [1]:
from roberta import RoBerta
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments

# Tokenizer settings (defined here for later cells; nothing below consumes them).
max_length = 128  # Set your desired maximum sequence length
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Sequence-classification head on top of the pretrained roberta-base encoder.
num_classes = 3  # Set the number of classes for classification
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=num_classes,
)

# Training hyper-parameters, collected in one mapping and expanded into
# TrainingArguments.
training_options = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_options)

# Build the MSFT pipeline from the imported `roberta` module and run its
# preprocessing step, which yields the feature, target and prediction arrays.
roberta = RoBerta('MSFT', model, training_args)
X, y, new_data = roberta.preprocess()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly i

MSFT data imported. Size: (1259, 34)
MSFT size: (1259, 34)


In [5]:
import pandas as pd
# Preview the cleaned MSFT table. The path is relative to the notebook's
# working directory, the same location RoBerta.get_data reads from, instead of
# an absolute path on one machine.
pd.read_csv('data/MSFT_cleaned_data.csv')