In [None]:
import torch
th = torch

In [5]:
import pandas as pd
FILEPATH = '/Users/Shared/tweets/elon_tweet.csv'
df1 = pd.read_csv(FILEPATH)
print(df1.columns)

Index(['Date Created', 'Tweet'], dtype='object')


In [26]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedder = BertModel.from_pretrained('bert-base-uncased')
text = 'TWEETS'
encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class StockDataset(Dataset):
    def __init__(self, dataframes, input_window, output_window):
        """
        Args:
        dataframes (list of pd.DataFrame): List of dataframes, each containing columns:
                                          'time', 'stock price', 'stock index price', 'text message'
        input_window (int): Number of timesteps in each input sequence.
        output_window (int): Number of timesteps in each target sequence.
        """
        self.input_window = input_window
        self.output_window = output_window
        self.sequences = []
        self.text_encoder = LabelEncoder()

        # Preprocess and encode text data from all dataframes
        # all_texts = pd.concat([df['text message'] for df in dataframes]).values()
        all_texts = [_df['text message'].values.tolist() for _df in dataframes]
        list_list_embs = self.embed_text(all_texts)

        
        for _i_df, df in enumerate(dataframes):
            stock_prices = th.tensor(df['stock price'].values).reshape(-1, 1)
            index_prices = th.tensor(df['stock index price'].values).reshape(-1, 1)
            embs = list_list_embs[_i_df]
            
            num_sequences = len(df) - self.input_window - self.output_window + 1

            for i in range(num_sequences):
                input_start = i
                input_end = i + self.input_window
                target_start = input_end
                target_end = target_start + self.output_window
    
                a = stock_prices[input_start:input_end]
                b = index_prices[input_start:input_end]
                c = embs[input_start:input_end]
                
                input_sequence = th.cat((a, b, c), dim=1)
                target_sequence = stock_prices[target_start:target_end]
                self.sequences.append((input_sequence, target_sequence))
                
    def embed_text(self, list_list_texts):
        unique_embs = dict()
        print(list_list_texts)
        
        embs = []
        for _i, _txts in enumerate(list_list_texts):
            _embs = []
            for _txt in _txts:
                if _txt in unique_embs: 
                    _emb = unique_embs.get(_txt)
                else:
                    _tkn = tokenizer(_txt, return_tensors='pt') 
                    _emb = embedder(**_tkn).last_hidden_state[:,0,:]
                    unique_embs[_txt] = _emb
                _embs.append(_emb)
            embs.append(_embs)

        list_list_embs = [th.cat(_embs, dim=0) for _embs in embs]
        return list_list_embs
        
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        input_sequence, target_sequence = self.sequences[idx]
        input_sequence = torch.tensor(input_sequence, dtype=torch.float32)
        target_sequence = torch.tensor(target_sequence, dtype=torch.float32)
        return input_sequence, target_sequence

# Example fake data generation
np.random.seed(42)
data_length = 10
df1 = pd.DataFrame({
    'time': pd.date_range(start='1/1/2020', periods=data_length, freq='D'),
    'stock price': np.random.rand(data_length) * 100,
    'stock index price': np.random.rand(data_length) * 1000,
    'text message': np.random.choice(['news', '', 'alert', 'none'], data_length)
})

df2 = pd.DataFrame({
    'time': pd.date_range(start='1/1/2020', periods=data_length, freq='D'),
    'stock price': np.random.rand(data_length) * 100,
    'stock index price': np.random.rand(data_length) * 1000,
    'text message': np.random.choice(['announcement', 'report', 'alert', 'none'], data_length)
})

dataframes = [df1, df2]
input_window = 5  # e.g., 5 days input
output_window = 2  # e.g., 2 days to predict
dataset = StockDataset(dataframes, input_window, output_window)

# ### Iterate over the DataLoader
# for inputs, targets in dataloader:
#     print("Inputs Shape:", inputs.shape)  # Shape should be [batch_size, input_window, features]
#     print("Targets Shape:", targets.shape)  # Shape should be [batch_size, output_window]
#     break  # Only print the first batch to check
,

[['alert', 'alert', '', 'none', 'none', 'none', 'none', 'alert', '', ''], ['none', 'none', 'report', 'report', 'report', 'report', 'report', 'none', 'report', 'announcement']]


''

In [30]:
print(dataset[0])

(tensor([[ 3.7454e+01,  2.0584e+01, -2.6599e-01,  ..., -1.6488e-01,
          2.8372e-02,  3.1597e-01],
        [ 9.5071e+01,  9.6991e+02, -2.6599e-01,  ..., -1.6488e-01,
          2.8372e-02,  3.1597e-01],
        [ 7.3199e+01,  8.3244e+02, -1.0534e+00,  ..., -3.7310e-01,
         -3.8231e-01,  3.3689e-01],
        [ 5.9866e+01,  2.1234e+02, -1.9510e-01,  ..., -4.3778e-02,
          6.9668e-02,  1.9106e-01],
        [ 1.5602e+01,  1.8182e+02, -1.9510e-01,  ..., -4.3778e-02,
          6.9668e-02,  1.9106e-01]]), tensor([[15.5995],
        [ 5.8084]]))


  input_sequence = torch.tensor(input_sequence, dtype=torch.float32)
  target_sequence = torch.tensor(target_sequence, dtype=torch.float32)


In [66]:

### DataLoader Setup
batch_size = 10
shuffle = True
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [22]:
df1['text message'].values.tolist()

['none',
 'none',
 'alert',
 'news',
 'none',
 'none',
 '',
 'none',
 'none',
 '',
 'alert',
 'none',
 '',
 'news',
 'alert',
 'news',
 'news',
 '',
 'none',
 '',
 '',
 '',
 '',
 'alert',
 'none',
 'news',
 'news',
 'none',
 'news',
 'none',
 'news',
 '',
 'news',
 'none',
 'none',
 'none',
 'none',
 'none',
 'alert',
 'alert',
 'news',
 'none',
 'news',
 'none',
 'none',
 'alert',
 'alert',
 '',
 'alert',
 '',
 '',
 '',
 '',
 '',
 'news',
 'alert',
 '',
 'none',
 'alert',
 'alert',
 '',
 'news',
 '',
 'news',
 'none',
 'alert',
 'none',
 'news',
 'news',
 'none',
 'news',
 'none',
 'alert',
 'none',
 'alert',
 'none',
 'news',
 'none',
 'news',
 'none',
 'none',
 '',
 'news',
 '',
 'news',
 '',
 'alert',
 'none',
 'news',
 'news',
 'none',
 'news',
 'news',
 '',
 'news',
 'alert',
 'alert',
 'none',
 'news',
 'none']

In [12]:
all_texts = pd.concat([df1['text message'] for df in dataframes])
non_empty_texts = all_texts[all_texts.str.strip().astype(bool)]
non_empty_texts

0      none
1      none
2     alert
3      news
4      none
      ...  
95    alert
96    alert
97     none
98     news
99     none
Name: text message, Length: 156, dtype: object

In [3]:
dataset[0]

(tensor([[ 37.4540,  31.4292,   3.0000],
         [ 95.0714, 636.4104,   3.0000],
         [ 73.1994, 314.3560,   0.0000],
         [ 59.8658, 508.5707,   2.0000],
         [ 15.6019, 907.5665,   3.0000]]),
 tensor([15.5995,  5.8084]))