# Versuch die Sentiment-Analyse in das Stock Modell einfließen zu lassen

## 1. Vorverarbeitung der Sentimentdaten

In [23]:
import pandas as pd
import json

with open('tweets_with_classes.json', 'r') as file:
    sentiment_data = json.load(file)

sentiment_df = pd.DataFrame(sentiment_data)
sentiment_df = sentiment_df.drop(columns=["title"])
sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])
sentiment_df['class'] = sentiment_df['class'].map({'Negative': -1, 'Neutral': 0, 'Positive': 1})


In [24]:
sentiment_df

Unnamed: 0,date,description,class
0,2023-06-13,@Damadeferroofic Will investigate,0
1,2023-06-13,@WallStreetSilv This doesn’t make sense,-1
2,2023-06-13,@ScienceNews “Studies show …”,0
3,2023-06-12,@JonErlichman Interesting,1
4,2023-06-12,@DirtyTesLa Noted,0
...,...,...,...
989,2023-05-03,Cult / Culture,0
990,2023-05-03,@GaryMarcus @geoffreyhinton I’ve been saying t...,0
991,2023-05-03,"@Timcast Go woke, go …\r\n\r\nIt’s been a whil...",-1
992,2023-05-03,@waitbutwhy Probably won’t even need to pay fo...,0


## 2. Kombinieren der Stock-Daten und Sentiments

In [25]:
duplicates = sentiment_df[sentiment_df.duplicated(subset=['date'], keep=False)]
print(duplicates)

          date                                        description  class
0   2023-06-13                  @Damadeferroofic Will investigate      0
1   2023-06-13            @WallStreetSilv This doesn’t make sense     -1
2   2023-06-13                      @ScienceNews “Studies show …”      0
3   2023-06-12                          @JonErlichman Interesting      1
4   2023-06-12                                  @DirtyTesLa Noted      0
..         ...                                                ...    ...
988 2023-05-03  @AlexBerenson This is absurd. Shame on the CEO...     -1
989 2023-05-03                                     Cult / Culture      0
990 2023-05-03  @GaryMarcus @geoffreyhinton I’ve been saying t...      0
991 2023-05-03  @Timcast Go woke, go …\r\n\r\nIt’s been a whil...     -1
992 2023-05-03  @waitbutwhy Probably won’t even need to pay fo...      0

[993 rows x 3 columns]


In [26]:
with open('stock_data.json', 'r') as file:
    stock_data = json.load(file)
stock_data = pd.DataFrame(stock_data)
stock_data['Datum'] = pd.to_datetime(stock_data['Datum'])

In [27]:
duplicates = stock_data[stock_data.duplicated(subset=['Datum'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [Datum, open, high, low, close, volume]
Index: []


Mehrere Tweets pro Tag vs. ein Stock Value pro Tag -> Wie geht man damit um?  
Versuch 1: Mittelwertberechung

In [30]:
sentiment_df = sentiment_df.groupby('date').agg({'class': 'mean'}).reset_index()

In [33]:
combined_data = pd.merge(stock_data, sentiment_df, left_on='Datum', right_on='date', how='left', validate='one_to_one')

# Fehlende Sentiments mit 0 auffüllen
combined_data.fillna({"class": 0}, inplace=True)

final_data = combined_data[['close', 'open', 'high', 'low', 'volume', 'class']]


## 3. LSTM-Modell Implementierung

Modelldefinition

In [35]:
import torch.nn as nn

class StockSentimentLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(StockSentimentLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Nur die Ausgabe des letzten Zeitschritts
        return out

# Modell initialisieren
input_size = 6  # 5 Stock-Daten + 1 Sentiment
hidden_size = 64
num_layers = 2
output_size = 1

model = StockSentimentLSTM(input_size, hidden_size, num_layers, output_size)


In [34]:
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

# Normalisiere die Daten
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(final_data)

# Erstelle Eingabe- und Zielsequenzen
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length):
        seq = data[i:i+seq_length]
        label = data[i+seq_length, 0]  # Vorhersage für 'close'
        sequences.append((seq, label))
    return sequences

seq_length = 10
sequences = create_sequences(data_scaled, seq_length)

# Konvertiere in Tensoren
X, y = zip(*sequences)
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)


  X = torch.tensor(X, dtype=torch.float32)


Training

In [36]:
import torch.optim as optim

# Loss und Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 50
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs.squeeze(), y)
    loss.backward()
    optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')


Epoch 1/50, Loss: 0.1904
Epoch 2/50, Loss: 0.1719
Epoch 3/50, Loss: 0.1543
Epoch 4/50, Loss: 0.1374
Epoch 5/50, Loss: 0.1213
Epoch 6/50, Loss: 0.1059
Epoch 7/50, Loss: 0.0914
Epoch 8/50, Loss: 0.0781
Epoch 9/50, Loss: 0.0664
Epoch 10/50, Loss: 0.0572
Epoch 11/50, Loss: 0.0514
Epoch 12/50, Loss: 0.0501
Epoch 13/50, Loss: 0.0524
Epoch 14/50, Loss: 0.0548
Epoch 15/50, Loss: 0.0538
Epoch 16/50, Loss: 0.0491
Epoch 17/50, Loss: 0.0421
Epoch 18/50, Loss: 0.0349
Epoch 19/50, Loss: 0.0286
Epoch 20/50, Loss: 0.0237
Epoch 21/50, Loss: 0.0200
Epoch 22/50, Loss: 0.0171
Epoch 23/50, Loss: 0.0144
Epoch 24/50, Loss: 0.0115
Epoch 25/50, Loss: 0.0084
Epoch 26/50, Loss: 0.0053
Epoch 27/50, Loss: 0.0029
Epoch 28/50, Loss: 0.0019
Epoch 29/50, Loss: 0.0028
Epoch 30/50, Loss: 0.0043
Epoch 31/50, Loss: 0.0046
Epoch 32/50, Loss: 0.0037
Epoch 33/50, Loss: 0.0031
Epoch 34/50, Loss: 0.0034
Epoch 35/50, Loss: 0.0043
Epoch 36/50, Loss: 0.0049
Epoch 37/50, Loss: 0.0049
Epoch 38/50, Loss: 0.0042
Epoch 39/50, Loss: 0.

In [37]:
model.eval()
with torch.no_grad():
    predictions = model(X).squeeze().numpy()

# Rücktransformation der Daten
predictions = scaler.inverse_transform(np.column_stack((predictions, np.zeros((len(predictions), 5)))))


In [38]:
predictions

array([[ 3.06671554e+02,  9.48800000e+00,  1.03313000e+01,
         9.40330000e+00,  1.06541500e+07, -1.00000000e+00],
       [ 2.98266963e+02,  9.48800000e+00,  1.03313000e+01,
         9.40330000e+00,  1.06541500e+07, -1.00000000e+00],
       [ 2.88528567e+02,  9.48800000e+00,  1.03313000e+01,
         9.40330000e+00,  1.06541500e+07, -1.00000000e+00],
       ...,
       [ 2.52153711e+01,  9.48800000e+00,  1.03313000e+01,
         9.40330000e+00,  1.06541500e+07, -1.00000000e+00],
       [ 2.55276356e+01,  9.48800000e+00,  1.03313000e+01,
         9.40330000e+00,  1.06541500e+07, -1.00000000e+00],
       [ 2.56467708e+01,  9.48800000e+00,  1.03313000e+01,
         9.40330000e+00,  1.06541500e+07, -1.00000000e+00]])