In [1]:
from collections import Counter

import h5py
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

KeyboardInterrupt: 

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing each text entry with its sentiment label.

    With a tokenizer, every item is a dict with ``input_ids``,
    ``attention_mask`` and ``label`` tensors. Without one, every item is a
    dict holding the untouched ``text`` entry and its ``label`` tensor.

    Args:
        texts: Indexable collection of samples.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Optional callable invoked as ``tokenizer(text,
            truncation=True, padding='max_length', max_length=...,
            return_tensors="pt")`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict (see the class docstring)."""
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # Compare against None explicitly: an object that defines
        # __len__/__bool__ can be falsy even though a tokenizer was supplied.
        if self.tokenizer is None:
            return {'text': text, 'label': label}

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the leading batch axis; a bare squeeze()
            # would also collapse the sequence axis when max_length == 1.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': label,
        }

In [None]:
# Define BiLSTM Model
class BiLSTM(nn.Module):
    """Embedding -> (optionally bidirectional) LSTM -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of values produced per sample by the final layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings and to the
            final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        # A bidirectional LSTM contributes one final state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Classify a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices, batch dimension first.

        Returns:
            Tensor with ``output_dim`` values per sample.
        """
        embedded = self.dropout(self.embedding(text))
        _, (hidden, _) = self.lstm(embedded)
        if self.lstm.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Unidirectional: only the top layer's state matches fc's input
            # width; concatenating two layers would break the linear layer.
            final_state = hidden[-1]
        return self.fc(self.dropout(final_state))

In [None]:
# Define BERT + CNN Model
class BERTCNN(nn.Module):
    """BERT encoder followed by parallel convolutions and a linear classifier.

    Each convolution spans ``k`` consecutive token vectors across the full
    hidden width; its feature maps are max-pooled over the sequence and the
    pooled features of all kernel sizes are concatenated before the classifier.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced per sample.
        kernel_sizes: Iterable of convolution heights (in tokens).
        num_filters: Number of output channels per convolution.
    """

    # The default is an immutable tuple so no mutable object is shared between calls.
    def __init__(self, bert_model_name='bert-base-uncased', num_classes=2, kernel_sizes=(2, 3, 4), num_filters=100):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Read the width from the loaded encoder instead of hard-coding 768,
        # so checkpoints with a different hidden size also work.
        hidden_size = self.bert.config.hidden_size
        kernel_sizes = list(kernel_sizes)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, hidden_size)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            Tensor of ``num_classes`` logits per sample.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Add a channel axis for Conv2d: [batch_size, 1, seq_len, hidden_size]
        x = outputs.last_hidden_state.unsqueeze(1)
        # Each kernel covers the whole hidden width, so the last axis becomes 1
        # and is dropped: [batch_size, num_filters, seq_len - k + 1]
        feature_maps = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over the remaining sequence axis: [batch_size, num_filters]
        pooled = [torch.max_pool1d(fm, fm.size(2)).squeeze(2) for fm in feature_maps]
        features = self.dropout(torch.cat(pooled, 1))
        logits = self.fc(features)
        return logits

In [None]:
# Read the labelled corpus and pull out the text and label columns as arrays
df = pd.read_csv('sentiment_data.csv')
texts, labels = df['text'].values, df['label'].values

In [None]:
# Build the BERT tokenizer, wrap the corpus in a tokenizing dataset,
# and serve it in shuffled batches of 16 for BERTCNN
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_dataset = SentimentDataset(texts=texts, labels=labels, tokenizer=tokenizer)
bert_dataloader = DataLoader(dataset=bert_dataset, shuffle=True, batch_size=16)

In [None]:
# Initialize dataset for BiLSTM
# The CSV column holds raw strings, but the BiLSTM's first layer is an
# nn.Embedding that needs integer token indices. Build a whitespace-token
# vocabulary and encode every text as a fixed-length row of indices.
PAD_INDEX = 0  # filler index used to right-pad short texts
UNK_INDEX = 1  # index for tokens that are not in the vocabulary
MAX_VOCAB_SIZE = 20000  # upper bound on vocabulary size, special tokens included
LSTM_MAX_LENGTH = 256  # tokens kept per text; longer texts are truncated

embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 2
bidirectional = True
dropout = 0.3

# str() guards against non-string cells (e.g. missing values read as floats).
lstm_tokens = [str(text).lower().split() for text in texts]
token_counts = Counter(token for tokens in lstm_tokens for token in tokens)

lstm_vocab = {'<pad>': PAD_INDEX, '<unk>': UNK_INDEX}
for token, _ in token_counts.most_common(MAX_VOCAB_SIZE - len(lstm_vocab)):
    # setdefault keeps the reserved indices if a text literally contains '<pad>' or '<unk>'.
    lstm_vocab.setdefault(token, len(lstm_vocab))

# Derived from the data so the embedding table matches the indices actually produced.
vocab_size = len(lstm_vocab)


def encode_for_lstm(tokens):
    """Map tokens to vocabulary indices, truncated/right-padded to LSTM_MAX_LENGTH."""
    ids = [lstm_vocab.get(token, UNK_INDEX) for token in tokens[:LSTM_MAX_LENGTH]]
    return ids + [PAD_INDEX] * (LSTM_MAX_LENGTH - len(ids))


# One row of indices per text; indexing a row yields the tensor the model consumes.
lstm_inputs = torch.tensor([encode_for_lstm(tokens) for tokens in lstm_tokens], dtype=torch.long)
lstm_dataset = SentimentDataset(lstm_inputs, labels)
lstm_dataloader = DataLoader(lstm_dataset, batch_size=16, shuffle=True)

In [None]:
# Build both classifiers, the shared cross-entropy loss, and one Adam optimizer per model
bert_cnn_model = BERTCNN(num_classes=2)
bilstm_model = BiLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
)
criterion = nn.CrossEntropyLoss()
# The BERT-based model uses a learning rate of 2e-5; the BiLSTM uses 2e-3.
bert_optimizer = torch.optim.Adam(params=bert_cnn_model.parameters(), lr=2e-5)
lstm_optimizer = torch.optim.Adam(params=bilstm_model.parameters(), lr=2e-3)

In [None]:
# Training loop for BERTCNN
bert_cnn_model.train()
for epoch in range(3):  # Number of epochs
    epoch_loss = 0.0
    num_batches = 0
    for batch in bert_dataloader:
        bert_optimizer.zero_grad()
        # Use batch-local names: rebinding the notebook-level `labels` here
        # would leave a single batch in it and break re-running earlier cells.
        batch_labels = batch['label']
        logits = bert_cnn_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        loss = criterion(logits, batch_labels)
        loss.backward()
        bert_optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    # Report the mean over the epoch rather than the last batch's loss;
    # max() keeps an empty dataloader from raising.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f'BERTCNN - Epoch {epoch + 1}, Loss: {mean_loss}')


In [None]:
# Training loop for BiLSTM
bilstm_model.train()
for epoch in range(3):  # Number of epochs
    epoch_loss = 0.0
    num_batches = 0
    for batch in lstm_dataloader:
        lstm_optimizer.zero_grad()
        # Use batch-local names: rebinding the notebook-level `texts`/`labels`
        # here would leave a single batch in them and break re-running earlier cells.
        token_ids = batch['text']
        batch_labels = batch['label']
        if not torch.is_tensor(token_ids):
            # The embedding layer needs integer indices; fail with a clear
            # message if the dataset still yields raw strings.
            raise TypeError(
                "BiLSTM expects batch['text'] to be a tensor of token indices; "
                "encode the raw texts before building lstm_dataset"
            )
        logits = bilstm_model(token_ids)
        loss = criterion(logits, batch_labels)
        loss.backward()
        lstm_optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    # Report the mean over the epoch rather than the last batch's loss;
    # max() keeps an empty dataloader from raising.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f'BiLSTM - Epoch {epoch + 1}, Loss: {mean_loss}')

In [None]:
# Save the trained models to .h5 format
def save_model_to_h5(model, filepath):
    """Write every entry of ``model.state_dict()`` to an HDF5 file.

    Each state-dict key becomes one dataset holding that tensor as a CPU
    NumPy array. The file at ``filepath`` is opened in ``'w'`` mode.

    Args:
        model: Object exposing ``state_dict()`` whose values are tensors.
        filepath: Destination path of the HDF5 file.
    """
    # Convert everything first, so the file is only opened once all arrays exist.
    arrays = {}
    for name, tensor in model.state_dict().items():
        arrays[name] = tensor.cpu().numpy()

    with h5py.File(filepath, 'w') as h5_file:
        for name in arrays:
            h5_file.create_dataset(name, data=arrays[name])

# Persist the parameters of both trained models, one HDF5 file each
save_model_to_h5(model=bert_cnn_model, filepath='bert_cnn_model.h5')
save_model_to_h5(model=bilstm_model, filepath='bilstm_model.h5')