In [4]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
import h5py

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Define the dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        if self.tokenizer:
            inputs = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'label': torch.tensor(label, dtype=torch.long)
            }
        else:
            return {
                'text': text,
                'label': torch.tensor(label, dtype=torch.long)
            }

In [6]:
# Define the BERT + CNN model
class BERTCNN(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_classes=5, kernel_sizes=[2, 3, 4], num_filters=100):
        super(BERTCNN, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, 768)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state.unsqueeze(1)  # [batch_size, 1, seq_len, 768]
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max_pool1d(c, c.size(2)).squeeze(2) for c in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

In [7]:
# Load data from CSV
df = pd.read_csv('./Reviews.csv')
df = df.head(20)
display(df)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [8]:
texts = df['Text'].values
labels = df['Score'].values - 1 

In [9]:
# Initialize tokenizer and dataset for BERTCNN
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_dataset = SentimentDataset(texts, labels, tokenizer)
bert_dataloader = DataLoader(bert_dataset, batch_size=16, shuffle=True)



In [10]:
# Initialize model, loss function, and optimizer
bert_cnn_model = BERTCNN(num_classes=5)
criterion = nn.CrossEntropyLoss()
bert_optimizer = torch.optim.Adam(bert_cnn_model.parameters(), lr=2e-5)


In [11]:
bert_cnn_model.train()
for epoch in range(3):  # Number of epochs
    for batch in bert_dataloader:
        bert_optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        outputs = bert_cnn_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        bert_optimizer.step()
    print(f'BERTCNN - Epoch {epoch + 1}, Loss: {loss.item()}')

BERTCNN - Epoch 1, Loss: 1.7157008647918701
BERTCNN - Epoch 2, Loss: 1.6694154739379883
BERTCNN - Epoch 3, Loss: 1.2561683654785156


In [14]:
# Save the trained model to .h5 format
def save_model_to_h5(model, filepath):
    model_params = {k: v.cpu().numpy() for k, v in model.state_dict().items()}
    with h5py.File(filepath, 'w') as f:
        for k, v in model_params.items():
            f.create_dataset(k, data=v)

In [13]:
save_model_to_h5(bert_cnn_model, 'bert_cnn_model.h5')