In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import emoji
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification,
    AutoTokenizer, 
    AutoModelForCausalLM,
    AdamW,
    get_linear_schedule_with_warmup
)
from tqdm import tqdm 
import warnings
warnings.filterwarnings('ignore')

In [2]:
llama3_path = "C:/Users/denik/Llama-3.2-1B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
df = pd.read_csv('IMDBDataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [5]:
def preprocess_text(text):
    text = emoji.demojize(text)
    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [6]:
df['review'] = df['review'].apply(preprocess_text)

In [7]:
class TDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=0.3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model = model.to(device)

In [10]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['review'].values, 
    df['sentiment'].values,
    train_size=0.8,
    random_state=42
)

In [11]:
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    train_size=0.5,
    random_state=42
)

In [12]:
train_dataset = TDataset(train_texts, train_labels, tokenizer)
val_dataset = TDataset(val_texts, val_labels, tokenizer)
test_dataset = TDataset(test_texts, test_labels, tokenizer)

In [13]:
batch_size = 32

In [14]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [15]:
def train_epoch(model, train_loader, val_loader, optimizer, scheduler):
    model.train()
    total_train_loss = 0
    train_accuracy = 0
    
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()
        
        preds = torch.argmax(outputs.logits, dim=1)
        train_accuracy += (preds == labels).sum().item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = train_accuracy / len(train_loader.dataset)

    model.eval()
    total_val_loss = 0
    val_accuracy = 0
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            total_val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            val_accuracy += (preds == labels).sum().item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = val_accuracy / len(val_loader.dataset)

    return avg_train_loss, train_accuracy, avg_val_loss, val_accuracy

In [16]:
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3 
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, 
    num_training_steps=total_steps
)

for epoch in range(3):
    avg_train_loss, train_accuracy, avg_val_loss, val_accuracy = train_epoch(
        model, train_loader, val_loader, optimizer, scheduler)
    
    print(f'Epoch {epoch + 1}:')
    print(f'Train loss: {avg_train_loss:.4f}')
    print(f'Train accuracy: {train_accuracy:.4f}')
    print(f'Validation loss: {avg_val_loss:.4f}')
    print(f'Validation accuracy: {val_accuracy:.4f}')

100%|██████████████████████████████████████████████████████████████████████████████| 1250/1250 [14:51<00:00,  1.40it/s]


Epoch 1:
Train loss: 0.3469
Train accuracy: 0.8454
Validation loss: 0.2635
Validation accuracy: 0.8900


100%|██████████████████████████████████████████████████████████████████████████████| 1250/1250 [14:30<00:00,  1.44it/s]


Epoch 2:
Train loss: 0.2534
Train accuracy: 0.8969
Validation loss: 0.2733
Validation accuracy: 0.8932


100%|██████████████████████████████████████████████████████████████████████████████| 1250/1250 [14:31<00:00,  1.43it/s]


Epoch 3:
Train loss: 0.2106
Train accuracy: 0.9177
Validation loss: 0.2805
Validation accuracy: 0.8966


In [17]:
tokenizer_llama = AutoTokenizer.from_pretrained(llama3_path)
model_llama = AutoModelForCausalLM.from_pretrained(llama3_path)

In [18]:
model_llama.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): Lla

In [19]:
def predict_sentiment(text, model, tokenizer):
    model.eval()
    preprocessed_text = preprocess_text(text)
    
    encoding = tokenizer.encode_plus(
        preprocessed_text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][prediction].item()

    return prediction, confidence

In [20]:
def generate_response(text, sentiment, model, tokenizer):
    sentiment_prefix = "positive" if sentiment == 1 else "negative"
    prompt = f"Given a {sentiment_prefix} review: '{text}'\nGenerate a suitable response:"
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    output = model.generate(
        input_ids,
        max_length=100,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.9
    )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.replace(prompt, "").strip()

In [21]:
sample_text = "This movie was absolutely amazing!."

In [22]:
sentiment, confidence = predict_sentiment(sample_text, model, tokenizer)
response = generate_response(sample_text, sentiment, model_llama, tokenizer_llama)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [23]:
print(f'Sample text: {sample_text}')
print(f'Predicted: {"positive" if sentiment == 1 else "negative"}')

Sample text: This movie was absolutely amazing!.
Predicted: positive


In [24]:
print(f'Confidence: {confidence:.4f}')

Confidence: 0.9981


In [25]:
print(f'Response: {response}')

Response: The movie 'The King of Comedy' is an absolute classic, full of memorable performances and hilarious moments.
Generate an appropriate response:'This was an unforgettable film. The director has captured the essence of the story, and the actors have delivered their performances to perfection. Highly recommended for anyone who loves a good comedy.'
'King of comedy is a classic movie, the performances are brilliant, especially in the lead role
