In [5]:
!pip install lightning



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
import nltk
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
import random

nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [w for w in tokens if not w in stop_words and w.isalnum()]
    return " ".join(filtered_tokens)

# Load and preprocess the NLTK movie_reviews dataset
def load_and_preprocess_data():
    documents = []
    labels = []

    # NLTK movie_reviews data is structured with individual file ids
    # Positive reviews have a 'pos' category, and negative reviews have a 'neg' category
    movie_list = movie_reviews.fileids()
    random.shuffle(movie_list)
    for file_id in movie_list:
        documents.append(preprocess_text(movie_reviews.raw(file_id)))
        # Assign labels: 0 for negative and 1 for positive
        labels.append(0 if file_id.split('/')[0] == 'neg' else 1)

    return documents, np.array(labels)

# Load and preprocess data
documents, labels = load_and_preprocess_data()

# Split the data into training and testing sets
split_ratio = 0.8
split_index = int(len(documents) * split_ratio)
train_texts, test_texts = documents[:split_index], documents[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Compute TF-IDF
vectorizer = TfidfVectorizer(max_features=20000)  # Adjust max_features to your dataset size
train_tfidf = vectorizer.fit_transform(train_texts)
test_tfidf = vectorizer.transform(test_texts)

# Convert to PyTorch tensors
train_tfidf_tensor = torch.from_numpy(train_tfidf.toarray()).float()
test_tfidf_tensor = torch.from_numpy(test_tfidf.toarray()).float()
train_labels_tensor = torch.from_numpy(train_labels).long()
test_labels_tensor = torch.from_numpy(test_labels).long()

# Create TensorDatasets and DataLoaders
batch_size = 16
train_dataset = TensorDataset(train_tfidf_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_tfidf_tensor, test_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/azagar/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/azagar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/azagar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import lightning as L

class LightningSimpleTextClassifier(L.LightningModule):
    def __init__(self, input_dim, hidden_size, num_classes, learning_rate=0.01):
        super(LightningSimpleTextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.learning_rate = learning_rate

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, batch, batch_idx):
        features, labels = batch
        outputs = self(features)
        loss = self.criterion(outputs, labels)
        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        features, labels = batch
        outputs = self(features)
        loss = self.criterion(outputs, labels)

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        correct = predicted.eq(labels).sum().item()
        accuracy = correct / labels.size(0)

        # Log test loss and test accuracy
        self.log('test_loss', loss)
        self.log('test_accuracy', accuracy)

        return {'test_loss': loss, 'test_accuracy': accuracy}


# Hyperparameters
input_dim = train_tfidf_tensor.shape[1]  # Number of features in TF-IDF vectors
hidden_size = 1500
num_classes = 2

In [3]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

# Initialize the Lightning model
model = LightningSimpleTextClassifier(input_dim, hidden_size, num_classes, learning_rate=0.01)

# Instantiate built-in callbacks (optional)
checkpoint_callback = ModelCheckpoint(dirpath='checkpoints/', save_top_k=1, verbose=True, monitor='train_loss', mode='min')
early_stopping_callback = EarlyStopping(monitor='train_loss', patience=3)

# Trainer
trainer = L.Trainer(max_epochs=3, callbacks=[checkpoint_callback, early_stopping_callback])
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 30.0 M | train
1 | relu      | ReLU             | 0      | train
2 | fc2       | Linear           | 3.0 K  | train
3 | dropout   | Dropout          | 0      | train
4 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
30.0 M    Trainable params
0         Non-trainable params
30.0 M    Total params
120.018   Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
/home/azagar/miniconda3/envs/py3.11/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` 

Epoch 0: 100%|██████████| 100/100 [00:12<00:00,  7.75it/s, v_num=0]

Epoch 0, global step 100: 'train_loss' reached 0.38780 (best 0.38780), saving model to '/home/azagar/projects/NLP-Course-Tutorials/08 - Neural networks examples and hardware/Neural networks with PyTorch/checkpoints/epoch=0-step=100.ckpt' as top 1


Epoch 1: 100%|██████████| 100/100 [00:13<00:00,  7.47it/s, v_num=0]

Epoch 1, global step 200: 'train_loss' reached 0.07684 (best 0.07684), saving model to '/home/azagar/projects/NLP-Course-Tutorials/08 - Neural networks examples and hardware/Neural networks with PyTorch/checkpoints/epoch=1-step=200.ckpt' as top 1


Epoch 2: 100%|██████████| 100/100 [00:12<00:00,  7.89it/s, v_num=0]

Epoch 2, global step 300: 'train_loss' reached 0.00009 (best 0.00009), saving model to '/home/azagar/projects/NLP-Course-Tutorials/08 - Neural networks examples and hardware/Neural networks with PyTorch/checkpoints/epoch=2-step=300.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 100/100 [00:13<00:00,  7.61it/s, v_num=0]


In [4]:
model.eval()
trainer.test(model, test_loader)

/home/azagar/miniconda3/envs/py3.11/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/home/azagar/miniconda3/envs/py3.11/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 25/25 [00:00<00:00, 73.55it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.8399999737739563
        test_loss            0.612557053565979
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.612557053565979, 'test_accuracy': 0.8399999737739563}]