## Logistic Regression: PyTorch Implementation

In [31]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()

True

Model Definition

In [2]:
class LogisticRegression(nn.Module):
    def __init__(self, in_features: int, out_features: int) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias=True)
        
    def forward(self, x):
        return F.sigmoid(
            self.linear(x)
        )

Data Loading and preparation

In [None]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

ds = load_dataset("jniimi/tripadvisor-review-rating")
raw_data = pd.DataFrame(ds['train'])

text = 'review'
label = 'overall'

df = raw_data.drop(columns=['stay_year', 'post_date', 'freq', 'lang'])

# Drop the rows with missing data
df = df.dropna()

# Drop the duplicates
df = df.drop_duplicates()

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Split the data into train and test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df[label], random_state=42)

train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=df[label], random_state=42)

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)

X_train = vectorizer.fit_transform(train_df[[text]].reset_index(drop=True).review)
y_train = train_df[label]

X_test = vectorizer.transform(test_df[[text]].reset_index(drop=True).review)
y_test = test_df[label]



In [None]:
# Dataset class for sparse data from TfIdf
# class SparseDataset(Dataset):
#     def __init__(self, X, y):
#         self.X = X
#         self.y = torch.tensor(y.values, dtype=torch.long)

#     def __len__(self):
#         return self.y.size(0)

#     def __getitem__(self, idx):
#         x = self.X[idx]
#         return torch.tensor(x.toarray(), dtype=torch.float32).squeeze(0), self.y[idx]
    
class SparseTensorDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = self.X[idx].double()
        y = self.y[idx].double()
        return x.to(device), y.to(device)
    
def sparse_collate_fn(batch):
    return sparse_tensor, labels


NameError: name 's' is not defined

In [61]:
batch_size = 32

# Convert data to tensors.
# X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

# X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

X_train_tensor = torch.sparse_coo_tensor(X_train.nonzero(), X_train.data, X_train.shape)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)


X_test_tensor = torch.sparse_coo_tensor(X_test.nonzero(), X_test.data, X_test.shape)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)


train_dataset = SparseTensorDataset(X_train_tensor, y_train_tensor)
test_dataset = SparseTensorDataset(X_test_tensor, y_test_tensor)



# Convert data to Dataset.
# train_dataset = SparseDataset(X_train, y_train)
# test_dataset = SparseDataset(X_test, y_test)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=sparse_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=sparse_collate_fn)

In [62]:
# Number of features from TfIdf
in_features = X_train.shape[1]

# Number of possible labels
out_features = len(y_train.unique())

model = LogisticRegression(in_features, out_features)

# Move model on device and compile it with TorchDynamo (optimizations could be useful since we have a large Dataset)
model = torch.compile(model.to(device))

# Define the loss, CrossEntropy since we have multi-labels
loss_fn = nn.CrossEntropyLoss()

lr = 0.001
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [63]:
def get_batch_accuracy(output, y, N):
    pred = output.argmax(dim=1, keepdim=True)
    correct = pred.eq(y.view_as(pred)).sum().item()
    return correct / N


def train(model, train_loader, batch_size, optimizer, loss_fn):
    loss = 0
    accuracy = 0

    model.train()
    for x, y in train_loader:
        output = model(x)
        optimizer.zero_grad()
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()

        loss += loss.item()
        accuracy += get_batch_accuracy(output, y, batch_size)
    print(f'Train: loss={loss}, accuracy={accuracy}')

In [64]:
def test(model, test_loader, batch_size, loss_fn):
    loss = 0
    accuracy = 0

    model.eval()
    with torch.no_grad():
        for x, y in test_loader:
            output = model(x)
            loss += loss_fn(output, y).item()
            accuracy += get_batch_accuracy(output, y, batch_size)
    print(f'Test: loss={loss}, accuracy={accuracy}')

In [65]:
epochs = 10

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    train(model, train_loader, batch_size, optimizer, loss_fn)
    test(model, test_loader, batch_size, loss_fn)

Epoch: 0


RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated