## Logistic Regression: PyTorch Implementation

In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()

True

Model Definition

In [7]:
class LogisticRegression(nn.Module):
    def __init__(self, in_features: int, out_features: int) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias=True)
        
    def forward(self, x):
        return F.sigmoid(
            self.linear(x)
        )

Data Loading and preparation

In [8]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

ds = load_dataset("jniimi/tripadvisor-review-rating")
raw_data = pd.DataFrame(ds['train'])

text = 'review'
label = 'overall'

df = raw_data.drop(columns=['stay_year', 'post_date', 'freq', 'lang'])

# Drop the rows with missing data
df = df.dropna()

# Drop the duplicates
df = df.drop_duplicates()

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# To avoid too big training
df = df.sample(frac=0.1)

# Split the data into train and test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df[label], random_state=42)

train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df[label], random_state=42)

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)

X_train = vectorizer.fit_transform(train_df[[text]].reset_index(drop=True).review)
y_train = train_df[label]

X_test = vectorizer.transform(test_df[[text]].reset_index(drop=True).review)
y_test = test_df[label]



In [None]:
# Dataset class for sparse data from TfIdf
class SparseTensorDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return x.to_dense().float().to(device), y.to(device)

In [None]:
batch_size = 64

# Convert data to tensors.
X_train_tensor = torch.sparse_coo_tensor(X_train.nonzero(), X_train.data, X_train.shape)
y_train_tensor = torch.tensor(y_train.values - 1, dtype=torch.long)


X_test_tensor = torch.sparse_coo_tensor(X_test.nonzero(), X_test.data, X_test.shape)
y_test_tensor = torch.tensor(y_test.values - 1, dtype=torch.long)


train_dataset = SparseTensorDataset(X_train_tensor, y_train_tensor)
test_dataset = SparseTensorDataset(X_test_tensor, y_test_tensor)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
# Number of features from TfIdf
in_features = X_train.shape[1]

# Number of possible labels
out_features = len(y_train.unique())

model = LogisticRegression(in_features, out_features).to(device)

# Move model on device and compile it with TorchDynamo (optimizations could be useful since we have a large Dataset)
model = torch.compile(model.to(device))

# Define the loss, CrossEntropy since we have multi-labels
loss_fn = nn.CrossEntropyLoss()

lr = 0.001
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [None]:
def get_batch_accuracy(output, y, N):
    pred = output.argmax(dim=1, keepdim=True)
    correct = pred.eq(y.view_as(pred)).sum().item()
    return correct / N


def train(model, train_loader, batch_size, optimizer, loss_fn):
    loss = 0
    accuracy = 0

    model.train()
    for x, y in train_loader:
        output = model(x)
        optimizer.zero_grad()
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()

        loss += loss.item()
        accuracy += get_batch_accuracy(output, y, batch_size)
        
    print(f'Train: loss={loss / len(train_loader)}, accuracy={accuracy / len(train_loader)}')

In [46]:
def test(model, test_loader, batch_size, loss_fn):
    loss = 0
    accuracy = 0

    model.eval()
    with torch.no_grad():
        for x, y in test_loader:
            output = model(x)
            loss += loss_fn(output, y).item()
            accuracy += get_batch_accuracy(output, y, batch_size)
    print(f'Test: loss={loss / len(test_loader)}, accuracy={accuracy / len(test_loader)}')
    

In [None]:
import time
epochs = 10

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    s = time.time()
    train(model, train_loader, batch_size, optimizer, loss_fn)
    test(model, test_loader, batch_size, loss_fn)
    print(f'Epoch time : {time.time() - s}')

Epoch: 0
Train: loss=0.015737656503915787, accuracy=0.4118193069306931
Test: loss=1.5925064578888908, accuracy=0.43080357142857145
Epoch time : 26.566782236099243
Epoch: 1
Train: loss=0.01562720723450184, accuracy=0.429842202970297
Test: loss=1.5758961912185427, accuracy=0.43080357142857145
Epoch time : 24.795665979385376
Epoch: 2
Train: loss=0.015471803955733776, accuracy=0.429842202970297
Test: loss=1.5605392153300937, accuracy=0.43080357142857145
Epoch time : 25.54877257347107
Epoch: 3
Train: loss=0.01521945372223854, accuracy=0.429842202970297
Test: loss=1.5465559864801073, accuracy=0.43080357142857145
Epoch time : 11.686943292617798
Epoch: 4
Train: loss=0.01515111792832613, accuracy=0.429842202970297
Test: loss=1.5337300489819239, accuracy=0.43080357142857145
Epoch time : 8.083003759384155
Epoch: 5
Train: loss=0.01489906758069992, accuracy=0.429842202970297
Test: loss=1.5220676528082953, accuracy=0.43080357142857145
Epoch time : 8.102274656295776
Epoch: 6
Train: loss=0.01498496998

Not Really relevant

In [1]:
vect = vectorizer.transform(['Disgusting'])
vect_tensor = torch.sparse_coo_tensor(vect.nonzero(), vect.data, vect.shape).float()
vect_tensor = vect_tensor.to(device)

model.eval()
with torch.no_grad():
    output = model(vect_tensor)
    print(f'Output: {output}')
    print(f'Prediction: {output.argmax(dim=1, keepdim=True) + 1}')

NameError: name 'vectorizer' is not defined