In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

# Model definition
class LogisticRegressionModel(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, input_dim):
        """Create the single linear layer mapping ``input_dim`` features to one output."""
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [62]:
import pandas as pd

# Directory holding the tab-separated product and review shards (numbered 1 to 3)
data_path = "./dataset"

# Product shards: headerless TSVs that all share one column layout
products_data_list = []
for i in range(1, 4):
    products_data_list.append(
        pd.read_csv(
            f"{data_path}/products-data-{i}.tsv",
            sep="\t",
            header=None,
            names=["id", "category", "product_title"],
        )
    )
products_data = pd.concat(products_data_list, ignore_index=True)

# Review shards: shard 2 is read with rating before id; every other shard
# is read with id first. Naming the columns per shard lets concat align them.
review_column_layouts = {2: ["rating", "id", "review_text"]}
default_review_columns = ["id", "rating", "review_text"]
reviews_data_list = [
    pd.read_csv(
        f"{data_path}/reviews-{i}.tsv",
        sep="\t",
        header=None,
        names=review_column_layouts.get(i, default_review_columns),
    )
    for i in range(1, 4)
]
reviews_data = pd.concat(reviews_data_list, ignore_index=True)

In [72]:
# Attach each review to its product row by id; validate= makes pandas raise
# if a product id appears more than once on the products side.
merged_data = products_data.merge(
    reviews_data,
    how="inner",
    on="id",
    validate="one_to_many",
)

In [93]:
from sklearn.feature_extraction.text import CountVectorizer

def encode(text_data):
    """Convert raw text documents into a dense bag-of-words count matrix.

    A new ``CountVectorizer`` with default settings is fitted on ``text_data``
    itself, so the vocabulary (and therefore the number of columns) is
    determined entirely by the documents passed in to this call.

    Args:
        text_data: Iterable of strings, one document per item.

    Returns:
        The dense array produced by ``.toarray()`` on the vectorizer output:
        one row per document, one column per vocabulary term.
    """
    vectorizer = CountVectorizer()
    # fit_transform learns the vocabulary and counts terms in a single pass,
    # so the input is consumed only once (a separate fit + transform would
    # exhaust a one-shot iterable during fit and then transform nothing).
    return vectorizer.fit_transform(text_data).toarray()

In [128]:
# Bag-of-words count vectors for the two free-text columns, stored as one
# plain Python list per row. Each column gets its own separately fitted vocabulary.
for source_column, target_column in (
    ('review_text', 'encoded_review_text'),
    ('product_title', 'encoded_product_title'),
):
    merged_data[target_column] = encode(merged_data[source_column].to_list()).tolist()

# Encode category as 0/1: 0 for Kitchen, 1 for anything else
# (Jewelry, according to the original note).
merged_data['encoded_category'] = merged_data['category'].ne('Kitchen').astype('int64')

In [135]:
# Per-row feature vector: review-text counts, then product-title counts,
# with the rating appended as the last entry.
merged_data['input_features'] = [
    review_counts + title_counts + [rating]
    for review_counts, title_counts, rating in zip(
        merged_data['encoded_review_text'],
        merged_data['encoded_product_title'],
        merged_data['rating'],
    )
]

In [136]:
# dataset split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.
feature_frame = merged_data[['input_features']]
category_labels = merged_data['encoded_category']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, category_labels, test_size=0.2, random_state=42
)

In [138]:
# Convert the split outputs to float32 tensors.
# NOTE: the names are rebound from pandas objects to tensors, so this cell
# must be re-run only after re-running the split cell.
X_train, X_test = (
    torch.tensor(feature_frame['input_features'].to_list(), dtype=torch.float32)
    for feature_frame in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(label_series.to_list(), dtype=torch.float32)
    for label_series in (y_train, y_test)
)

In [139]:
# train
from torch.utils.data import TensorDataset, DataLoader

# Pair each training feature row with its label, then serve the pairs in
# shuffled mini-batches of 32.
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=32,
)

In [169]:
from tqdm import tqdm

# Seed PyTorch's global RNG so that re-running this cell reproduces the same
# weight initialisation and the same batch shuffle order (train_dataloader is
# built without its own generator, so its shuffling draws from the global RNG).
# 42 matches the random_state used for the train/test split.
SEED = 42
torch.manual_seed(SEED)

# Instantiate the model; the input width is the number of feature columns
model = LogisticRegressionModel(input_dim=X_train.size(1))

# Loss and optimizer: binary cross-entropy on the model's sigmoid outputs,
# minimised with plain SGD
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Train the model
total_step = len(train_dataloader)  # batches per epoch; unused here, kept for any cell that reads it
num_epochs = 1000
EVAL_EVERY = 30  # print test accuracy once every this many epochs
for epoch in range(num_epochs):
    for features, labels in train_dataloader:
        # Forward pass; labels are 1-D per batch, so add a trailing axis to
        # match the model's (batch, 1) output
        outputs = model(features)
        loss = criterion(outputs, labels.unsqueeze(1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch + 1) % EVAL_EVERY == 0:
        # Test the model (no gradients needed for evaluation)
        with torch.no_grad():
            outputs = model(X_test)
            # Round probabilities to hard 0/1 predictions and drop the
            # trailing axis so they compare element-wise with y_test
            predicted = outputs.round().squeeze(1)
            correct = (predicted == y_test).sum().item()
            print(f'Accuracy of the model on the test set: {correct / y_test.size(0) * 100:.2f}%')

Accuracy of the model on the test set: 62.15%
Accuracy of the model on the test set: 63.35%
Accuracy of the model on the test set: 65.34%
Accuracy of the model on the test set: 66.14%
Accuracy of the model on the test set: 68.13%
Accuracy of the model on the test set: 68.92%
Accuracy of the model on the test set: 71.31%
Accuracy of the model on the test set: 72.91%
Accuracy of the model on the test set: 73.71%
Accuracy of the model on the test set: 75.70%
Accuracy of the model on the test set: 76.49%
Accuracy of the model on the test set: 77.69%
Accuracy of the model on the test set: 78.88%
Accuracy of the model on the test set: 80.48%
Accuracy of the model on the test set: 79.68%
Accuracy of the model on the test set: 80.48%
Accuracy of the model on the test set: 80.88%
Accuracy of the model on the test set: 80.88%
Accuracy of the model on the test set: 81.27%
Accuracy of the model on the test set: 82.07%
Accuracy of the model on the test set: 82.07%
Accuracy of the model on the test 