In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
from huggingface_hub import login

login("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")

In [3]:
# tokenizer (embedding process)
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [4]:
def group_description(df):
    df['description'] = df['title'] + " " + df['author'] + " " + df['description']
    return df

In [5]:
import string

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    return ' '.join(words)

In [6]:
# data loading
train_df = pd.read_csv("/kaggle/input/books-dataset/train.csv")
val_df = pd.read_csv("/kaggle/input/books-dataset/val.csv")
test_df = pd.read_csv("/kaggle/input/books-dataset/test.csv")

train_df = group_description(train_df)
val_df = group_description(val_df)
test_df = group_description(test_df)

train_df['description'] = train_df['description'].apply(preprocess_text)
val_df['description'] = val_df['description'].apply(preprocess_text)
test_df['description'] = val_df['description'].apply(preprocess_text)

# train_df = train_df[:10]
# test_df = test_df[:10]
# val_df = val_df[:10]

In [7]:
train_df['genres'] = train_df['genres'].apply(lambda x: x.split(",") if isinstance(x, str) else x)
test_df['genres'] = test_df['genres'].apply(lambda x: x.split(",") if isinstance(x, str) else x)
val_df['genres'] = val_df['genres'].apply(lambda x: x.split(",") if isinstance(x, str) else x)

In [8]:
train_df

Unnamed: 0,title,author,description,genres
0,Tipping the Velvet,Sarah Waters,tipping the velvet sarah waters nan king an oy...,"[fiction, romance]"
1,The Thin Man,Dashiell Hammett,the thin man dashiell hammett nick and nora ch...,[fiction]
2,Saving Steele,Anne Jolin (Goodreads Author),saving steele anne jolin goodreads author kenn...,"[romance, contemporary]"
3,The Winthrop Woman,"Anya Seton, Philippa Gregory (Goodreads Author...",the winthrop woman anya seton philippa gregory...,"[fiction, romance]"
4,Operation Redwood,S. Terrell French,operation redwood s terrell french sibley cart...,"[fiction, young adult, contemporary]"
...,...,...,...,...
21922,The Philadelphia Story,Philip Barry,the philadelphia story philip barry comedy 9m ...,"[fiction, romance]"
21923,Surfacing,Margaret Atwood (Goodreads Author),surfacing margaret atwood goodreads author par...,"[fiction, contemporary]"
21924,A Clan in Need,"Erin Hunter, Dan Jolley (Goodreads Author), Ja...",a clan in need erin hunter dan jolley goodread...,"[fantasy, fiction]"
21925,Beastly,Alex Flinn (Goodreads Author),beastly alex flinn goodreads author i am a bea...,"[young adult, fantasy, romance, fiction]"


In [9]:
from collections import Counter
from sklearn.utils import resample

tag_counts = Counter([tag for genres in train_df['genres'] for tag in genres])
print("Original Tag Counts:", tag_counts)

Original Tag Counts: Counter({'fiction': 18738, 'romance': 9209, 'fantasy': 8979, 'young adult': 7091, 'contemporary': 6331})


In [10]:
min_samples_per_tag = 18000  # Adjust based on your dataset and requirements

def balance_dataset(df, min_samples, tag_column='genres'):
    balanced_data = []
    
    for tag, count in tag_counts.items():
        tag_rows = df[df[tag_column].apply(lambda genres: tag in genres)]
        
        if count < min_samples:
            oversampled = resample(tag_rows, replace=True, n_samples=min_samples, random_state=42)
            balanced_data.append(oversampled)
        else:
            balanced_data.append(tag_rows)

    combined_df = pd.concat(balanced_data)
    
    return combined_df

train_df = balance_dataset(train_df, min_samples_per_tag)

In [11]:
from collections import Counter
from sklearn.utils import resample

tag_counts = Counter([tag for genres in train_df['genres'] for tag in genres])
print("Original Tag Counts:", tag_counts)

Original Tag Counts: Counter({'fiction': 76817, 'romance': 52386, 'young adult': 45131, 'fantasy': 44900, 'contemporary': 37349})


In [12]:
# train_df = train_df[:10]
# test_df = test_df[:10]
# val_df = val_df[:10]

In [13]:
# embedding data
train_encoded_data = [tokenizer.encode(text, add_special_tokens=True) for text in tqdm(train_df['description'])]
val_encoded_data = [tokenizer.encode(text, add_special_tokens=True) for text in val_df['description']]
test_encoded_data = [tokenizer.encode(text, add_special_tokens=True) for text in test_df['description']]
max_sequence_length = max(max(max(len(seq) for seq in train_encoded_data), max(len(seq) for seq in test_encoded_data)), max(len(seq) for seq in val_encoded_data))
max_words = len(tokenizer.get_vocab())

100%|██████████| 90738/90738 [00:48<00:00, 1857.53it/s]


In [14]:
# add pad (add pad value to make all the sentences to the same length)
train_padded_sequences = torch.tensor([seq + [0] * (max_sequence_length - len(seq)) for seq in tqdm(train_encoded_data)], dtype=torch.long)
test_padded_sequences = torch.tensor([seq + [0] * (max_sequence_length - len(seq)) for seq in test_encoded_data], dtype=torch.long)
val_padded_sequences = torch.tensor([seq + [0] * (max_sequence_length - len(seq)) for seq in val_encoded_data], dtype=torch.long)

100%|██████████| 90738/90738 [00:05<00:00, 15568.85it/s]


In [15]:
# binarize label (1 if existed, 0 if not existed)
mlb = MultiLabelBinarizer()
train_binary_labels = mlb.fit_transform(train_df['genres'])
test_binary_labels = mlb.transform(test_df['genres'])
val_binary_labels = mlb.transform(val_df['genres'])

In [16]:
# put data into a Dataset class for loading and training
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [17]:
train_dataset = TextDataset(train_padded_sequences, train_binary_labels)
val_dataset = TextDataset(val_padded_sequences, val_binary_labels)
test_dataset = TextDataset(test_padded_sequences, test_binary_labels)

# split data into each batch (each batch has 32 rows) with dataloader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [18]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, output_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc1(out[:, -1, :])
        out = torch.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out

In [19]:
# initialize model
hidden_size = 64
output_size = len(mlb.classes_)

model = RNNModel(max_words, hidden_size, output_size)
print(max_words, hidden_size, output_size)

128256 64 5


In [20]:
# using Binary Cross-Entropy loss and Adam optimier
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
from tqdm import tqdm

In [22]:
epochs = 3
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]"):
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    
    # validate
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for texts, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Validation]"):
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()
    
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

Epoch 1/3 [Training]: 100%|██████████| 2836/2836 [28:22<00:00,  1.67it/s]
Epoch 1/3 [Validation]: 100%|██████████| 229/229 [00:21<00:00, 10.66it/s]


Epoch 1/3, Training Loss: 0.6385, Validation Loss: 0.6322


Epoch 2/3 [Training]: 100%|██████████| 2836/2836 [31:25<00:00,  1.50it/s]
Epoch 2/3 [Validation]: 100%|██████████| 229/229 [00:21<00:00, 10.74it/s]


Epoch 2/3, Training Loss: 0.6360, Validation Loss: 0.6366


Epoch 3/3 [Training]: 100%|██████████| 2836/2836 [47:13<00:00,  1.00it/s]
Epoch 3/3 [Validation]: 100%|██████████| 229/229 [00:21<00:00, 10.68it/s]

Epoch 3/3, Training Loss: 0.6353, Validation Loss: 0.6360





In [23]:
torch.save(model.state_dict(), "RNN_baseline.pth")

In [24]:
# threshold tuning using grid search
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

thresholds = np.arange(0.5, 0.91, 0.05)
best_threshold = None
best_accuracy = 0
best_f1 = 0

all_predictions_raw = []
all_ground_truths = []

with torch.no_grad():
    for texts, labels in tqdm(test_loader):
        outputs = model(texts)
        probabilities = torch.sigmoid(outputs)
        
        # get raw probabilities and ground truths
        all_predictions_raw.extend(probabilities.tolist())
        all_ground_truths.extend(labels.tolist())

all_predictions_raw = torch.tensor(all_predictions_raw)
all_ground_truths = torch.tensor(all_ground_truths)

# initialize variables to track the best metrics
best_metrics = {
    "threshold": None,
    "accuracy": 0,
    "f1": 0,
    "precision": 0,
    "recall": 0
}

for threshold in thresholds:
    predictions = (all_predictions_raw > threshold).int()

    correct = (predictions == all_ground_truths).all(dim=1).sum().item()
    total = all_ground_truths.size(0)
    accuracy = correct / total

    precision = precision_score(all_ground_truths.cpu().numpy(), predictions.cpu().numpy(), average="samples", zero_division=0)
    recall = recall_score(all_ground_truths.cpu().numpy(), predictions.cpu().numpy(), average="samples", zero_division=0)
    f1 = f1_score(all_ground_truths.cpu().numpy(), predictions.cpu().numpy(), average="samples", zero_division=0)

    print(f"Threshold: {threshold:.2f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    if f1 > best_metrics["f1"]:
        best_metrics.update({
            "threshold": threshold,
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall
        })

print(f"Best Threshold: {best_metrics['threshold']:.2f}, Accuracy: {best_metrics['accuracy']:.4f}, Precision: {best_metrics['precision']:.4f}, Recall: {best_metrics['recall']:.4f}, F1: {best_metrics['f1']:.4f}")

100%|██████████| 229/229 [00:20<00:00, 11.08it/s]


Threshold: 0.50, Accuracy: 0.0057, Precision: 0.4550, Recall: 1.0000, F1: 0.6005
Threshold: 0.55, Accuracy: 0.0057, Precision: 0.4550, Recall: 1.0000, F1: 0.6005
Threshold: 0.60, Accuracy: 0.0057, Precision: 0.4550, Recall: 1.0000, F1: 0.6005
Threshold: 0.65, Accuracy: 0.2054, Precision: 0.8514, Recall: 0.4582, F1: 0.5638
Threshold: 0.70, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.75, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.80, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.85, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.90, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Best Threshold: 0.50, Accuracy: 0.0057, Precision: 0.4550, Recall: 1.0000, F1: 0.6005


In [25]:
# get mismatch samples for error analysis
all_predictions_raw = []
all_ground_truths = []
all_texts = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        probabilities = torch.sigmoid(outputs)
        
        # get test's features and score
        all_predictions_raw.extend(probabilities.tolist())
        all_ground_truths.extend(labels.tolist())
        all_texts.extend(texts.tolist())

all_predictions_raw = torch.tensor(all_predictions_raw)
all_ground_truths = torch.tensor(all_ground_truths)

# apply best threshold
best_threshold = best_metrics['threshold']
predictions = (all_predictions_raw > best_threshold).int()

# get mismatch samples
mismatched_indices = (predictions != all_ground_truths).any(dim=1).nonzero(as_tuple=True)[0]

mismatch_data = []

for index in tqdm(mismatched_indices):
    index = index.item()

    # decode the tokenized input text
    decoded_text = tokenizer.decode(all_texts[index], skip_special_tokens=True)
    
    # map binary labels back to tag names
    true_tags = mlb.inverse_transform(np.array([all_ground_truths[index]]))[0]
    predicted_tags = mlb.inverse_transform(np.array([predictions[index]]))[0]
    
    # format the label description
    label_description = f"Ground Truth: {', '.join(true_tags)} | Predicted: {', '.join(predicted_tags)}"
    
    # append to the list
    mismatch_data.append({
        "description": decoded_text,
        "label": label_description
    })

# convert to DataFrame
mismatch_df = pd.DataFrame(mismatch_data)

mismatch_df

100%|██████████| 7267/7267 [02:56<00:00, 41.15it/s]


Unnamed: 0,description,label
0,the king of torts john grisham goodreads autho...,"Ground Truth: fantasy, fiction, romance, young..."
1,the apprentices quest erin hunter erin hunters...,"Ground Truth: fantasy, fiction | Predicted: co..."
2,the stolen moon of londor ap stephens goodread...,Ground Truth: fiction | Predicted: contemporar...
3,the adventure of the golden pincenez arthur co...,"Ground Truth: contemporary, fiction, romance, ..."
4,defy sara b larson goodreads author alexa holl...,"Ground Truth: fantasy, fiction, romance | Pred..."
...,...,...
7262,a mangoshaped space wendy mass goodreads autho...,Ground Truth: fiction | Predicted: contemporar...
7263,my brother jack george johnston through the st...,"Ground Truth: fantasy, romance | Predicted: co..."
7264,easy kisses kristen progoodreads author the bo...,Ground Truth: fiction | Predicted: contemporar...
7265,el quinto dragn paulina aguilar gutirrez goodr...,"Ground Truth: fiction, romance | Predicted: co..."


In [26]:
mismatch_df.to_csv("mismatch.csv")