In [15]:
import pandas as pd
import ast
from datasets import Dataset, Features, Sequence, Value
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import f1_score

# ==================================================
# 1. Load CSV and Preprocess Input Data
# ==================================================
# Assumes your CSV file has columns: article_id, review, all_topics, selected_topics
csv_file = "csv/GroundTruthProdArea10k.csv"  # Change this to the path of your CSV file.
df = pd.read_csv(csv_file)

# The list-valued columns are read from the CSV as strings; parse each one back
# into a real Python list. ast.literal_eval only evaluates literals, so it does
# not execute arbitrary code from the file.
df["all_topics"] = df["all_topics"].apply(ast.literal_eval)
df["selected_topics"] = df["selected_topics"].apply(ast.literal_eval)

# --------------------------------------------------
# Build a Global Vocabulary of Candidate Topics
# --------------------------------------------------
# Union every row's candidate list into one set. Starting from an empty set
# means a CSV with no rows yields an empty vocabulary instead of raising.
# Sorting fixes the position of each topic in the label vectors built later.
global_vocab = sorted(set().union(*df["all_topics"]))
print("Global Vocabulary:", global_vocab)

# The vocabulary is derived from the candidate lists only. A topic that is
# selected but never offered as a candidate has no slot in the label vector and
# would be dropped silently, so report any such topics up front.
unlisted_topics = sorted(set().union(*df["selected_topics"]) - set(global_vocab))
if unlisted_topics:
    print("Warning: selected topics missing from the vocabulary:", unlisted_topics)

# --------------------------------------------------
# Create a Binary Label Vector for Each Example
# --------------------------------------------------
def create_label_vector(selected_topics, vocab):
    """Encode the selected topics as a multi-hot vector aligned with ``vocab``.

    Args:
        selected_topics: Iterable of hashable topic names chosen for one example.
        vocab: Ordered sequence of all topic names; position ``i`` of the result
            corresponds to ``vocab[i]``.

    Returns:
        A list of ``len(vocab)`` ints: 1 where the topic was selected, else 0.
        Selected topics that do not appear in ``vocab`` are ignored.
    """
    # Materialize once: O(1) membership tests per vocab entry, and one-shot
    # iterators are not consumed by repeated ``in`` scans.
    selected = set(selected_topics)
    return [1 if topic in selected else 0 for topic in vocab]

# Attach one multi-hot label vector per row, aligned with the global vocabulary.
df["labels"] = df["selected_topics"].map(
    lambda chosen: create_label_vector(chosen, global_vocab)
)

# --------------------------------------------------
# Create the Input Text With Candidate Topics
# --------------------------------------------------
def create_input_text(row):
    """Build the model input string for one row.

    Reads ``row["all_topics"]`` (an iterable of strings) and ``row["review"]``,
    and returns the candidate topics joined by ", " followed by the review text.
    """
    topic_list = ", ".join(row["all_topics"])
    return "Candidate topics: {}. Review: {}".format(topic_list, row["review"])

# Build the prompt-style input string for every row.
df["input_text"] = df.apply(create_input_text, axis=1)

# --------------------------------------------------
# Select Only the Columns Needed for Model Training
# --------------------------------------------------
df_processed = df[["input_text", "labels"]]

# ==================================================
# 2. Convert the DataFrame to a Hugging Face Dataset
# ==================================================
dataset = Dataset.from_pandas(df_processed)

# Hold out 10% of the examples for evaluation (a 90/10 train/test split).
# The fixed seed makes the split, and therefore the reported metrics,
# reproducible from run to run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# ==================================================
# 3. Tokenize the Data for the Model
# ==================================================
# Checkpoint identifier. The same name is used below to load the tokenizer and
# the classification model, and it is embedded in the training output directory.
model_name = "bert-base-uncased"  # Change to "roberta-base" or "microsoft/deberta-base" if desired.
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize ``example["input_text"]`` with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    limit of 256. The script maps this function with ``batched=True``, so
    ``example`` is a batch of rows rather than a single row.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(example["input_text"], **tokenizer_options)

# Tokenize both splits in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
).remove_columns(["input_text"])

# ==================================================
# 4. Convert the "labels" Column to Floats
# ==================================================
# Force each "labels" list to have float values.
def convert_labels(example):
    """Replace ``example["labels"]`` with a list of floats, in place.

    The same ``example`` mapping is returned after the update.
    """
    example["labels"] = list(map(float, example["labels"]))
    return example

# Convert every label value to a Python float, row by row.
tokenized_datasets = tokenized_datasets.map(convert_labels)

# Now, we explicitly cast the "labels" column to be a Sequence of float32.
# The intent is to keep the labels floating-point all the way into the model
# (the original author notes the data collator would otherwise turn them back
# into ints) — NOTE(review): confirm against the installed library versions.
# Start from a copy of the train split's schema so only "labels" is changed.
new_features = tokenized_datasets["train"].features.copy()
new_features["labels"] = Sequence(feature=Value("float32"))
tokenized_datasets = tokenized_datasets.cast(new_features)

# Set the dataset format to PyTorch tensors.
# Only the three columns listed here are requested; any other column produced
# by the tokenizer is not part of the formatted output.
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ==================================================
# 5. Define a Metrics Function for Evaluation
# ==================================================
def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for multi-label predictions.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` holds the raw model
            scores and ``labels`` the 0/1 targets, with matching shapes.

    Returns:
        A dict with a single key ``"f1"`` holding the micro-averaged F1 score.
    """
    logits, labels = eval_pred
    # sigmoid(x) > 0.5 exactly when x > 0, so threshold the logits directly.
    # This gives the same decisions as applying a sigmoid first, without the
    # np.exp overflow that large-magnitude negative logits would trigger.
    predictions = (np.asarray(logits) > 0).astype(int)
    # Targets are 0/1 values that may arrive as floats; compare them as ints
    # so both arrays passed to the metric have the same kind of values.
    targets = np.asarray(labels).astype(int)
    f1 = f1_score(targets, predictions, average="micro")
    return {"f1": f1}

# ==================================================
# 6. Load and Configure the Model for Fine-Tuning
# ==================================================
# One output unit per vocabulary topic. problem_type selects the multi-label
# setup, in which each topic is predicted independently.
# id2label / label2id store the topic name for each output index in the model
# config, so a saved checkpoint can be decoded back to topic names without
# rebuilding the vocabulary from the CSV.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(global_vocab),
    problem_type="multi_label_classification",
    id2label=dict(enumerate(global_vocab)),
    label2id={topic: index for index, topic in enumerate(global_vocab)},
)

# ==================================================
# 7. Set Up the Trainer and Training Arguments
# ==================================================
# Evaluate and checkpoint once per epoch; at the end of training the checkpoint
# with the best "f1" (as reported by compute_metrics) is reloaded.
# Hub-style model names such as "microsoft/deberta-base" contain "/", which
# would otherwise create an unintended nested directory, so it is replaced.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir=f"./fine_tuned_{model_name.replace('/', '_')}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Wire the model, the training configuration, both dataset splits and the
# metric function into a single Trainer.
# NOTE(review): the captured notebook output shows a warning pointing at this
# call; newer transformers releases presumably prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# ==================================================
# 8. Fine-Tune the Model and Evaluate
# ==================================================
# Run fine-tuning with the configuration above.
trainer.train()
# Evaluate on the held-out split passed to the Trainer as eval_dataset.
# With load_best_model_at_end=True these numbers presumably describe the best
# checkpoint rather than the last one — TODO confirm.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


Global Vocabulary: ['Absorbency', 'Accessibility', 'Accessories', 'Ambience', 'App', 'Appearance', 'Arms', 'Assembly', 'Availability', 'Back', 'Battery', 'Beam', 'Bed', 'Bedside', 'Bits', 'Books', 'Brightness', 'Bulb', 'Bulbs', 'Burn time', 'Buttons', 'Cabinet', 'Car', 'Ceiling', 'Charging', 'Childproofing', 'Christmas', 'Clamps', 'Cleaning', 'Cleanliness', 'Closure', 'Cloth-like', 'Collection', 'Color', 'Colors', 'Comfort', 'Compatibility', 'Connection', 'Convenience', 'Cooking', 'Cookware', 'Cooling', 'Cord', 'Counter', 'Counter space', 'Cover', 'Covers', 'Cracks', 'Crunchiness', 'Cushion', 'Customer service', 'Customization', 'Cute', 'Decor', 'Decoration', 'Delivery', 'Design', 'Desk', 'Display', 'Door', 'Doors', 'Drainage', 'Drawer', 'Drawers', 'Durability', 'Ease of use', 'Easy to use', 'Edges', 'Effectiveness', 'Extension', 'Fabric', 'Falling', 'Features', 'Filling', 'Finish', 'Firmness', 'Fit', 'Flatness', 'Footboard', 'Frame', 'Functionality', 'Gift', 'Glass', 'Gnome', 'Greener

Map: 100%|██████████| 10081/10081 [00:00<00:00, 14769.16 examples/s]
Map: 100%|██████████| 1121/1121 [00:00<00:00, 15796.08 examples/s]
Map: 100%|██████████| 10081/10081 [00:00<00:00, 15744.90 examples/s]
Map: 100%|██████████| 1121/1121 [00:00<00:00, 14891.88 examples/s]
Casting the dataset: 100%|██████████| 10081/10081 [00:00<00:00, 935935.95 examples/s]
Casting the dataset: 100%|██████████| 1121/1121 [00:00<00:00, 481151.74 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.0451,0.041415,0.323024
2,0.0387,0.039163,0.31422
3,0.0363,0.037591,0.327408


Evaluation results: {'eval_loss': 0.03759075701236725, 'eval_f1': 0.32740825688073394, 'eval_runtime': 11.4674, 'eval_samples_per_second': 97.755, 'eval_steps_per_second': 12.296, 'epoch': 3.0}
