# In [None]:
import inspect

import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

# Checkpoint shared by the tokenizer and the classifier so that their
# vocabularies match.
model_name = "allenai/scibert_scivocab_uncased"

# Tokenizer that turns raw text into model inputs.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model loaded from the checkpoint and configured
# with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

def preprocess_data(examples):
    """Tokenize a batch of records for the classifier.

    The title, keywords and author of each record are joined with single
    spaces into one string, which is then tokenized.

    Args:
        examples: Mapping with "title", "keywords" and "author" entries, each
            a sequence of equal length (one element per record), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, padded and truncated to
        ``max_length`` tokens.
    """

    def _as_text(value):
        # Missing cells arrive as None or NaN; string concatenation would
        # raise TypeError on them, so map them to an empty string. Any other
        # non-string value (e.g. a numeric cell) is rendered with str().
        if pd.isna(value):
            return ""
        return str(value)

    text = [
        " ".join((_as_text(t), _as_text(k), _as_text(a)))
        for t, k, a in zip(examples["title"], examples["keywords"], examples["author"])
    ]
    # NOTE(review): max_length=10 keeps only the first 10 tokens of the
    # combined text, so keywords/author are likely truncated away for most
    # records. Confirm this is intended. The same limit is used at inference.
    return tokenizer(text, padding="max_length", truncation=True, max_length=10)

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy`, and the old spelling is rejected by recent versions. Pick
# whichever keyword the installed TrainingArguments accepts so the script
# runs on both old and new releases.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Fine-tuning hyperparameters: evaluate once per epoch, 5 epochs, batches of
# 8 per device, checkpoints and logs written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},
)

# Load the labelled records from the spreadsheet.
df = pd.read_excel("data_file.xlsx")

# Validate the schema up front so a malformed file fails before any
# splitting or tokenization work is done.
required_columns = {"title", "keywords", "author", "label"}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Excel file must contain {required_columns} columns.")

# Fill missing keywords by plain assignment on the full frame. Calling
# `fillna(inplace=True)` on a column of a split-off slice is a chained
# in-place operation that pandas warns about and that does nothing under
# Copy-on-Write. Doing it before the split also covers both partitions.
df["keywords"] = df["keywords"].fillna("No Keywords")

# 75/25 split with a fixed seed for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=21)

# Tokenize both partitions with the shared preprocessing function.
dataset = Dataset.from_pandas(train_df).map(preprocess_data, batched=True)
eval_dataset = Dataset.from_pandas(test_df).map(preprocess_data, batched=True)

# Evaluate on the held-out partition: scoring the model on the rows it was
# trained on would only measure memorization. No checkpoint selection is
# driven by these metrics, so they are purely informational.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

if not {"title", "keywords", "author"}.issubset(test_df.columns):
    raise ValueError("Excel file must contain 'title', 'keywords', and 'author' columns.")

# Work on an explicit copy: test_df is a slice produced by train_test_split,
# and writing to a slice (in-place fillna, new column) triggers pandas
# chained-assignment warnings and is unreliable under Copy-on-Write.
test_df = test_df.copy()
test_df["keywords"] = test_df["keywords"].fillna("No Keywords")

# Coerce every field to text so a missing or numeric cell cannot break the
# string join below; missing titles/authors become empty strings.
test_titles = test_df["title"].fillna("").astype(str).tolist()
test_keywords = test_df["keywords"].astype(str).tolist()
test_authors = test_df["author"].fillna("").astype(str).tolist()

# Same "title keywords author" layout that is used for training.
test_texts = [
    t + " " + k + " " + a
    for t, k, a in zip(test_titles, test_keywords, test_authors)
]

# Prefer an accelerator when one is present; fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)

# Inference mode: disable dropout so predictions are deterministic.
model.eval()

# Run the test set through the model in fixed-size batches rather than one
# giant forward pass, so memory use does not grow with the test set size.
inference_batch_size = 32
batch_predictions = []
with torch.no_grad():  # no autograd graph is needed for prediction
    for start in range(0, len(test_texts), inference_batch_size):
        inputs = tokenizer(
            test_texts[start:start + inference_batch_size],
            padding=True, truncation=True, return_tensors="pt", max_length=10
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)
        # The predicted class is the index of the largest logit.
        batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())

predictions = torch.cat(batch_predictions)

test_df["Prediction"] = predictions.numpy()

# Write the test rows plus their predicted class; the row index is omitted.
test_df.to_excel("output_file.xlsx", index=False)