In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset


In [None]:
# Sectors whose per-sector CSVs are merged into one multi-class dataset.
sector_list = [
    "buildings",
    "digitalisation",
    "freight",
    "mobility",
    "nutrition",
    "urban_ecology",
    "urban_governance",
    "urban_infra",
    "trade",
]
df_list = []
for sector in sector_list:
    print(f"Processing {sector} dataset")
    # Read only the columns that are used downstream
    sector_df = pd.read_csv(
        f"../data/{sector}_dataset.csv", usecols=["title", "abstract", "true_label"]
    )
    # Keep the positive samples only. Take an explicit copy so that adding the
    # "origin" column below writes to an independent frame instead of a slice
    # of `sector_df` (which would raise pandas' SettingWithCopyWarning).
    positives = sector_df[sector_df["true_label"] == "About Sufficiency"].copy()
    positives["origin"] = sector
    df_list.append(positives)

# Merge all sectors and drop rows with any missing value
df = pd.concat(df_list, ignore_index=True).dropna()

# Replace the (now constant) text label by an integer code of the sector.
# NOTE: category codes follow the sorted (alphabetical) order of the sector
# names, NOT the order of `sector_list` -- e.g. "trade" sorts before
# "urban_ecology". Anything that maps codes back to names must sort by code.
df["true_label"] = df["origin"].astype("category").cat.codes
df.to_csv("../data/sector_positive.csv", index=False)
df

In [None]:
# Load the dataset
df = pd.read_csv("../data/sector_positive.csv")

# Build the code -> sector-name lookup, ordered by the integer code.
# `df["origin"].unique()` returns names in order of appearance in the file,
# which does not match the order of the codes stored in "true_label", so using
# it as `target_names` would attach the wrong sector name to some report rows.
label_table = (
    df[["true_label", "origin"]].drop_duplicates().sort_values("true_label")
)
label_ids = label_table["true_label"].tolist()
label_names = label_table["origin"].tolist()

# Split the dataset into train and test sets (stratified on the sector label)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["abstract"], df["true_label"], test_size=0.2, random_state=42, stratify=df["true_label"]
)

# Traditional Classifier: Logistic Regression
# Vectorize the text data using TF-IDF (fit on the training split only)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Train a Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, train_labels)

# Evaluate the Logistic Regression model
lr_predictions = lr_model.predict(X_test)
print("Logistic Regression Performance:")
# Pass `labels` explicitly so every row of the report is tied to its code and
# `target_names` is guaranteed to line up with it.
print(
    classification_report(
        test_labels, lr_predictions, labels=label_ids, target_names=label_names
    )
)


In [None]:
# BERT Classifier
# Fine-tunes bert-base-uncased on the train split created earlier
# (`train_texts` / `train_labels`) and reports per-sector metrics on the test split.

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", cache_dir="../.cache"
)

# Tokenize the dataset (truncate to the 512-token limit used below, pad per split)
train_encodings = tokenizer(
    list(train_texts), truncation=True, padding=True, max_length=512
)
test_encodings = tokenizer(
    list(test_texts), truncation=True, padding=True, max_length=512
)

# Convert the data into Hugging Face Dataset format.
# Labels are handed over as plain Python lists so the dataset content does not
# depend on how a pandas Series (and its shuffled index) is converted.
train_dataset = Dataset.from_dict(
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
        "labels": train_labels.tolist(),
    }
)
test_dataset = Dataset.from_dict(
    {
        "input_ids": test_encodings["input_ids"],
        "attention_mask": test_encodings["attention_mask"],
        "labels": test_labels.tolist(),
    }
)

# Load the BERT model with one output per sector label
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=df["true_label"].nunique(),
    cache_dir="../.cache",
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=200,
)

# Define the Trainer
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the BERT model
trainer.train()

# Evaluate the BERT model: the predicted class is the arg-max over the logits
bert_predictions = trainer.predict(test_dataset)
bert_predicted_labels = bert_predictions.predictions.argmax(axis=1)

# Build the code -> sector-name lookup ordered by the integer code.
# `df["origin"].unique()` is in order of appearance, which does not match the
# order of the codes in "true_label" and would mislabel rows of the report.
label_table = (
    df[["true_label", "origin"]].drop_duplicates().sort_values("true_label")
)
label_ids = label_table["true_label"].tolist()
label_names = label_table["origin"].tolist()

print("BERT Classifier Performance:")
print(
    classification_report(
        test_labels, bert_predicted_labels, labels=label_ids, target_names=label_names
    )
)

# SciBERT

In [None]:
# SciBERT Classifier
# Same pipeline as the BERT cell, but starting from the SciBERT checkpoint.
# Reuses `train_texts` / `test_texts` / `train_labels` / `test_labels` and `df`
# from the earlier cells.

# Load the SciBERT tokenizer
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir="../.cache")

# Tokenize the dataset (truncate to 512 tokens, pad per split)
train_encodings = tokenizer(
    list(train_texts), truncation=True, padding=True, max_length=512
)
test_encodings = tokenizer(
    list(test_texts), truncation=True, padding=True, max_length=512
)

# Convert the data into Hugging Face Dataset format.
# Labels are handed over as plain Python lists so the dataset content does not
# depend on how a pandas Series (and its shuffled index) is converted.
train_dataset = Dataset.from_dict(
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
        "labels": train_labels.tolist(),
    }
)
test_dataset = Dataset.from_dict(
    {
        "input_ids": test_encodings["input_ids"],
        "attention_mask": test_encodings["attention_mask"],
        "labels": test_labels.tolist(),
    }
)

# Load the SciBERT model with one output per sector label
bert_model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=df["true_label"].nunique(), cache_dir="../.cache"
)

# Define training arguments
# NOTE(review): output_dir / logging_dir are the same as in the BERT cell, so
# checkpoints and logs of both runs land in the same folders -- confirm that
# this is intended before relying on saved checkpoints.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)

# Define the Trainer
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the SciBERT model
trainer.train()

# Evaluate the SciBERT model: the predicted class is the arg-max over the logits
bert_predictions = trainer.predict(test_dataset)
bert_predicted_labels = bert_predictions.predictions.argmax(axis=1)

# Build the code -> sector-name lookup ordered by the integer code.
# `df["origin"].unique()` is in order of appearance, which does not match the
# order of the codes in "true_label" and would mislabel rows of the report.
label_table = (
    df[["true_label", "origin"]].drop_duplicates().sort_values("true_label")
)
label_ids = label_table["true_label"].tolist()
label_names = label_table["origin"].tolist()

# The header names the model that was actually evaluated in this cell
print("SciBERT Classifier Performance:")
print(
    classification_report(
        test_labels, bert_predicted_labels, labels=label_ids, target_names=label_names
    )
)