In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset


In [5]:
sector_list = [
    "buildings",
    "digitalisation",
    "freight",
    "mobility",
    "nutrition",
    "urban_ecology",
    "urban_governance",
    "urban_infra",
]
df_list = []
for sector in sector_list:
    print(f"Processing {sector} dataset")
    # Read the CSV file
    df = pd.read_csv(
        f"../data/{sector}_dataset.csv", usecols=["title", "abstract", "true_label"]
    )
    df = df[df["true_label"] == "About Sufficiency"]
    df["origin"] = sector
    df_list.append(df)
df = pd.concat(df_list, ignore_index=True).dropna()
df["true_label"] = df["origin"].astype("category").cat.codes
df.to_csv("../data/sector_positive.csv", index=False)
df

Processing buildings dataset
Processing digitalisation dataset
Processing freight dataset
Processing mobility dataset
Processing nutrition dataset
Processing urban_ecology dataset
Processing urban_governance dataset
Processing urban_infra dataset


Unnamed: 0,title,abstract,true_label,origin
0,Young Households' Diminishing Access to Homeow...,This multi-country article focuses particularl...,0,buildings
1,Wood buildings as a climate solution,We conducted a systematic literature search an...,0,buildings
2,Winners and Losers in Housing Markets,This paper is a quantitatively oriented theore...,0,buildings
3,Window opening behavior of occupants in reside...,Window opening behavior has a vital influence ...,0,buildings
4,Wind driven natural ventilation in the idealiz...,Improved ventilation in an urban residential n...,0,buildings
...,...,...,...,...
6733,Legal Guarantee of Smart City Pilot and Green ...,Green and smart cities are based on clean ener...,7,urban_infra
6734,Legal Guarantee of Smart City Pilot and Green ...,Green and smart cities are based on clean ener...,7,urban_infra
6735,Sustainable Urban Resource Management an Analy...,Conference Title: 2024 6th International Confe...,7,urban_infra
6736,Efficiency of green and low-carbon coordinated...,As a critical engine for national economic gro...,7,urban_infra


In [6]:
# Load the dataset
df = pd.read_csv("../data/sector_positive.csv")

# Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["abstract"], df["true_label"], test_size=0.2, random_state=42, stratify=df["true_label"]
)

# Traditional Classifier: Logistic Regression
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Train a Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, train_labels)

# Evaluate the Logistic Regression model
lr_predictions = lr_model.predict(X_test)
print("Logistic Regression Performance:")
print(classification_report(test_labels, lr_predictions, target_names=df["origin"].unique()))


Logistic Regression Performance:
                  precision    recall  f1-score   support

       buildings       0.84      0.70      0.76       116
  digitalisation       0.88      0.89      0.89       126
         freight       1.00      0.19      0.31        27
        mobility       0.84      0.99      0.91       753
       nutrition       0.89      0.92      0.90       148
   urban_ecology       0.68      0.27      0.39        48
urban_governance       0.50      0.08      0.13        39
     urban_infra       0.59      0.32      0.41        75

        accuracy                           0.84      1332
       macro avg       0.78      0.54      0.59      1332
    weighted avg       0.82      0.84      0.81      1332



In [7]:
# BERT Classifier
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", cache_dir="../.cache"
)

# Tokenize the dataset
train_encodings = tokenizer(
    list(train_texts), truncation=True, padding=True, max_length=512
)
test_encodings = tokenizer(
    list(test_texts), truncation=True, padding=True, max_length=512
)

# Convert the data into Hugging Face Dataset format
train_dataset = Dataset.from_dict(
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
        "labels": train_labels,
    }
)
test_dataset = Dataset.from_dict(
    {
        "input_ids": test_encodings["input_ids"],
        "attention_mask": test_encodings["attention_mask"],
        "labels": test_labels,
    }
)

# Load the BERT model
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(df["true_label"].unique()),
    cache_dir="../.cache",
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=200,
)

# Define the Trainer
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the BERT model
trainer.train()

# Evaluate the BERT model
bert_predictions = trainer.predict(test_dataset)
bert_predicted_labels = bert_predictions.predictions.argmax(axis=1)

print("BERT Classifier Performance:")
print(classification_report(test_labels, bert_predicted_labels, target_names=df["origin"].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,0.9156
400,0.4852
600,0.4234


BERT Classifier Performance:
                  precision    recall  f1-score   support

       buildings       0.86      0.88      0.87       116
  digitalisation       0.81      0.96      0.88       126
         freight       1.00      0.07      0.14        27
        mobility       0.92      0.98      0.95       753
       nutrition       0.93      0.93      0.93       148
   urban_ecology       0.68      0.35      0.47        48
urban_governance       0.92      0.28      0.43        39
     urban_infra       0.55      0.52      0.53        75

        accuracy                           0.88      1332
       macro avg       0.83      0.62      0.65      1332
    weighted avg       0.88      0.88      0.86      1332



# SciBERT

In [8]:
# Load the BERT model
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir="../.cache")

# Tokenize the dataset
train_encodings = tokenizer(
    list(train_texts), truncation=True, padding=True, max_length=512
)
test_encodings = tokenizer(
    list(test_texts), truncation=True, padding=True, max_length=512
)

# Convert the data into Hugging Face Dataset format
train_dataset = Dataset.from_dict(
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
        "labels": train_labels,
    }
)
test_dataset = Dataset.from_dict(
    {
        "input_ids": test_encodings["input_ids"],
        "attention_mask": test_encodings["attention_mask"],
        "labels": test_labels,
    }
)
bert_model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=len(df["true_label"].unique()), cache_dir="../.cache"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)

# Define the Trainer
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the BERT model
trainer.train()

# Evaluate the BERT model
bert_predictions = trainer.predict(test_dataset)
bert_predicted_labels = bert_predictions.predictions.argmax(axis=1)

print("BERT Classifier Performance:")
print(classification_report(test_labels, bert_predicted_labels, target_names=df["origin"].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.9929
100,0.4745
150,0.4002


BERT Classifier Performance:
                  precision    recall  f1-score   support

       buildings       0.88      0.84      0.86       116
  digitalisation       0.83      0.95      0.89       126
         freight       0.91      0.37      0.53        27
        mobility       0.93      0.98      0.95       753
       nutrition       0.92      0.95      0.93       148
   urban_ecology       0.71      0.42      0.53        48
urban_governance       0.86      0.46      0.60        39
     urban_infra       0.59      0.55      0.57        75

        accuracy                           0.89      1332
       macro avg       0.83      0.69      0.73      1332
    weighted avg       0.88      0.89      0.88      1332

