In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
train_df = pd.read_csv('../data/processed/clean_train.csv')
valid_df = pd.read_csv('../data/processed/clean_valid.csv')

train_usmpl_df = pd.read_csv('../data/processed/clean_train_upsampled.csv')


# Initialize Sentence Transformer Model
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

# Transform 'clean_content' using Sentence Transformer
X_train = model.encode(train_df['clean_content'].to_list(), show_progress_bar=True)
X_valid = model.encode(valid_df['clean_content'].to_list(), show_progress_bar=True)
X_train_usmpl = model.encode(train_usmpl_df['clean_content'].to_list(), show_progress_bar=True)


# Prepare labels for multilabel classification
y_train = train_df[['cyber_label', 'environmental_issue']]
y_valid = valid_df[['cyber_label', 'environmental_issue']]
y_train_usmpl = train_usmpl_df[['cyber_label', 'environmental_issue']]


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:07<00:00,  4.22it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.43it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 91/91 [00:20<00:00,  4.52it/s]


In [3]:

# MultiOutput Classifier (Upsampled cleaned dataset)
multioutput_classifier = MultiOutputClassifier(SVC(probability=True, random_state=42), n_jobs=-1)
multioutput_classifier.fit(X_train_usmpl, y_train_usmpl)


# Prediction and evaluation
y_pred = multioutput_classifier.predict(X_valid)
for i, label in enumerate(y_train_usmpl.columns):
    print(f"Accuracy for {label}: {accuracy_score(y_valid.iloc[:, i], y_pred[:, i])}")
    print(f"Classification Report for {label}:\n", classification_report(y_valid.iloc[:, i], y_pred[:, i]))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Accuracy for cyber_label: 0.9444444444444444
Classification Report for cyber_label:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       235
           1       0.57      0.71      0.63        17

    accuracy                           0.94       252
   macro avg       0.77      0.83      0.80       252
weighted avg       0.95      0.94      0.95       252

Accuracy for environmental_issue: 0.8690476190476191
Classification Report for environmental_issue:
               precision    recall  f1-score   support

           0       0.94      0.89      0.92       200
           1       0.65      0.79      0.71        52

    accuracy                           0.87       252
   macro avg       0.80      0.84      0.81       252
weighted avg       0.88      0.87      0.87       252



## Predict test set labels

In [7]:
test_df = pd.read_csv('../data/processed/clean_test.csv')
X_test = model.encode(test_df['clean_content'].to_list(), show_progress_bar=True)
y_test_pred = multioutput_classifier.predict(X_test)
y_test_pred.shape

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.88it/s]


(142, 2)

In [8]:
test_df["cyber_label"] = y_test_pred[:, 0]
test_df["environmental_issue"] = y_test_pred[:, 1]
test_df

Unnamed: 0,clean_content,cyber_label,environmental_issue
0,More recently there has been a focus on risks ...,0,1
1,Energy crisis\n3. Business interruption\nThe i...,1,1
2,The survey results indicate that a lack of ski...,0,0
3,Respondents could\nselect more than one risk.0...,1,0
4,The interest payable on\nthe country's public ...,0,0
...,...,...,...
137,Negative No impact Positive Unsure\nNo changes...,0,0
138,"In Ethiopia, Kenya, and Somalia, for example, ...",0,1
139,"artificial intelligence, automation in all of ...",0,0
140,Personal Data Regulatory\nFragmentation\nThe r...,0,1


In [9]:
test_df.to_csv("../reports/best_model_test_preds.csv", index=False)