In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import lightgbm as lgb


In [2]:


# Load data
train_df = pd.read_csv('../data/processed/clean_train.csv')
valid_df = pd.read_csv('../data/processed/clean_valid.csv')

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Transform 'clean_content'
X_train = vectorizer.fit_transform(train_df['clean_content'])
X_valid = vectorizer.transform(valid_df['clean_content'])

# Prepare labels for multilabel classification
y_train = train_df[['cyber_label', 'environmental_issue']]
y_valid = valid_df[['cyber_label', 'environmental_issue']]

# MultiOutput Classifier
multioutput_classifier = MultiOutputClassifier(lgb.LGBMClassifier(verbosity=2), n_jobs=-1)
multioutput_classifier.fit(X_train, y_train)


In [3]:

# Prediction and evaluation
y_pred = multioutput_classifier.predict(X_valid)

# Evaluating the model using F1 score and classification report
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred, average='weighted'))
print("Classification Report (multilabel eval):\n", classification_report(y_valid, y_pred))

for i, label in enumerate(y_train.columns):
    print(f"Accuracy for {label}: {accuracy_score(y_valid.iloc[:, i], y_pred[:, i])}")
    print(f"Classification Report for {label}:\n", classification_report(y_valid.iloc[:, i], y_pred[:, i]))


Accuracy: 0.7857142857142857
F1 Score: 0.500292022663367
Classification Report (multilabel eval):
               precision    recall  f1-score   support

           0       0.58      0.41      0.48        17
           1       0.68      0.40      0.51        52

   micro avg       0.65      0.41      0.50        69
   macro avg       0.63      0.41      0.49        69
weighted avg       0.65      0.41      0.50        69
 samples avg       0.11      0.11      0.11        69

Accuracy for cyber_label: 0.9404761904761905
Classification Report for cyber_label:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       235
           1       0.58      0.41      0.48        17

    accuracy                           0.94       252
   macro avg       0.77      0.70      0.73       252
weighted avg       0.93      0.94      0.94       252

Accuracy for environmental_issue: 0.8373015873015873
Classification Report for environmental_issue:
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:

# Load data
train_df = pd.read_csv('../data/processed/clean_train.csv')
valid_df = pd.read_csv('../data/processed/clean_valid.csv')

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Transform 'clean_content'
X_train = vectorizer.fit_transform(train_df['clean_content'])
X_valid = vectorizer.transform(valid_df['clean_content'])

# Prepare labels for multilabel classification
y_train = train_df[['cyber_label', 'environmental_issue']]
y_valid = valid_df[['cyber_label', 'environmental_issue']]

# MultiOutput Classifier
multioutput_classifier = MultiOutputClassifier(
    lgb.LGBMClassifier(
        verbosity=2,
        min_data_in_leaf=20, 
        class_weight='balanced',
    ),
    n_jobs=-1,
)
multioutput_classifier.fit(X_train, y_train)


# Prediction and evaluation
y_pred = multioutput_classifier.predict(X_valid)

# Evaluating the model using F1 score and classification report
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred, average='weighted'))
print("Classification Report (multilabel eval):\n", classification_report(y_valid, y_pred))

for i, label in enumerate(y_train.columns):
    print(f"Accuracy for {label}: {accuracy_score(y_valid.iloc[:, i], y_pred[:, i])}")
    print(f"Classification Report for {label}:\n", classification_report(y_valid.iloc[:, i], y_pred[:, i]))


Accuracy: 0.8055555555555556
F1 Score: 0.6050343249427917
Classification Report (multilabel eval):
               precision    recall  f1-score   support

           0       0.59      0.59      0.59        17
           1       0.67      0.56      0.61        52

   micro avg       0.65      0.57      0.60        69
   macro avg       0.63      0.57      0.60        69
weighted avg       0.65      0.57      0.61        69
 samples avg       0.15      0.15      0.15        69

Accuracy for cyber_label: 0.9444444444444444
Classification Report for cyber_label:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       235
           1       0.59      0.59      0.59        17

    accuracy                           0.94       252
   macro avg       0.78      0.78      0.78       252
weighted avg       0.94      0.94      0.94       252

Accuracy for environmental_issue: 0.8531746031746031
Classification Report for environmental_issue:
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:

# Load data
train_df = pd.read_csv('../data/processed/clean_train.csv')
valid_df = pd.read_csv('../data/processed/clean_valid.csv')

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Transform 'clean_content'
X_train = vectorizer.fit_transform(train_df['clean_content'])
X_valid = vectorizer.transform(valid_df['clean_content'])

# Prepare labels for multilabel classification
y_train = train_df[['cyber_label', 'environmental_issue']]
y_valid = valid_df[['cyber_label', 'environmental_issue']]

# MultiOutput Classifier
multioutput_classifier = MultiOutputClassifier(
    lgb.LGBMClassifier(
        verbosity=0,
        min_data_in_leaf=30, 
        class_weight='balanced',
        learning_rate=0.15,
    ),
    n_jobs=-1,
)
multioutput_classifier.fit(X_train, y_train)


# Prediction and evaluation
y_pred = multioutput_classifier.predict(X_valid)

# Evaluating the model using F1 score and classification report
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred, average='weighted'))
print("Classification Report (multilabel eval):\n", classification_report(y_valid, y_pred))

for i, label in enumerate(y_train.columns):
    print(f"Accuracy for {label}: {accuracy_score(y_valid.iloc[:, i], y_pred[:, i])}")
    print(f"Classification Report for {label}:\n", classification_report(y_valid.iloc[:, i], y_pred[:, i]))


Accuracy: 0.8055555555555556
F1 Score: 0.6092161767470107
Classification Report (multilabel eval):
               precision    recall  f1-score   support

           0       0.64      0.53      0.58        17
           1       0.67      0.58      0.62        52

   micro avg       0.66      0.57      0.61        69
   macro avg       0.65      0.55      0.60        69
weighted avg       0.66      0.57      0.61        69
 samples avg       0.15      0.15      0.15        69

Accuracy for cyber_label: 0.9484126984126984
Classification Report for cyber_label:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97       235
           1       0.64      0.53      0.58        17

    accuracy                           0.95       252
   macro avg       0.80      0.75      0.78       252
weighted avg       0.94      0.95      0.95       252

Accuracy for environmental_issue: 0.8531746031746031
Classification Report for environmental_issue:
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
valid_df.sample(5)

Unnamed: 0,clean_content,cyber_label,environmental_issue
217,These tools will also be a gift to autocrats b...,0,0
198,"""Increasingly we see that business leaders see...",0,0
87,"post-Brexit implementations, escalating tariff...",0,1
182,"If the risk rankings are changing rapidly,\nth...",0,0
232,Educate the Audit Committee\nEducate the audit...,0,0


In [7]:
test_v = vectorizer.transform([valid_df.iloc[67]['clean_content']])
print(multioutput_classifier.predict(test_v))

print("\n\n---TEXT HERE:---\n" + valid_df.iloc[67]['clean_content'])

[[1 0]]


---TEXT HERE:---
Source: Global Technology Industry Risk Study 202369%
46%
17%
51%
30%
Direct cost reduction initiatives
(administrative, operational)
Reduction in workforce
or slowdown in hiring
None of these
Preservation of capital
and rethinking our
ability to retain risk
Reevaluation of sources of
risk capital to fund losses
URL
priorities6
Reliability and security remain
top worries, but reputational risk
emerging as a major concern
Maintaining reliable and secure networks remains the biggest challenge for technology companies (see Figure 2). The
top three risks in this year's report data security and privacy, digital business interruption, and technology errors and
omissions have consistently ranked as top risks for all nine years of this report. IT resilience, which is closely linked to the
top three risks, remained in the top 5, the same as last year.
But survey respondents have expressed a deepening worry about reputational risk. Ranked tenth in 2022, nearly half of


In [8]:
test_v = vectorizer.transform([valid_df.iloc[3]['clean_content']])
print(multioutput_classifier.predict(test_v))

print("\n\n---TEXT HERE:---\n" + valid_df.iloc[3]['clean_content'])

[[0 1]]


---TEXT HERE:---
87
URL
FIGURE 10A
Board Members 2023
Changes in the overall work environment including shifts to hybrid work environments,
expansion of digital labor, changes in the nature of work and who does that work, and M&A
activities may lead to challenges to sustaining our organization's culture and business model
O 6.115.08
NA
Anticipated increases in labor costs may affect our opportunity to meet profitability targets
M 6.035.135.66
Our organization's succession challenges and ability to attract and retain top talent and
labor amid the constraints of a tightening talent/labor market may limit our ability to achieve
operational targets
O 6.015.325.80
Resistance to change in our culture may restrict our organization from making necessary
adjustments to the business model and core operations on a timely basis
O 6.014.835.63
Uncertainty surrounding our organization's core supply chain including the viability of key
suppliers, scarcity of supplies, energy sources, unpred

In [9]:
test_text = """
Reliability and security remain
top worries, but reputational risk
emerging as a major concern
Maintaining reliable and secure networks remains the biggest challenge for technology companies (see Figure 2). The
top three risks in this year's report data security and privacy, digital business interruption, and technology errors and
omissions have consistently ranked as top risks for all nine years of this report. IT resilience, which is closely linked to the
top three risks, remained in the top 5, the same as last year.

 """

test_v = vectorizer.transform([test_text])
print(multioutput_classifier.predict(test_v))


[[1 0]]


## Using upsampled training set
This time, let's test the upsampled dataset with different hyperparams

In [10]:

# Load data
train_df = pd.read_csv('../data/processed/clean_train_upsampled.csv')
valid_df = pd.read_csv('../data/processed/clean_valid.csv')

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Transform 'clean_content'
X_train = vectorizer.fit_transform(train_df['clean_content'])
X_valid = vectorizer.transform(valid_df['clean_content'])

# Prepare labels for multilabel classification
y_train = train_df[['cyber_label', 'environmental_issue']]
y_valid = valid_df[['cyber_label', 'environmental_issue']]

# MultiOutput Classifier
multioutput_classifier = MultiOutputClassifier(
    lgb.LGBMClassifier(
        verbosity=0,
        min_data_in_leaf=20,
        class_weight='balanced',
        boosting_type='dart',
        n_estimators=250,
    ),
    n_jobs=-1,
)
multioutput_classifier.fit(X_train, y_train)


# Prediction and evaluation
y_pred = multioutput_classifier.predict(X_valid)

# Evaluating the model using F1 score and classification report
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred, average='weighted'))
print("Classification Report (multilabel eval):\n", classification_report(y_valid, y_pred))

for i, label in enumerate(y_train.columns):
    print(f"Accuracy for {label}: {accuracy_score(y_valid.iloc[:, i], y_pred[:, i])}")
    print(f"Classification Report for {label}:\n", classification_report(y_valid.iloc[:, i], y_pred[:, i]))


Accuracy: 0.7857142857142857
F1 Score: 0.5272985610581519
Classification Report (multilabel eval):
               precision    recall  f1-score   support

           0       0.50      0.47      0.48        17
           1       0.70      0.44      0.54        52

   micro avg       0.63      0.45      0.53        69
   macro avg       0.60      0.46      0.51        69
weighted avg       0.65      0.45      0.53        69
 samples avg       0.12      0.12      0.12        69

Accuracy for cyber_label: 0.9325396825396826
Classification Report for cyber_label:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96       235
           1       0.50      0.47      0.48        17

    accuracy                           0.93       252
   macro avg       0.73      0.72      0.72       252
weighted avg       0.93      0.93      0.93       252

Accuracy for environmental_issue: 0.8452380952380952
Classification Report for environmental_issue:
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


We can see that this model is working well on the validation dataset, however, we doubt the contexual abilities in case of any incoming text for prediction, it will rely on the frequency of words, let's try something else.