<a href="https://colab.research.google.com/github/dafrie/fin-disclosures-nlp/blob/master/notebooks/CR_Identification_and_Classification_Step_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CRO Classification with Transformers

  1. Load best binary/multi-label classifier
  2. Load best multi-label sub type classifier
  3. Load test data
  4. Predict with binary classifier
  5. Predict with multi-label classifier
  6. Set labels to 0 for all those paragraphs not classified as positive in step 4.
  7. Calculate evaluation metrics

# Setup

In [None]:
# Detect whether the notebook is executed on Google Colab: the `google.colab`
# package can only be imported there.
try:
  import google.colab  # imported solely to probe for availability
  is_running_in_colab = True
except ImportError:
  # Only a missing package means "not on Colab"; any other failure should surface
  # instead of being silently swallowed by a bare `except`.
  is_running_in_colab = False

if is_running_in_colab:
  # Load Google drive where the data and models are stored
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


### Overall config

In [None]:
############################## CONFIG ##############################
import os

# Dataset config
SCENARIO = "2step"
TASK = "multi-label"
CATEGORY_LEVEL = "cro_sub_type_combined"
FILTER_OP = True #@param { type: "boolean"}

# Checkpoints (relative to the models directory) of the two steps of the pipeline
# TODO: Change to best identifier!
BEST_IDENTIFIER = "transformer/binary/realistic_distilbert-base-uncased_cro_sub_type_combined/checkpoint-6258"
BEST_CLASSIFIER = "transformer/multi-label/efficient-realistic_distilbert-base-uncased_cro_sub_type_combined"

# Evaluation metric config. See for context: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
AVERAGING_STRATEGY = 'macro' #@param ["micro",  "macro", "weighted"]

# File name for the results, derived from the category level and the scenario
RESULTS_FILE_NAME = f"{CATEGORY_LEVEL}_{SCENARIO}_results.csv"

# To make the notebook reproducible (not guaranteed for pytorch on different releases/platforms!)
SEED_VALUE = 42

###########################
# NOTE(review): of the settings below, only MAX_TOKEN_SIZE appears to be consumed outside
# the `parameters` dict in the visible part of this notebook -- confirm before relying on them.

# Set to true if fine-tuning should be enabled. Else it loads fine-tuned model
ENABLE_FINE_TUNING = True #@param {type:"boolean"}


# See list here: https://huggingface.co/models
MODEL_NAME = 'albert-large-v2' #@param ["bert-base-uncased", "bert-large-uncased", "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", "roberta-base", "roberta-large", "distilbert-base-uncased", "distilbert-base-cased", "xlnet-base-cased", "xlnet-large-cased"]

# Batch size for the DataLoader. The BERT authors recommend 16 or 32; larger values may
# fail with an out-of-memory error on the GPU.
BATCH_SIZE = 16 #@param ["8", "16", "32"] {type:"raw"}
# Sequences are truncated/padded to this many tokens when tokenizing
MAX_TOKEN_SIZE = 256 #@param [512,256,128] {type:"raw"}
EPOCHS = 3 # @param [1,2,3,4] {type:"raw"}
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0

# Set to True to turn comet-ml logging off (note the inverted sense of the flag)
DISABLE_COMET_ML = True #@param {type:"boolean"}
SHOULD_HYPERPARAMETER_SEARCH = False #@param {type:"boolean"}
####################################################################
# Root folder that contains the data, models and results when running outside of Colab.
# Set the FIN_DISCLOSURES_NLP_DIR environment variable to point at your own copy;
# the fallback is the original author's machine-specific path.
LOCAL_DIR = os.environ.get(
    "FIN_DISCLOSURES_NLP_DIR",
    "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Methodology/",
)
# Root folder on the mounted Google Drive when running on Colab
COLAB_DIR = "/content/drive/MyDrive/fin-disclosures-nlp"

# Snapshot of the run configuration
parameters = {
    "task": TASK,
    "category_level": CATEGORY_LEVEL,
    "scenario": SCENARIO,
    "enable_fine_tuning": ENABLE_FINE_TUNING,
    "model_type": "transformer",
    "model_name": MODEL_NAME,
    "batch_size": BATCH_SIZE,
    "max_token_size": MAX_TOKEN_SIZE,
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
    "seed_value": SEED_VALUE,
}

In [None]:
if is_running_in_colab:
  # Install transformers library + datasets helper
  !pip install transformers --quiet &> /dev/null
  !pip install datasets --quiet &> /dev/null
  !pip install optuna --quiet &> /dev/null

  # Latex for output
  ! apt install texlive-latex-recommended -qq &> /dev/null
  ! apt install texlive-latex-extra -qq &> /dev/null
  ! apt install dvipng -qq &> /dev/null
  ! apt install cm-super -qq &> /dev/null

  # Load repository

  !git clone https://github.com/dafrie/fin-disclosures-nlp.git    
  %cd /content/fin-disclosures-nlp
  !git pull

%load_ext autoreload
%autoreload 2

# Load repository utils
import sys
import os
sys.path.append('..')

from data import constants
from data import cro_dataset
from data import evaluation
from data import cro_transformer_models

DIR = COLAB_DIR if is_running_in_colab else LOCAL_DIR
DATA_DIR = os.path.join(DIR, "data", "labels")
MODELS_DIR = os.path.join(DIR, "models")
RESULTS_DIR = os.path.join(DIR, 'results')
RESULTS_FILE_PATH = os.path.join(RESULTS_DIR, RESULTS_FILE_NAME)

fatal: destination path 'fin-disclosures-nlp' already exists and is not an empty directory.
/content/fin-disclosures-nlp
Already up to date.


## Load best models

### Identification model

In [None]:
from transformers import Trainer, AutoModelForSequenceClassification, AutoTokenizer

# Location of the fine-tuned first-step (identifier) checkpoint below the models directory
identifier_checkpoint_path = os.path.join(MODELS_DIR, BEST_IDENTIFIER)
print(f"Loading identifier model (First Step) from {identifier_checkpoint_path}")


def init_identifier_model():
  """Build the two-label sequence classifier from the local identifier checkpoint."""
  return AutoModelForSequenceClassification.from_pretrained(
      identifier_checkpoint_path,
      num_labels=2,
      local_files_only=True,
  )


# The tokenizer is read from the same checkpoint directory as the model weights
identifier_tokenizer = AutoTokenizer.from_pretrained(identifier_checkpoint_path, local_files_only=True)
# Note: `identifier_model` is a Trainer wrapping the model; it is used for prediction only
identifier_model = Trainer(model_init=init_identifier_model)

Loading identifier model (First Step) from /content/drive/MyDrive/fin-disclosures-nlp/models/transformer/binary/realistic_distilbert-base-uncased_cro_sub_type_combined/checkpoint-6258


### Classification model

In [None]:
# Location of the fine-tuned second-step (classifier) checkpoint below the models directory
classifier_checkpoint_path = os.path.join(MODELS_DIR, BEST_CLASSIFIER)
print(f"Loading classifier model (Second Step) from {classifier_checkpoint_path}")


def init_classifier_model():
  """Build the five-label sequence classifier from the local classifier checkpoint."""
  return AutoModelForSequenceClassification.from_pretrained(
      classifier_checkpoint_path,
      num_labels=5,
      local_files_only=True,
  )


# The tokenizer is read from the same checkpoint directory as the model weights
classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_checkpoint_path, local_files_only=True)
# Note: `classifier_model` is a Trainer wrapping the model; it is used for prediction only
classifier_model = Trainer(model_init=init_classifier_model)

Loading classifier model (Second Step) from /content/drive/MyDrive/fin-disclosures-nlp/models/transformer/multi-label/efficient-realistic_distilbert-base-uncased_cro_sub_type_combined


## Load data

In [None]:
# Both steps are prepared with identical settings; only the task (label encoding) differs
shared_dataset_options = dict(
    data_dir=DATA_DIR,
    cro_category_level="cro_sub_type_combined",
    should_filter_op=FILTER_OP,
    validation_split=0.1,
    train_neg_sampling_strategy=None,
    test_neg_sampling_strategy="all",
    seed_value=SEED_VALUE,
    as_huggingface_ds=True,
)

# Binary labels for the first (identification) step
first_dataset = cro_dataset.prepare_datasets(task="binary", **shared_dataset_options)
# Multi-label labels for the second (classification) step
second_dataset = cro_dataset.prepare_datasets(task="multi-label", **shared_dataset_options)

# Only the test split is used from here on
first_dataset, second_dataset = first_dataset['test'], second_dataset['test']

# Need to tokenize with both model's tokenizer, as they could be different
tokenizer_options = dict(truncation=True, padding='max_length', max_length=MAX_TOKEN_SIZE)
first_step_dataset = first_dataset.map(
    lambda batch: identifier_tokenizer(batch["text"], **tokenizer_options),
    batched=True,
)
second_step_dataset = second_dataset.map(
    lambda batch: classifier_tokenizer(batch["text"], **tokenizer_options),
    batched=True,
)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 251
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 28209
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 28
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 251
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 28209
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 28
    })
})


HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [None]:
# Predict the complete tokenized test set with the identifier in a single call.
# NOTE(review): this looks like a leftover cell -- `preds` is not read anywhere in the visible
# notebook before the Inference section reassigns it, and the chunked `do_predict` helper
# below is run over the same dataset. Confirm whether this cell is still needed.
preds = identifier_model.predict(first_step_dataset)

## Predict (identification) the test set

In [None]:
import numpy as np
import math
from tqdm.notebook import trange, tqdm

def do_predict(model, dataset, batch_size=1000):
  """Run ``model.predict`` over ``dataset`` chunk by chunk and stack the predictions.

  The dataset is cut into *contiguous* slices of at most ``batch_size`` rows, so row ``k``
  of the returned array belongs to row ``k`` of ``dataset``. (The previous implementation
  used ``dataset.shard(n, i)``, which is strided by default in ``datasets`` releases
  before 3.0; concatenating such shards returns the predictions in a different order
  than the dataset, and hence than any labels taken from it.)

  Args:
    model: Object with a ``predict(dataset)`` method whose result exposes ``.predictions``.
    dataset: Object exposing ``num_rows`` and ``select(indices)``.
    batch_size: Maximum number of rows per ``predict`` call. Must be positive.

  Returns:
    The per-chunk predictions concatenated along axis 0, or ``None`` for an empty dataset.

  Raises:
    ValueError: If ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    raise ValueError(f"batch_size must be positive, got {batch_size}")

  total_rows = dataset.num_rows
  no_of_chunks = math.ceil(total_rows / batch_size)

  # Collect the chunk outputs and concatenate once at the end instead of re-copying
  # the growing array on every iteration
  chunk_predictions = []
  for i in trange(no_of_chunks):
    start = i * batch_size
    stop = min(start + batch_size, total_rows)
    chunk = dataset.select(range(start, stop))
    chunk_predictions.append(model.predict(chunk).predictions)

  if not chunk_predictions:
    # Nothing to predict; same result as before for an empty dataset
    return None
  return np.concatenate(chunk_predictions, axis=0)

In [None]:
# Keep the gold labels of the binary task for the evaluation further down
test_labels = first_step_dataset['labels']
# Predict on a label-free copy. `remove_columns` returns a new dataset, so
# `first_step_dataset` keeps its labels and this cell can be re-run; the in-place
# `remove_columns_` used previously is deprecated in favour of this method.
first_dataset_predictions = do_predict(identifier_model, first_step_dataset.remove_columns("labels"))

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [None]:
# Predict on a label-free copy. `remove_columns` returns a new dataset, so
# `second_step_dataset` keeps its labels and this cell can be re-run; the in-place
# `remove_columns_` used previously is deprecated in favour of this method.
second_dataset_predictions = do_predict(classifier_model, second_step_dataset.remove_columns("labels"))

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [None]:
from scipy.special import expit, softmax

# Convert the raw model outputs to probabilities.
# First step: two mutually exclusive classes -> softmax across the class axis.
first_dataset_probs = softmax(first_dataset_predictions, axis=1)
# Second step: multi-label task with one threshold per label (see below), so every label
# gets its own independent sigmoid. A softmax would force the five label scores of a row
# to sum to 1 and make the per-label thresholds meaningless.
# NOTE(review): assumes the classifier head was trained with a per-label sigmoid/BCE
# objective, as is standard for multi-label heads -- confirm against cro_transformer_models.
second_dataset_probs = expit(second_dataset_predictions)

# TODO: Load best thresholds of this model (copy paste for now)
# NOTE(review): neither threshold list is applied in the visible notebook. The identifier
# value 8 cannot act as a probability cut-off; the tuned value is kept in the trailing comment.
identifier_best_thresholds = [8]   # [0.45253575]
classifier_best_thresholds = [0.8539462, 0.52350485, 0.73815167, 0.6694577, 0.59423065]

# Only consider the positive column
first_dataset_probs = first_dataset_probs[:,1]
# first_dataset_probs = (first_dataset_probs > identifier_best_thresholds[0]) * 1

# Both steps must have scored exactly the same rows
assert len(first_dataset_predictions) == (len(second_dataset_probs))

# Weight every row of the second-step probabilities with the first-step probability of that
# row. While the thresholding above stays commented out this is a soft weighting; with the
# 0/1 indicator it would set all rows filtered out by the first step to 0.
first_dataset_probs = first_dataset_probs.reshape(-1, 1)
filtered_probs = first_dataset_probs * second_dataset_probs

# Sanity checks (only the last expression is rendered as the cell output)
filtered_probs.sum(axis=0)
first_dataset_probs.sum()

7250.3223

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, classification_report, multilabel_confusion_matrix

# Threshold-free evaluation of the first step: score the positive-class probabilities
# against the binary test labels with ROC AUC and average precision (PR AUC)
preds_probs = first_dataset_probs
roc_auc, pr_auc = (
    metric(test_labels, preds_probs, average="macro")
    for metric in (roc_auc_score, average_precision_score)
)
print(roc_auc, pr_auc)

0.4358175970323014 0.00163162649149815


In [None]:
# Inspect the identifier's positive-class probabilities (a column vector after the reshape above)
first_dataset_probs

array([[0.31515253],
       [0.05226306],
       [0.08898468],
       ...,
       [0.18585016],
       [0.03461825],
       [0.08023971]], dtype=float32)

In [None]:
# Sum of the positive-class probabilities over all scored rows
first_dataset_probs.sum()

7250.3223

# Compare results

In [None]:
# Inspect the raw second-step outputs as returned by do_predict (not yet converted to probabilities)
second_dataset_predictions

array([[-0.48558527, -0.6597072 , -0.06553887, -0.42671126, -0.535562  ],
       [ 0.19544666, -0.68530756,  0.51019925, -0.003137  , -0.38136312],
       [-0.8579987 , -0.84948504, -0.07705627, -0.622136  , -0.7075825 ],
       ...,
       [-0.6445179 , -0.77097297, -0.12218759, -0.52835494, -0.5942524 ],
       [-0.39127487, -0.9740663 ,  0.08745108, -0.52693313, -0.96705264],
       [-0.36492407, -0.86849505, -0.08511597, -0.37672988, -0.6840325 ]],
      dtype=float32)

## Load second dataset for training & validation

## Training of the sub type classifier

## Inference

In [None]:
from scipy.special import softmax 
import pandas as pd
from datasets import DatasetDict, Dataset, load_dataset, Sequence, ClassLabel, Features, Value, concatenate_datasets

# Hand-written example paragraphs used to spot-check the identifier (first step)
inference_df = pd.DataFrame([
                             """
                             The Company’s operations result in the generation of Scope 1 emissions. The Company’s emissions are primarily generated by its processing facilities as well by transportation of goods. 
                             Our processing facilities account for approximately 62% of our total Scope 1 emissions. Currently, none of our emissions are subject to a carbon- pricing regulation. 
                             We do not anticipate any such risks to emerge in the short-term; however, we continue to monitor emerging regulatory developments, including Nationally Determined Contributions per the 
                             Paris Agreement, and are assessing and utilizing these contributions to guide our medium- and long-term strategy to mitigate policy and legal risk. 
                             Our transportation infrastructure, including a company-owned fleet of marine, road, and rail vehicles, account for approximately 35% of our total Scope 1 emissions. 
                             Currently, none of our transportation-related emissions are subject to a carbon-pricing regulation. We are in the process of retiring old vehicles in favor of newer, more fuel-efficient 
                             vehicles based on normal attrition. However, in the medium to long term we anticipate the early retirement of some vehicles before the end of their useful life as part of our 
                             emissions-reduction strategy, discussed in the section below. 
                             """,
                             """
                             We anticipate that carbon-pricing regulations may emerge over the medium- to long-term timeframe, and such regulations may result in a significant financial impact to the company’s 
                             operations, including an increase in operating costs as well as potential capital expenditures to reduce emissions. 12 To mitigate this risk, the Company established a strategy to 
                             both reduce overall energy consumption by 20%, generate half of our energy from company- owned renewable sources, and achieve a 40% reduction in GHG emissions by 2035. 
                             """,
                             """The Company has suppliers located in areas that are subject to acute physical climate risks. 
                             The Company has evaluated risks to key products and has developed associated strategies to mitigate such risks, as noted below.  
                             The Company faces potential disruption to its ability to process peanuts into peanut oil as a result of increasing storm frequency and severity in 
                             the Gulf Coast, USA region. Such disruption could result in an adverse impact to the Company’s revenues. 
                             """,
                             """
                             EPA has authority under the Clean Air Act to monitor and regulate greenhouse gas emissions.
                             """,
                             """Energy saving solutions like ours will help the world in combating climate change and it will increase our expected yearly income in the future. """,
                             """The physical consequences of climate change, such as anticipated increasing storm activity over the carribean will most likely affect the quality and value of our production in the next few decades.""",
                             """We will increase our target of carbon emission reductions to 50% in comparison to 1990 for the next decade.""",
                             """The Company’s manufacturing facilities are located in regions that may be impacted by severe weather events, including potential damage to physical assets as well as disruptions to manufacturing activities.""",
                             """Hello world! How are you today?""",
                             """The physical consequences of climate change are the biggest risk to the world and may also affect our financial results in the near future.""",
                             """Customers may not view our response to climate change as sufficient, affecting our reputation and thus affecting our financial results."""
], columns=["text"])

inference_ds = Dataset.from_pandas(inference_df)

# Tokenize exactly like the test set: use the configured MAX_TOKEN_SIZE instead of a
# hard-coded 256, so both stay in sync when the setting is changed
inference_ds = inference_ds.map(lambda ds: identifier_tokenizer(
            ds["text"], truncation=True, padding='max_length', max_length=MAX_TOKEN_SIZE), batched=True)


# Score the examples with the identifier and turn the raw outputs into class probabilities
inference_result = identifier_model.predict(inference_ds)
preds = inference_result.predictions
preds_prob = softmax(preds, axis=1)
# NOTE(review): the rows of the output saved with this cell do not sum to 1, so that output
# was not produced by this softmax over the two identifier classes -- re-run to refresh it.
preds_prob

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




array([[0.03751682, 0.9096143 ],
       [0.02312849, 0.9407649 ],
       [0.02844577, 0.93055284],
       [0.13002154, 0.7899205 ],
       [0.04880824, 0.88986737],
       [0.03595606, 0.9212987 ],
       [0.10802615, 0.8122958 ],
       [0.05026895, 0.8973977 ],
       [0.5209146 , 0.44052956],
       [0.03670503, 0.9144431 ],
       [0.03286976, 0.9234117 ]], dtype=float32)