# Multi-Label Baseline Models

This is the notebook containing End-To-End models for multi-label classification of CRO's, for both level using TF-IDF as input features for a set of classifiers

In [33]:
############################## CONFIG ##############################
# Task config
TASK = "binary" #@param ["multi-label", "binary"]
CATEGORY_LEVEL = 'cro_sub_type_combined' #@param ["cro", "cro_sub_type_combined"]
MODEL_TYPE = "baseline" #@param ["baseline", "transformer"]
MODEL_NAME = "svm"

# Dataset config
FILTER_OP = True #@param { type: "boolean"}
SCENARIO = "efficient-realistic" #@param [ "optimistic", "efficient-realistic", "realistic"]

# Evaluation metric config. See for context: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
AVERAGING_STRATEGY = 'macro' #@param ["micro",  "macro", "weighted"]

RESULTS_FILE_NAME = f"{CATEGORY_LEVEL}_{TASK}_results.csv"

# To make the notebook reproducible (not guaranteed for pytorch on different releases/platforms!)
SEED_VALUE = 42

##################

SCORING_METRIC = 'average_precision'  # 'average_precision', 'roc_auc'. AP is equal to Precision/Recall AUC! See for discussion: https://github.com/scikit-learn/scikit-learn/issues/5992

####################################################################
LOCAL_DIR = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Methodology/"
COLAB_DIR = "/content/drive/MyDrive/fin-disclosures-nlp"

if SCENARIO == "optimistic":
  TRAIN_NEG_SAMPLING_STRATEGY = "None"
  TEST_NEG_SAMPLING_STRATEGY = "None"

elif SCENARIO == "efficient-realistic":
  TRAIN_NEG_SAMPLING_STRATEGY = "only_OP"
  TEST_NEG_SAMPLING_STRATEGY = "all"

elif SCENARIO == "realistic":
  TRAIN_NEG_SAMPLING_STRATEGY = "all"
  TEST_NEG_SAMPLING_STRATEGY = "all"


parameters = {
    "task": TASK,
    "category_level": CATEGORY_LEVEL,
    "model_type": MODEL_TYPE,
    "model_name": MODEL_NAME,
    "scenario": SCENARIO,
    "seed_value": SEED_VALUE,
}

In [34]:
if is_running_in_colab:
  # Load Google drive where the data and models are stored
  from google.colab import drive
  drive.mount('/content/drive')

In [40]:
if is_running_in_colab:
  # Install transformers library + datasets helper
  !pip install transformers --quiet
  !pip install datasets --quiet
  !pip install optuna --quiet

  # Latex for output
  ! apt install texlive-latex-recommended -qq
  ! apt install texlive-latex-extra -qq
  ! apt install dvipng -qq
  ! apt install cm-super -qq

  # Load repository

  !git clone https://github.com/dafrie/fin-disclosures-nlp.git    
  %cd /content/fin-disclosures-nlp
  !git pull

%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np
import pandas as pd
sys.path.append('..')

from data import constants
from data import cro_dataset
from data import dataframe_preparation
from data import evaluation

DIR = COLAB_DIR if is_running_in_colab else LOCAL_DIR
DATA_DIR = os.path.join(DIR, "data", "labels")
MODELS_DIR = os.path.join(DIR, "models", MODEL_TYPE)
RESULTS_FILE_PATH = os.path.join(DIR, 'results', RESULTS_FILE_NAME)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
# Load/Initialize results file
results = evaluation.Results(RESULTS_FILE_PATH, parameters)

In [42]:
train_docs, train_doc_labels, test_docs, test_doc_labels = cro_dataset.prepare_datasets(
    data_dir=DATA_DIR,
    task=TASK, 
    cro_category_level=CATEGORY_LEVEL, 
    should_filter_op=FILTER_OP, 
    train_neg_sampling_strategy=TRAIN_NEG_SAMPLING_STRATEGY, 
    test_neg_sampling_strategy=TEST_NEG_SAMPLING_STRATEGY, 
    seed_value=SEED_VALUE
)

Loaded dataset. Train: 1103, Test: 28209, Dim: 1


In [43]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, average_precision_score, roc_auc_score

# Custom scorer so we can pass in the averaging strategy
avg_scorer = make_scorer(average_precision_score if SCORING_METRIC == 'average_precision' else roc_auc_score, average=AVERAGING_STRATEGY)

svc_clf = SVC(probability=True, random_state=SEED_VALUE, class_weight="balanced") # Balanced: n_samples / (n_classes * np.bincount(y)). Since we are doing OneVsRest, this should be giving correct weights!
# Wrap with OvR in case of multi-label
multi_label_clf = OneVsRestClassifier(svc_clf)

pipeline_svm = Pipeline([
    ('bow', CountVectorizer(strip_accents = 'ascii')),
    ('tfidf', TfidfTransformer()),
    ('classifier', multi_label_clf if TASK == "multi-label" else svc_clf),
     ])

# Parameters to tune automatically with a grid search
# Note: The nested estimator is accessible via the __estimator identifier
param_svm = [
  {
      'bow__tokenizer': [dataframe_preparation.spacy_tokenizer, None],
      'bow__stop_words': ['english', None],
      'bow__ngram_range': [(1, 1), (1, 2)],
      'bow__max_features': [50, 200],
      'tfidf__use_idf': (True), 
      'classifier__estimator__C' if TASK == "multi-label" else 'classifier__C': [1, 10, 100], 
      'classifier__estimator__kernel' if TASK == "multi-label" else 'classifier__kernel': ['linear', 'rbf']},
]

# TODO: Remove
param_svm = [
  {
      'bow__tokenizer': [dataframe_preparation.spacy_tokenizer],
      'bow__stop_words': ['english'],
      'bow__ngram_range': [(1, 2)],
      'bow__max_features': [200],
      'tfidf__use_idf': [True], 
      'classifier__estimator__C' if TASK == "multi-label" else 'classifier__C': [10], 
      'classifier__estimator__kernel' if TASK == "multi-label" else 'classifier__kernel': ['linear']},
]

grid_clf = GridSearchCV(
    pipeline_svm,
    param_grid=param_svm,
    refit=True,
    n_jobs=-1, 
    scoring=avg_scorer,
)

# Grid search fitting
grid_clf.fit(train_docs, train_doc_labels)

GridSearchCV(estimator=Pipeline(steps=[('bow',
                                        CountVectorizer(strip_accents='ascii')),
                                       ('tfidf', TfidfTransformer()),
                                       ('classifier',
                                        SVC(class_weight='balanced',
                                            probability=True,
                                            random_state=42))]),
             n_jobs=-1,
             param_grid=[{'bow__max_features': [200],
                          'bow__ngram_range': [(1, 2)],
                          'bow__stop_words': ['english'],
                          'bow__tokenizer': [<function spacy_tokenizer at 0x7fc06859e550>],
                          'classifier__C': [10],
                          'classifier__kernel': ['linear'],
                          'tfidf__use_idf': [True]}],
             scoring=make_scorer(average_precision_score, average=macro))

In [None]:
cv_results = pd.DataFrame(grid_clf.cv_results_)

print(f"Best {SCORING_METRIC} score: {grid_clf.best_score_}")
print(f"Best params: \n{grid_clf.best_params_}")

In [None]:
# train_preds = grid_clf.predict(train_docs)
train_preds_prob = grid_clf.predict_proba(train_docs)

In [None]:
train_eval_scores, best_roc_threshold, best_pr_threshold = evaluation.threshold_moving_report(train_doc_labels, train_preds_prob)
results.log_experiment(train_eval_scores, prefix="train")
results.log_experiment({ "best_pr_threshold": best_pr_threshold.values(), "best_roc_threshold": best_roc_threshold.values()}, prefix="train")
train_eval_scores

In [None]:
# Predict for test
test_preds_prob = grid_clf.predict_proba(test_docs)

In [None]:
test_eval_scores = evaluation.test_evaluation_report(test_doc_labels, test_preds_prob, best_pr_threshold.values(), averaging=AVERAGING_STRATEGY)
results.log_experiment(test_eval_scores, prefix="test")
test_eval_scores

In [None]:
import pickle

SAVE_MODEL = False

if SAVE_MODEL:
    with open(os.path.join(MODELS_DIR), f" {TASK}_svm_{CATEGORY_LEVEL}.pkl", 'wb') as f:
        grid_clf.label_list = label_list
        pickle.dump(grid_clf, f, 4)