In [2]:
#%pip install transformers
#%pip install torch

Collecting transformers
  Using cached transformers-4.30.0-py3-none-any.whl (7.2 MB)
Collecting tqdm>=4.27
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m858.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting safetensors>=0.3.1
  Downloading safetensors-0.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting regex!=2019.12.17
  Downloading regex-2023.6.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.3/772.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting huggingface-hub<1.0,>=0.14.1
  Using cached huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import torch
from time import time

torch.no_grad()
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("device:", device)

from scipy.special import softmax
from tqdm.notebook import tqdm
from transformers import ZeroShotClassificationPipeline, pipeline

device: cuda:0


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class MyPipeline(ZeroShotClassificationPipeline):
    def postprocess(self, model_outputs, multi_label=False):
      candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
      sequences = [outputs["sequence"] for outputs in model_outputs]
      logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
      N = logits.shape[0]
      n = len(candidate_labels)
      num_sequences = N // n
      reshaped_outputs = logits.reshape((num_sequences, n, -1))

      if multi_label or len(candidate_labels) == 1:
          # softmax over the entailment vs. contradiction dim for each label independently
          ids=classifier.model.config.label2id
          entailment_id = ids["entailment"]
          contradiction_id = ids["contradiction"]
          neutral_id = ids["neutral"]
          entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id, neutral_id]]
          scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
          scores = scores[..., 1]
      else:
          # softmax the "entailment" logits over all candidate labels
          entail_logits = reshaped_outputs[..., self.entailment_id]
          scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)

      top_inds = list(reversed(scores[0].argsort()))
      return {
          "sequence": sequences[0],
          "labels": [candidate_labels[i] for i in top_inds],
          "scores": scores[0, top_inds].tolist(),
      }

In [4]:
classifier = pipeline(model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
                      #model="facebook/bart-large-mnli",
                      pipeline_class=MyPipeline, 
                      device=device)

In [5]:
df = pd.read_pickle("paragraphs_stoxx.pkl")
df_m = pd.read_pickle("master_stoxx.pkl")
swiss_re_reports = df_m[df_m.company.isin(["SwissReAG"])].report_id.tolist()
swiss_re = df[df.report_id.isin(swiss_re_reports)]
swiss_re = swiss_re[swiss_re.n_words > 10]
swiss_re = swiss_re.sort_values(by=['n_words'])
swiss_re

Unnamed: 0,report_id,page_no,paragraph_nr,text,n_words,loss_kw,unexpected_kw
2308538,SwissReAG-AR_2020,166,2063,R e g u la to ry m at te rs,11,False,False
2293345,SwissReAG-AR_2004,136,1808,How much influence does event-based modelling ...,11,False,False
2309868,SwissReAG-AR_2020,248,3393,US IG corporate bond portfolio US Corp IG ESG ...,11,False,False
2309871,SwissReAG-AR_2020,248,3396,UK IG corporate bond portfolio UK Corp IG ESG ...,11,False,False
1774249,SwissReAG-AR_2012,57,586,42% Americas 1 027 ASIA-Pacific 64.2% EMEA 25....,11,False,False
...,...,...,...,...,...,...,...
2326785,SwissReAG-AR_1967,9,44,Investments The income from investments again ...,522,True,False
2359535,SwissReAG-AR_2013,144,1573,date news Method of dissemination 3 January Sw...,525,True,False
2345319,SwissReAG-AR_1963,12,44,The last General Meeting re-elected Mr. Karl B...,567,False,False
1772604,SwissReAG-AR_2017,379,4022,Forward-looking statements typically are ident...,581,True,True


In [6]:
candidate_labels = ["a business loss or adverse development", 
                    "a loss",
                    "an adverse development",
                    "an unexpected event",
                    "a surprising event",
                    "a surprising development",
                    "a unexpected development",
                    "a surprising or unexpected development",
                    "a surprising or unexpected event"]
hypothesis_template = "This statement describes {}."

In [15]:
files = [x.split(".")[0] for x in os.listdir(f"Swiss_Re_DeBERTa-v3") if x[-4:] == ".pkl"]
starting_index = max( [int(x) for x in files] ) if files else -1
print(starting_index)

42


In [21]:
from tqdm import tqdm

In [22]:
n = 1_008
for i in tqdm(range(starting_index+1, int(np.ceil(swiss_re.shape[0]/n)))):
  sequences = swiss_re.iloc[(i*n):((i+1)*n)].text.to_list()
  pipe_results = classifier(sequences, 
                          candidate_labels, 
                          hypothesis_template=hypothesis_template, 
                          multi_label=True,
                          batch_size=24)
  res = pd.DataFrame(pipe_results)
  res = res.explode(["labels", "scores"])\
           .pivot(columns="labels",values="scores")\
           .rename(columns={
              "a business loss or adverse development": "L_A_dev", 
              "a loss": "L",
              "an adverse development": "A_dev",
              "an unexpected event": "U_ev",
              "a surprising event": "S_ev",
              "a surprising development": "S_dev",
              "a unexpected development": "U_dev",
              "a surprising or unexpected development": "S_U_dev",
              "a surprising or unexpected event": "S_U_ev"})
  res.set_index(swiss_re.iloc[i*n:((i+1)*n)].index, inplace=True)
  res[["report_id", "paragraph_nr"]] = swiss_re.iloc[(i*n):((i+1)*n)][["report_id", "paragraph_nr"]]
  res.to_pickle(f"Swiss_Re_DeBERTa-v3/{i}.pkl")




  0%|          | 0/15 [00:00<?, ?it/s][A[A[A


  7%|▋         | 1/15 [10:13<2:23:11, 613.67s/it][A[A[A


 20%|██        | 3/15 [31:23<2:06:13, 631.16s/it][A[A[A


 27%|██▋       | 4/15 [42:24<1:57:52, 642.97s/it][A[A[A


 33%|███▎      | 5/15 [54:09<1:50:52, 665.23s/it][A[A[A


 40%|████      | 6/15 [1:06:07<1:42:29, 683.30s/it][A[A[A


 47%|████▋     | 7/15 [1:18:30<1:33:42, 702.85s/it][A[A[A


 60%|██████    | 9/15 [1:44:57<1:15:18, 753.04s/it][A[A[A





Exception ignored in: <function tqdm.__del__ at 0x7fd9cdd1feb0>
Traceback (most recent call last):
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7fd9cdd1feb

In [23]:
files = [x.split(".")[0] for x in os.listdir(f"Swiss_Re_DeBERTa-v3") if x[-4:] == ".pkl"]
starting_index = max( [int(x) for x in files] ) if files else -1
print(starting_index)

57


In [32]:
df = pd.DataFrame({})
for x in tqdm(range(starting_index)):
    df = pd.concat([df, pd.read_pickle(f"Swiss_Re_DeBERTa-v3/{x}.pkl")], ignore_index=False)

100%|██████████| 57/57 [00:01<00:00, 44.77it/s]


In [35]:
df.head()

labels,L_A_dev,L,S_dev,S_ev,S_U_dev,S_U_ev,U_dev,A_dev,U_ev,report_id,paragraph_nr
2308538,0.084783,0.016385,0.004853,0.004972,0.39657,0.278761,0.012138,0.015624,0.013598,SwissReAG-AR_2020,2063
2293345,0.037174,0.005705,0.003604,0.003324,0.174486,0.431465,0.010967,0.016735,0.050623,SwissReAG-AR_2004,1808
2309868,0.061951,0.001881,0.001068,0.001066,0.641089,0.529536,0.001646,0.00368,0.002908,SwissReAG-AR_2020,3393
2309871,0.05634,0.001755,0.001084,0.001082,0.646112,0.5458,0.001653,0.003227,0.002986,SwissReAG-AR_2020,3396
1774249,0.005489,0.002543,0.000293,0.000395,0.078103,0.083594,0.000675,0.001259,0.002368,SwissReAG-AR_2012,586


In [28]:
df.to_pickle("Swiss_Re_DeBERTa-v3/compiled.pkl")

### Timing

In [16]:
n = 48
i = 0
sequences = swiss_re.iloc[:int(n/2)].text.to_list() + swiss_re.iloc[-int(n/2):].text.to_list()

In [17]:
import random 
random_sequences = random.sample(sequences, n)

In [18]:
%%time 
pipe_results = classifier(sequences, 
                        candidate_labels, 
                        hypothesis_template=hypothesis_template, 
                        multi_label=True,
                        batch_size=24)

CPU times: user 1min 27s, sys: 767 ms, total: 1min 28s
Wall time: 1min 28s


In [19]:
%%time
pipe_results = classifier(sequences, 
                        candidate_labels, 
                        hypothesis_template=hypothesis_template, 
                        multi_label=True,
                        batch_size=25)

CPU times: user 1min 34s, sys: 790 ms, total: 1min 35s
Wall time: 1min 35s


In [20]:
%%time
pipe_results = classifier(sequences, 
                        candidate_labels, 
                        hypothesis_template=hypothesis_template, 
                        multi_label=True)



CPU times: user 2min 5s, sys: 120 ms, total: 2min 5s
Wall time: 2min 5s


In [21]:
%%time 
pipe_results = classifier(random_sequences, 
                        candidate_labels, 
                        hypothesis_template=hypothesis_template, 
                        multi_label=True,
                        batch_size=24)

CPU times: user 2min 20s, sys: 1.18 s, total: 2min 21s
Wall time: 2min 21s


In [22]:
%%time
pipe_results = classifier(random_sequences, 
                        candidate_labels, 
                        hypothesis_template=hypothesis_template, 
                        multi_label=True)



CPU times: user 2min 5s, sys: 83.6 ms, total: 2min 6s
Wall time: 2min 5s


In [37]:
sequences = swiss_re.iloc[:n].text.to_list()

In [38]:
%%time 
pipe_results = classifier(sequences, 
                        candidate_labels, 
                        hypothesis_template=hypothesis_template, 
                        multi_label=True,
                        batch_size=24)

CPU times: user 10.2 s, sys: 4.33 ms, total: 10.2 s
Wall time: 10.2 s
