# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install simpletransformers &> /dev/null

In [3]:
import os
import pandas as pd
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)

In [4]:
DATA_INPUT_PATH = "/content/drive/MyDrive/fin-disclosures-nlp/data/inference/"
MODELS_DIR = "/content/drive/MyDrive/fin-disclosures-nlp/models"

## Data loading

In [5]:
df_inference_300 = pd.read_csv(os.path.join(DATA_INPUT_PATH, "stox300_reports_paragraphs.csv")) 
#df_inference_50 = pd.read_csv(os.path.join(DATA_INPUT_PATH, "stox50_reports_paragraphs.csv")) 

## Model loading

In [6]:

# Load the saved model
model = MultiLabelClassificationModel(
    "roberta",
    os.path.join(MODELS_DIR, "multi-label", "realistic_roberta-large_cro_sub_type"),
)

# Prediction

## Trial predictions

In [7]:
predictions, raw_outputs = model.predict(["Storms, floodings and other climate related disasters could affect our sales volumes.", "Regulation because of climate change, for example carbon taxes might affect the profitability of our business and could lead to lower income."])
predictions, raw_outputs

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

([[1, 0, 0, 0, 0], [0, 0, 1, 1, 1]],
 array([[0.93164062, 0.18664551, 0.03210449, 0.02276611, 0.03710938],
        [0.16662598, 0.2668457 , 0.92626953, 0.86230469, 0.8125    ]]))

## Predict whole dataset

In [8]:
is_first_half = True
df_inference_300.loc[(df_inference_300.text != df_inference_300.text), "text"] = ""
inference_df = df_inference_300.loc[0:1000000] if is_first_half else df_inference_300.loc[1000000:]
inference_docs = inference_df.text.to_list()

In [9]:
import numpy as np
thresholds = [0.970703125, 0.958984375, 0.818359375, 0.90380859375, 0.7890625]
CATEGORY_CODES= ["ACUTE", "CHRON", "POLICY", "MARKET", "REPUT"]

def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

all_predictions = []
all_raw_outputs = []
n_in_chunk = 20000
chunks = chunk_list(inference_docs, n_in_chunk)
for idx, c in enumerate(chunks):
  print(f"Processing chunk {idx + 1}")
  predictions, raw_outputs = model.predict(c)
  predictions = (raw_outputs > np.array([i for i in thresholds])) * 1
  all_predictions.extend(predictions.tolist())
  all_raw_outputs.extend(raw_outputs.tolist())

Processing chunk 1


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 2


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 3


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 4


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 5


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 6


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 7


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 8


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 9


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 10


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 11


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 12


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 13


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 14


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 15


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 16


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 17


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 18


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 19


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 20


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 21


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 22


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 23


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 24


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 25


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 26


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 27


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 28


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 29


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 30


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 31


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 32


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 33


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 34


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 35


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 36


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 37


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 38


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 39


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 40


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 41


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 42


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 43


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 44


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 45


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 46


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 47


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 48


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 49


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 50


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Processing chunk 51


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
inference_df["preds_labels"] = all_predictions
inference_df["preds_prob"] = all_raw_outputs

inference_df.to_csv(os.path.join(DATA_INPUT_PATH, "inference_output_first.csv" if is_first_half else "inference_output_second.csv"))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
