## Set Up
- Install SentenceTransformers library<br>
- Import relevant packages

In [None]:
!pip install -U sentence-transformers
# SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.

In [None]:
import os
import numpy as np
import pandas as pd
import string

from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L12-v2')  # Load the `all-MiniLM-L12-v2` Transformer model

In [4]:
input_path = "data/input/parsed_text/"  # path storing the parsed annual reports 
output_path = "data/output/relevant_text/"  # path storing the relevant text extracted from the parsed annual reports

filenames = os.listdir(input_path)

In [5]:
filenames

['WH_SMITH_2021_text.csv',
 'PANTHEON_INTERNATIONAL_2022_text.csv',
 'HAYS_2021_text.csv',
 'GENUS_2021_text.csv',
 'CHEMRING_GROUP_2021_text.csv',
 'SPECTRIS_2021_text.csv',
 'IG_GROUP_HOLDINGS_2021_text.csv',
 'FIDELITY_EMERGING_MARKETS_2022_text.csv',
 'MERCHANTS_TRUST_2022_text.csv',
 'ASTON_MARTIN_LAGONDA_GLOBAL_HOLDING_2021_text.csv']

## Helper Functions

In [6]:
def remove_punc(s):
  exclude = string.punctuation
  final_punc = ''.join(list(i for i in exclude if i not in ['%', '$', '&']))
  s = ''.join(ch for ch in s if ch not in list(final_punc))
  return s

def keyword_check(sentence, filter):
  tokenized = str(sentence).split()
  relevant_keywords = set(tokenized) & set(filter)
  return relevant_keywords  

def build_repository_matrix(relevant_df):
  relevant_df["text_processed"] = relevant_df.text_raw.map(remove_punc)
  relevant_sentences = list(relevant_df["text_processed"])  # repository of sentences relevant to FX hedging
  repository_matrix = model.encode(relevant_sentences)  # vectorize our relevant sentences into a matrix using sentence embeddings
  return repository_matrix

def relevance(sentence, threshold=50):
  input_sentence = model.encode([sentence]) # sentence embedding vectorization for input sentence
  similarity = cosine_similarity(input_sentence, repository_matrix) # compute cosine similarity
  angle = np.rad2deg(np.arccos(np.clip(similarity,-1,1))) # convert cosine similarity ==> radian ==> degree
  min_index = np.argmin(angle) # find index of smallest angle 
  min_angle = angle[0][min_index] # retrieve smallest angle
  if abs(min_angle) < threshold:
    return True, min_angle
  else:
    return False, min_angle

def text_extraction(df):

  # extracting the columns that we want to keep: (text_raw, text_partial_preprocessed, page_num)
  text_raw =  list(df["text_raw"])
  text_preprocessed = list(df["text_partial_preprocessed"])
  page_num = list(df["page_num"])
  combined = []
  for i in range(len(text_raw)):
    data = [text_raw[i], text_preprocessed[i], page_num[i]]
    combined.append(data)
  
  # preliminary sentence filtering & storing associated keywords --> crucial step for reduced time complexity!
  input_data = []
  for sentence in combined:
    relevant_keywords = keyword_check(sentence[1], keywords_filter)
    if bool(relevant_keywords):
      sentence.append(", ".join(list(relevant_keywords)))
      input_data.append(sentence)

  # relevance prediction
  relevant_results = []
  for sentence in input_data:
    label, score = relevance(sentence[1]) 
    if label:
      relevant_results.append((sentence[0], sentence[1], sentence[2], score, sentence[3]))

  # display results / metrics
  print("initial number of sentences: {}".format(df.shape[0]))
  print("after preliminary filtering: {}".format(len(input_data)))
  print("number of relevant sentences identified: {}".format(len(relevant_results)))

  return relevant_results

## Main Script

In [7]:
def CS_pipeline_main():

  # common keywords associated with FX hedging
  global keywords_filter
  keywords_filter = ["hedge", "hedges", "hedging", "foreign", "currency", "currencies", "forward", 
                    "contracts", "swaps", "cash flow", "fair value", "derivatives", "exposure", 
                    "rate", "risk", "risks"]
  metadata = []
  metadata_filename = "overview.csv"

  # reading relevant sentences dataset and building repository matrix
  relevant_df = pd.read_csv("data/repository.csv")

  global repository_matrix
  repository_matrix = build_repository_matrix(relevant_df)

  # reading input data
  for filename in filenames:
    old_filename, file_extension = os.path.splitext(filename)
    new_filename = old_filename + "_relevant" + file_extension
    input_csv = input_path + filename
    output_csv = output_path + new_filename

    df = pd.read_csv(input_csv, index_col=0)

  # running predictions, Sorting by cosine_similarity score, Saving into output file
    relevant_results = text_extraction(df)
    relevant_results_sorted = pd.DataFrame(relevant_results, columns = ['text_raw', 'text_partial_preprocessed', 'page_num', 'cosine_similarity_score', 'relevant_keywords']).sort_values(by="cosine_similarity_score", ascending=True)
    relevant_results_sorted.to_csv(output_csv, index=False)
    print(output_csv, "created...")
    print("------------------------------------------------------------")
    metadata.append([old_filename, df.shape[0], len(relevant_results)])

  # overview of metadata
  metadata_df = pd.DataFrame(metadata, columns = ['annual_report', 'total_num_of_sentences', 'num_of_relevant_sentences'])
  metadata_df["ratio"] = (metadata_df["num_of_relevant_sentences"] / metadata_df["total_num_of_sentences"]).round(6)
  metadata_df.sort_values(by="ratio", ascending=False, inplace=True)
  if os.path.exists(output_path + metadata_filename):
    old_metadata = pd.read_csv(output_path + metadata_filename)
    new_metadata = pd.concat([old_metadata, metadata_df])   
    os.remove(output_path + metadata_filename)
    new_metadata.to_csv(output_path + metadata_filename, index=False)
  else:
    metadata_df.to_csv(output_path + metadata_filename, index=False)

In [8]:
CS_pipeline_main()  # approx. 8 mins to process 10 annual reports

initial number of sentences: 3209
after preliminary filtering: 257
number of relevant sentences identified: 25
data/output/relevant_text/WH_SMITH_2021_text_relevant.csv created...
------------------------------------------------------------
initial number of sentences: 2254
after preliminary filtering: 195
number of relevant sentences identified: 26
data/output/relevant_text/PANTHEON_INTERNATIONAL_2022_text_relevant.csv created...
------------------------------------------------------------
initial number of sentences: 3355
after preliminary filtering: 259
number of relevant sentences identified: 26
data/output/relevant_text/HAYS_2021_text_relevant.csv created...
------------------------------------------------------------
initial number of sentences: 3080
after preliminary filtering: 338
number of relevant sentences identified: 42
data/output/relevant_text/GENUS_2021_text_relevant.csv created...
------------------------------------------------------------
initial number of sentences: 