In [1]:
import os
import sys
import ast
import random
import numpy as np 
import pandas as pd
from pinecone import Pinecone

import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

sys.path.append('..')

from main import VectorDatabase, BiEncoder, CrossEncoder

  from tqdm.autonotebook import tqdm


In [2]:
import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

## Initialize Database

In [3]:
# Credentials must not live in the notebook: read the Pinecone key from the
# environment (raises KeyError naming the variable if it is not set).
# NOTE: any key that was ever committed in this cell should be treated as leaked and rotated.
API_KEY = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=API_KEY)

# List the indexes visible to this key as a quick connectivity check.
indexes = pc.list_indexes()
print(indexes)

[{
    "name": "finbert",
    "dimension": 768,
    "metric": "dotproduct",
    "host": "finbert-im26dq4.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}]


In [4]:
# Index settings passed to VectorDatabase.start_db in the next cell.
INDEX_NAME = 'finbert'
DIMENSION = 768 
CLOUD = 'aws'
# NOTE(review): the 'finbert' index printed by pc.list_indexes() above reports
# region "us-east-1", not 'us-west-1' — confirm which value start_db should receive.
REGION = 'us-west-1'

In [5]:
# Project wrapper around the Pinecone client (imported from main).
vector_db = VectorDatabase(api_key=API_KEY)
# `handler` is the object used later for delete_all(), index.upsert() and query_vector().
# Whether start_db creates the index or only connects to it is not visible here — TODO confirm.
handler = vector_db.start_db(index_name=INDEX_NAME, dimension=DIMENSION, cloud=CLOUD, region=REGION)

## Read Data

In [6]:
# Resolve the FinDER CSV locations relative to the notebook's parent directory.
data_folder_path = os.path.join('..', 'data')
queries_csv_path = os.path.join(data_folder_path, "FinDER/queries.csv")
corpus_csv_path = os.path.join(data_folder_path, "FinDER/corpus.csv")

# The first CSV column is used as the index in both frames.
query_df = pd.read_csv(queries_csv_path, index_col=0)
document_df = pd.read_csv(corpus_csv_path, index_col=0)
query_df

Unnamed: 0,text,Related Documents
q00001,What are the service and product offerings fro...,"['MSFT20230014', 'MSFT20230015']"
q00002,MSFT segment breakdown,[]
q00003,Who are Microsoft`s key customers?,[]
q00004,What is Microsoft`s business model,[]
q00005,MSFT Capex commitment,[]
...,...,...
q00214,How many distinct insurance underwriting group...,[]
q00215,What is the ticker symbol for Berkshire Hathaw...,[]
q00216,What is the largest operating segment of the B...,[]
q00217,Source of invested assets of insurance busines...,[]


## Embed Documents and Save to DB

In [7]:
# Switch for the two cells below: when True the index is cleared (handler.delete_all)
# and every document is re-embedded and upserted; when False both steps are skipped.
embed_documents = False

In [8]:
# Load the ProsusAI/finbert tokenizer and base model; both are handed to BiEncoder in the next cell.
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
model = AutoModel.from_pretrained('ProsusAI/finbert')

In [9]:
# Project bi-encoder (from main); used below via encode() for queries and encode_batch() for documents.
encoder = BiEncoder(tokenizer, model)

In [10]:
# Replace missing document texts with "" (this overwrites the column in document_df in place),
# then materialise the texts as a plain list of str.
document_df["text"] = document_df["text"].fillna("") 
texts = document_df["text"].astype(str).tolist() 

In [11]:
# Clear the index before re-embedding. Only ordinary exceptions are caught (a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit), and the underlying
# error is shown so a real failure is not mistaken for an empty index.
if embed_documents:
    try:
        handler.delete_all()
    except Exception as exc:
        # presumably raised when there is nothing to delete — TODO confirm against VectorDatabase
        print("No data in index to delete")
        print(f"(delete_all raised {type(exc).__name__}: {exc})")

In [12]:
if embed_documents:

    batch_size = 100
    document_df["text"] = document_df["text"].fillna("")
    document_df["title"] = document_df["title"].fillna("")
    texts = document_df["text"].astype(str).tolist()
    titles = document_df["title"].astype(str).tolist()

    # Number of documents to embed (currently all of them); lower it for a quick partial run.
    limit = len(texts)

    def batch_upsert(titles, texts, batch_size):
        """Embed documents in batches and upsert them into the vector index.

        For each batch, titles and texts are encoded separately with
        ``encoder.encode_batch`` and averaged element-wise; the result is
        upserted under the matching ``document_df`` index labels.

        Args:
            titles: list of document titles, aligned with ``document_df``.
            texts: list of document texts, aligned with ``document_df``.
            batch_size: number of documents per encode/upsert call.

        Returns:
            List of starting row offsets of the batches that failed (empty
            when everything was upserted). A failing batch is reported and
            skipped so the remaining batches are still processed.
        """
        failed_offsets = []

        for i in range(0, limit, batch_size):
            batch_number = i // batch_size + 1
            try:
                batch_texts = texts[i:i+batch_size]
                batch_titles = titles[i:i+batch_size]
                batch_indexes = document_df.index[i:i+batch_size]

                # Encode the titles and texts separately
                encoded_titles = encoder.encode_batch(batch_titles)
                encoded_texts = encoder.encode_batch(batch_texts)

                # Element-wise mean of the title and text embeddings
                pooled_embeddings = (encoded_titles + encoded_texts) / 2.0

                # (id, vector) pairs for the upsert call
                batch_data = [(str(idx), embedding.tolist()) for idx, embedding in zip(batch_indexes, pooled_embeddings)]

                handler.index.upsert(vectors=batch_data)
                print(f"Upserted batch {batch_number}")

            except Exception as e:
                # Keep going, but record which batch failed so it can be retried.
                failed_offsets.append(i)
                last_row = min(i + batch_size, limit) - 1
                print(f"Error in batch {batch_number} (rows {i}-{last_row}): {e}")

        if failed_offsets:
            print(f"{len(failed_offsets)} batch(es) failed; starting rows: {failed_offsets}")

        return failed_offsets


    # Call the function with both titles and texts
    batch_upsert(titles, texts, batch_size=batch_size)

## Retrieve

In [13]:
# One row per query, each starting with its own empty list of retrieved document IDs.
retrieved_df = pd.DataFrame(
    {"Documents": [[] for _ in query_df.index]},
    index=query_df.index,
)
retrieved_df

Unnamed: 0,Documents
q00001,[]
q00002,[]
q00003,[]
q00004,[]
q00005,[]
...,...
q00214,[]
q00215,[]
q00216,[]
q00217,[]


In [14]:
# First-stage retrieval: embed each query with the bi-encoder and keep the IDs
# of the 20 matches returned by the vector index.
for idx, query_text in query_df["text"].items():
    query = np.array(encoder.encode(query_text), dtype=np.float32)

    # The index handler is given a plain Python list rather than an ndarray.
    query_list = query.tolist()

    results = handler.query_vector(query_list, top_k=20)
    retrieved_df.at[idx, "Documents"] = [match["id"] for match in results]

In [15]:
retrieved_df

Unnamed: 0,Documents
q00001,"[ADBE20230209, BRK.A20230330, ADBE20230173, AD..."
q00002,"[ADBE20231062, TSLA20230439, TSLA20230008, TSL..."
q00003,"[GOOGL20230652, ORCL20231713, MSFT20230446, MS..."
q00004,"[LIN20231708, CPNG20231083, BRK.A20230330, PG2..."
q00005,"[MSFT20230430, MSFT20230429, ADBE20230975, NVD..."
...,...
q00214,"[MSFT20230238, ADBE20231055, PG20230206, MSFT2..."
q00215,"[MSFT20230446, TSLA20231571, ADBE20231748, DAL..."
q00216,"[ORCL20230433, NFLX20230009, AMZN20231245, TSL..."
q00217,"[BRK.A20230463, BRK.A20230080, BRK.A20230089, ..."


## Re-Rank

In [16]:
# Load ProsusAI/finbert again for the re-ranking stage. This rebinds the names
# `tokenizer` and `model` used when building the bi-encoder.
# NOTE(review): presumably a separate model instance is loaded so that fine-tuning the
# cross-encoder does not modify the weights held by `encoder` — confirm.
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
model = AutoModel.from_pretrained('ProsusAI/finbert')

In [17]:
# Project cross-encoder (from main); below it is called as encode(query, text) and fine-tuned.
cross_encoder = CrossEncoder(tokenizer=tokenizer,model=model)

In [18]:
class ContrastiveDataset(Dataset):
    """
    Dataset of (query, positive document, negative document) text triples.

    Queries without any related document are dropped. Each access draws one
    positive at random from the query's related documents and one negative at
    random from the documents that are not related to the query.

    Args:
        queries: iterable of query texts.
        document_dict: mapping from document ID to document text.
        related_docs: iterable, aligned with ``queries``, of lists of related
            document IDs.
        all_documents: iterable of every candidate document ID.
    """

    # Number of random draws tried before falling back to an explicit scan of the corpus.
    _MAX_REJECTION_DRAWS = 32

    def __init__(self, queries, document_dict, related_docs, all_documents):
        # Filter queries and their related docs together so the two lists stay aligned.
        labelled = [(q, rel_docs) for q, rel_docs in zip(queries, related_docs) if len(rel_docs) > 0]
        self.queries = [q for q, _ in labelled]
        self.related_docs = [rel_docs for _, rel_docs in labelled]

        self.document_dict = document_dict  # Maps doc IDs to actual texts
        # Materialise once so repeated random indexing does not depend on the input container type.
        self.all_documents = list(all_documents)  # List of all document IDs

    def __len__(self):
        """Number of queries that have at least one related document."""
        return len(self.queries)

    def _sample_negative_id(self, positive_doc_ids):
        """Return a uniformly random document ID that is not in ``positive_doc_ids``.

        Raises:
            IndexError: if every document is a positive for this query.
        """
        excluded = set(positive_doc_ids)

        # Rejection sampling avoids rebuilding the candidate list on every access.
        if self.all_documents:
            for _ in range(self._MAX_REJECTION_DRAWS):
                candidate = random.choice(self.all_documents)
                if candidate not in excluded:
                    return candidate

        # Fallback: explicit scan (only reached when positives cover most of the corpus).
        candidates = [doc for doc in self.all_documents if doc not in excluded]
        if not candidates:
            raise IndexError("No negative document available: every document is marked as related to the query")
        return random.choice(candidates)

    def __getitem__(self, idx):
        """Return ``(query, positive_doc_text, negative_doc_text)`` for the ``idx``-th kept query."""
        query = self.queries[idx]
        positive_doc_ids = self.related_docs[idx]

        # Select a random positive document from the related documents
        positive_doc_id = random.choice(positive_doc_ids)
        positive_doc = self.document_dict[positive_doc_id]

        # Sample a negative document ID from the rest of the documents
        negative_doc_id = self._sample_negative_id(positive_doc_ids)
        negative_doc = self.document_dict[negative_doc_id]

        return query, positive_doc, negative_doc


In [19]:
def contrastive_loss(query_embeddings, positive_doc_embeddings, negative_doc_embeddings, temperature=0.07):
    """InfoNCE-style loss with one positive and one negative per query.

    Args:
        query_embeddings: tensor of query embeddings, one row per example.
        positive_doc_embeddings: tensor of the same shape for the positive documents.
        negative_doc_embeddings: tensor of the same shape for the negative documents.
        temperature: divisor applied to the cosine similarities; smaller
            values sharpen the contrast.

    Returns:
        Scalar tensor: the mean over examples of
        ``-log(exp(pos/T) / (exp(pos/T) + exp(neg/T)))``.
    """
    # Cosine similarity between query and positive/negative document pairs
    pos_sim = F.cosine_similarity(query_embeddings, positive_doc_embeddings)
    neg_sim = F.cosine_similarity(query_embeddings, negative_doc_embeddings)

    # -log(e^{p/T} / (e^{p/T} + e^{n/T})) == log(1 + e^{(n - p)/T}) == softplus((n - p)/T).
    # The softplus form never materialises exp(sim / T), which overflows to inf
    # (and then yields nan/inf losses) once the temperature gets small.
    loss = F.softplus((neg_sim - pos_sim) / temperature)

    return loss.mean()

In [20]:
def fine_tune(encoder: CrossEncoder, dataset, epochs=3, batch_size=8, learning_rate=1e-5, temperature=0.07):
    """Fine-tune ``encoder`` with the contrastive loss and return it.

    Args:
        encoder: model exposing ``parameters()``, ``train()`` and
            ``encode(text_a, text_b)``; it is updated in place.
        dataset: dataset yielding ``(query, positive_doc, negative_doc)`` triples.
        epochs: number of passes over ``dataset``.
        batch_size: triples per optimisation step.
        learning_rate: Adam learning rate.
        temperature: forwarded to ``contrastive_loss``.

    Returns:
        The same ``encoder`` object, left in training mode.
    """
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    adam = optim.Adam(encoder.parameters(), lr=learning_rate)

    encoder.train()

    def embed_pairs(left_texts, right_texts):
        # One encode() call per pair, stacked into a single batch tensor.
        return torch.stack([encoder.encode(left, right) for left, right in zip(left_texts, right_texts)])

    for epoch in range(epochs):
        total_loss = 0.0

        for queries, positive_docs, negative_docs in data_loader:
            adam.zero_grad()

            # The query representation is the encoder applied to the query paired with itself;
            # documents are encoded jointly with their query.
            query_embeddings = embed_pairs(queries, queries)
            positive_doc_embeddings = embed_pairs(queries, positive_docs)
            negative_doc_embeddings = embed_pairs(queries, negative_docs)

            loss = contrastive_loss(
                query_embeddings,
                positive_doc_embeddings,
                negative_doc_embeddings,
                temperature,
            )
            total_loss += loss.item()

            loss.backward()
            adam.step()

        # Mean batch loss for the epoch
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(data_loader)}")

    return encoder

In [21]:
# Seed the RNGs used by the dataset (`random`) and by DataLoader shuffling / dropout (`torch`)
# so that a fresh Restart & Run All produces the same training run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# "Related Documents" is stored as the string form of a list, hence ast.literal_eval.
dataset = ContrastiveDataset(query_df["text"], document_df["text"].to_dict(), query_df["Related Documents"].apply(ast.literal_eval),  document_df.index)
cross_encoder = fine_tune(cross_encoder, dataset, epochs=3, batch_size=8, learning_rate=1e-5, temperature=0.07)

Epoch [1/3], Loss: 1.2426532544195652
Epoch [2/3], Loss: 0.6164814867079258
Epoch [3/3], Loss: 0.4872545227408409


In [23]:
def compute_cosine_similarity(embedding_a, embedding_b):
    """Return the cosine similarity of two embedding tensors as a Python float."""
    # Add a leading batch axis to each input, since cosine_similarity compares along dim 1.
    batched_a = embedding_a[None]
    batched_b = embedding_b[None]
    similarity = F.cosine_similarity(batched_a, batched_b)
    return similarity.item()

def rerank_documents(encoder, query, documents, top_k=5):
    """Re-rank candidate documents for a query and return the best ``top_k`` IDs.

    Each candidate's text is read from ``document_df`` and encoded together
    with the query; candidates are ordered by the cosine similarity between
    that embedding and the encoder's embedding of the query paired with itself.

    Args:
        encoder: object exposing ``encode(text_a, text_b)`` returning a tensor.
        query: query text.
        documents: candidate document IDs (labels of ``document_df``).
        top_k: number of IDs to return.

    Returns:
        List of at most ``top_k`` document IDs, highest similarity first.
    """
    if len(documents) == 0:
        return []

    # Scoring is inference: if the encoder reports training mode (fine_tune leaves it
    # that way), switch to eval for the duration of the call and restore the mode afterwards.
    was_training = bool(getattr(encoder, "training", False))
    if was_training:
        encoder.eval()

    ranked_results = []
    try:
        # No gradients are needed for ranking; skipping autograd saves memory and time.
        with torch.no_grad():
            query_embedding = encoder.encode(query, query)

            for doc_id in documents:
                document_text = document_df.loc[doc_id, 'text']
                document_embedding = encoder.encode(query, document_text)
                similarity_score = compute_cosine_similarity(query_embedding, document_embedding)
                ranked_results.append((doc_id, similarity_score))
    finally:
        if was_training:
            encoder.train()

    # Highest similarity first (stable for ties, so ties keep the incoming order)
    ranked_results.sort(key=lambda pair: pair[1], reverse=True)

    return [doc_id for doc_id, _ in ranked_results[:top_k]]

# Second stage: re-rank each query's retrieved candidates and keep the top 3.
# The result overwrites the candidate list in retrieved_df, so re-running this cell
# re-ranks the already reduced lists rather than the original candidates.
for idx, query in query_df["text"].items():
    related_docs = retrieved_df.at[idx, "Documents"]
    reranked_docs = rerank_documents(cross_encoder, query, related_docs, top_k=3)
    retrieved_df.at[idx, "Documents"] = reranked_docs

In [24]:
retrieved_df

Unnamed: 0,Documents
q00001,"[NVDA20230430, MSFT20231710, MSFT20230975]"
q00002,"[TSLA20230439, ADBE20230975, LIN20230371]"
q00003,"[GOOGL20230652, NVDA20230140, ADBE20231748]"
q00004,"[PG20231472, MSFT20231788, PG20230011]"
q00005,"[GOOGL20230760, ADBE20230986, ADBE20231131]"
...,...
q00214,"[PG20230206, MSFT20231865, TSLA20230007]"
q00215,"[MSFT20230446, NFLX20230219, NVDA20230979]"
q00216,"[ORCL20230433, BRK.A20230715, TSLA20230439]"
q00217,"[NVDA20230395, TSLA20230886, BRK.A20230080]"


## Evaluate

In [25]:
def evaluate_retrieval(actual_related_ids, retrieved_docs_ids, top_k=None):
    """Compute macro-averaged precision, recall and the F1 of those averages.

    Args:
        actual_related_ids: iterable with, per query, the relevant document
            IDs. Each entry may be a list/tuple/set of IDs or the string form
            of one (e.g. ``"['A', 'B']"`` as read from a CSV).
        retrieved_docs_ids: iterable, aligned with ``actual_related_ids``, of
            retrieved document IDs in rank order (same accepted forms).
        top_k: if not None, only the first ``top_k`` retrieved IDs are scored.

    Returns:
        ``(avg_precision, avg_recall, avg_f1)``. Queries with no relevant
        documents contribute 0 to both averages; ``avg_f1`` is the harmonic
        mean of the two averages.
    """
    def _as_id_list(value):
        # Normalise one entry to a list of IDs.
        if value is None:
            return []
        if isinstance(value, float) and value != value:  # NaN from a missing CSV cell
            return []
        if isinstance(value, str):
            # Without parsing, set("['A']") would be a set of characters and never match an ID.
            if not value.strip():
                return []
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return [value]  # a bare string is treated as a single document ID
            if isinstance(parsed, (list, tuple, set)):
                return list(parsed)
            return [value]
        return list(value)

    precisions = []
    recalls = []

    for actual_ids, retrieved_ids in zip(actual_related_ids, retrieved_docs_ids):
        actual_ids = _as_id_list(actual_ids)
        retrieved_ids = _as_id_list(retrieved_ids)

        # `is not None` so that top_k=0 means "score nothing" rather than "no cutoff"
        if top_k is not None:
            retrieved_ids = retrieved_ids[:top_k]

        actual_set = set(actual_ids)
        retrieved_set = set(retrieved_ids)

        true_positives = len(actual_set & retrieved_set)
        precisions.append(true_positives / len(retrieved_set) if retrieved_set else 0.0)
        recalls.append(true_positives / len(actual_set) if actual_set else 0.0)

    avg_precision = sum(precisions) / len(precisions) if precisions else 0.0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0

    denominator = avg_precision + avg_recall
    avg_f1 = 2 * (avg_precision * avg_recall) / denominator if denominator > 0 else 0.0

    return avg_precision, avg_recall, avg_f1

In [26]:
# "Related Documents" holds the string form of a list (see the CSV preview above), so it
# must be parsed into real lists before being compared with the retrieved IDs.
evaluate_retrieval(query_df["Related Documents"].apply(ast.literal_eval), retrieved_df["Documents"])

(0.0, 0.0, 0)

In [38]:
# Keep only the queries whose parsed "Related Documents" list has at least one ID.
has_related_docs = query_df["Related Documents"].apply(ast.literal_eval).apply(len) > 0
non_empty = query_df[has_related_docs]
index = non_empty.index
non_empty

Unnamed: 0,text,Related Documents
q00001,What are the service and product offerings fro...,"['MSFT20230014', 'MSFT20230015']"
q00007,How much revenue does Microsoft generate from ...,['MSFT20231529']
q00008,MSFT remaining performance obligation,['MSFT20231529']
q00010,ADBE share repurchase,"['ADBE20231571', 'ADBE20231572', 'ADBE20230728..."
q00019,When did Coupang`s Farfetch consolidation start,['CPNG20230732']
...,...,...
q00197,What factors contributed to the significant in...,"['UNH20230432', 'UNH20230433', 'UNH20230436', ..."
q00200,Primary revenue source of Google Services,['GOOGL20230050']
q00204,Capex guidance Alphabet,['GOOGL20230680']
q00210,Who runs berkshire,['BRK.A20230396']


In [40]:
# Re-ranked results restricted to the queries that have ground-truth related documents.
retrieved_df.loc[index]

Unnamed: 0,Documents
q00001,"[NVDA20230430, MSFT20231710, MSFT20230975]"
q00007,"[AMZN20230018, ADBE20230055, ORCL20230115]"
q00008,"[BRK.A20230905, BRK.A20230426, ADBE20231289]"
q00010,"[GOOGL20231589, TSLA20230886, GOOGL20230698]"
q00019,"[GOOGL20230652, ADBE20231131, ORCL20231909]"
...,...
q00197,"[MSFT20231048, TSLA20230097, MSFT20230059]"
q00200,"[GOOGL20230565, ADBE20230975, TSLA20230841]"
q00204,"[NFLX20230502, DAL20231891, MSFT20231710]"
q00210,"[ORCL20231907, TSLA20230569, AMZN20230531]"


In [46]:
# Spot-check the text of one document that was returned for a labelled query.
document_df.loc["TSLA20230569", "text"]

'Opinions on the Financial Statements and Internal Control over Financial Reporting'