In [28]:
!pip install rank-bm25 -qq

In [1]:
import os
import pickle
import asyncio
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from typing import List, Dict, Any, Tuple, Optional
from tqdm.asyncio import tqdm_asyncio
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

from datasets import load_dataset

from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

from rank_bm25 import BM25Okapi

import warnings
# Pull variables from a .env file into the process environment.
load_dotenv()
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings("ignore")

# Fail fast with a readable message when the key is missing. The previous
# `os.environ[...] = os.getenv(...)` round-trip raised an opaque
# "str expected, not NoneType" TypeError in that case and was a no-op otherwise.
if os.getenv("OPENAI_API_KEY") is None:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file before running this notebook."
    )

## 1. Document Loading

In [3]:
# Read the article corpus and preview its first five rows.
document_df = pd.read_parquet(path="data/document.parquet")
document_df.head(5)

Unnamed: 0,docno,Date,Article_title,Stock_symbol,Url,Article,Textrank_summary
0,0,2023-12-16 22:00:00+00:00,My 6 Largest Portfolio Holdings Heading Into 2...,AAPL,https://www.nasdaq.com/articles/my-6-largest-p...,"After an absolute disaster of a year in 2022, ...",3: Apple There's little question that Apple (N...
1,1,2023-12-16 22:00:00+00:00,Brokers Suggest Investing in Apple (AAPL): Rea...,AAPL,https://www.nasdaq.com/articles/brokers-sugges...,"When deciding whether to buy, sell, or hold a ...",Let's take a look at what these Wall Street he...
2,2,2023-12-16 21:00:00+00:00,"Company News for Dec 19, 2023",AAPL,https://www.nasdaq.com/articles/company-news-f...,Shares of Apple Inc. AAPL lost 0.9% on China’s...,Click to get this free report Apple Inc. (AAPL...
3,3,2023-12-16 21:00:00+00:00,NVIDIA (NVDA) Up 243% YTD: Will It Carry Momen...,AAPL,https://www.nasdaq.com/articles/nvidia-nvda-up...,NVIDIA Corporation NVDA has witnessed a remark...,Other Stocks in the $1T Club Apart from NVIDIA...
4,4,2023-12-16 21:00:00+00:00,"Pre-Market Most Active for Dec 19, 2023 : BMY,...",AAPL,https://www.nasdaq.com/articles/pre-market-mos...,The NASDAQ 100 Pre-Market Indicator is up 10.1...,"Apple Inc. (AAPL) is +0.86 at $196.75, with 1,..."


## 2. Query Generator

In [3]:
# Prompt text for LLM query generation. It carries three placeholders:
#   {document}            - the article text the queries are generated from
#   {num_queries}         - how many queries to ask for
#   {format_instructions} - output-format instructions obtained from the parser
# QueryGenerator wraps this text in a PromptTemplate and fills the placeholders.
# The wording is part of runtime behavior: editing it changes what the model returns.
QUERY_PROMPT_TEMPLATE = """
You are a query generation assistant for a financial news retrieval system. Your task is to generate natural and conversational search queries based on a given financial news article. These queries should mimic the style of queries humans would type into Google Search to find information related to the article.

Input:
- Document: {document}
- Number of Queries: {num_queries}

Instructions:
1. Focus on creating queries that are concise, natural, and written in a conversational style.
2. Reflect the key topics, entities (e.g., companies, individuals, financial instruments), or events in the article with the corresponding date in document.
3. Ensure the queries are relevant and diverse by covering different aspects of the article.
4. Avoid making the queries too specific by using niche or overly professional terms.
5. Periodically, use human keyword like "Best", "Top" to make the queries more engaging.
6. Avoid using word "today". If you create query that require timedate event, attend to the date in the document but make it wisely (either monthly for general event or explicit date for specific event)

Format each query as a standalone search phrase that feels natural for a human to type into Google.

Output:
{format_instructions}
"""

# Schema for the JSON reply expected from the LLM: a single `queries` field
# holding the generated query strings. It is handed to JsonOutputParser as
# `pydantic_object` when the QueryGenerator is built.
# NOTE(review): documented with comments rather than a class docstring on purpose;
# pydantic appears to copy a model docstring into its JSON schema, which would
# change the format instructions sent to the model - confirm before adding one.
class QueryResponse(BaseModel):
    queries: List[str] = Field(description="a list of generated queries from response")

class QueryGenerator:
    """Generate search queries for a document with an LLM chain.

    The chain is ``PromptTemplate | model | JsonOutputParser``. It is assembled
    once in ``__init__`` because nothing in it varies between calls; each call
    then only pays for the model invocation. Reassigning ``prompt``, ``model``
    or ``parser`` after construction therefore does not affect the chain.
    """

    def __init__(self, model: ChatOpenAI, parser: type, prompt: str):
        """Store the components and build the reusable chain.

        Args:
            model: Chat model that produces the raw completion.
            parser: Pydantic model class describing the expected JSON output;
                it is passed to ``JsonOutputParser`` as ``pydantic_object``.
            prompt: Template text with ``{document}``, ``{num_queries}`` and
                ``{format_instructions}`` placeholders.
        """
        self.model = model
        self.prompt = prompt
        self.parser = JsonOutputParser(pydantic_object=parser)

        prompt_template = PromptTemplate(
            template=self.prompt,
            input_variables=["document", "num_queries"],
            partial_variables={"format_instructions": self.parser.get_format_instructions()},
        )
        self._chain = prompt_template | self.model | self.parser

    def __call(self, document: str, num_queries: int = 5) -> Dict[str, Any]:
        """Run the chain for one document and return the parsed JSON payload."""
        return self._chain.invoke({"document": document, "num_queries": num_queries})

    def generate_queries(self, docno: int, document: str, num_queries: int = 5) -> Tuple[int, Dict[str, Any]]:
        """Generate queries for ``document`` and pair them with its ``docno``.

        Args:
            docno: Identifier of the document; returned unchanged so results
                can be matched back to their source.
            document: Article text to generate queries from.
            num_queries: Number of queries requested from the model.

        Returns:
            ``(docno, payload)`` where ``payload`` is the parsed JSON dict
            (e.g. ``{"queries": [...]}``), not a ``QueryResponse`` instance.
        """
        return docno, self.__call(document=document, num_queries=num_queries)

# Usage: build the shared generator, then smoke-test it on the first article.
query_generator = QueryGenerator(
    prompt=QUERY_PROMPT_TEMPLATE,
    parser=QueryResponse,
    model=ChatOpenAI(temperature=0.5, model="gpt-4o-mini"),
)

print(document_df["Article"][0])
query_generator.generate_queries(num_queries=3, docno=0, document=document_df["Article"][0])

After an absolute disaster of a year in 2022, the stock market appears to have turned the corner. Each of the major market indexes has gained more than 20% from their respective trough. Perhaps more importantly, the S&P 500 and the Nasdaq Composite are within striking distance of new highs, which will check the final box marking the start of a new bull market.
Closing out the old and ringing in the new is a great time for examination, and one of the places I start is with my portfolio. A review of my top investments and how they came to be that way can offer valuable insight for the future.
Here's a look at my six largest holdings heading into 2024 (as of the market close on Dec. 15) and the incredibly valuable lesson I learned from each one.
Image source: Getty Images.
No. 6: Nvidia
Every investor has one -- the "stock that got away." The one you meant to buy, only to find that it got away from you and has risen 100%, 500%, or even 1,000%. In my case, that stock was Nvidia (NASDAQ: NV

(0,
 {'queries': ['What are the top stocks to invest in for 2024?',
   'Why is Nvidia stock so popular among investors?',
   'How has Apple managed to grow despite smartphone market challenges?']})

In [19]:
async def async_inference(df, num_queries=3):
    """Generate queries for every row of ``df`` concurrently.

    For each row, the ``docno`` and ``Article`` values are submitted to
    ``query_generator.generate_queries`` through the running loop's
    ``run_in_executor`` (executor ``None``), and all submissions are awaited
    together behind a progress bar.

    Args:
        df: Frame with ``docno`` and ``Article`` columns.
        num_queries: Number of queries requested per article.

    Returns:
        The gathered ``generate_queries`` results, one per row.
    """
    event_loop = asyncio.get_running_loop()
    docnos = df["docno"]
    articles = df["Article"]

    pending = []
    for row in range(len(df)):
        future = event_loop.run_in_executor(
            None,
            query_generator.generate_queries,
            docnos.iloc[row],
            articles.iloc[row],
            num_queries,
        )
        pending.append(future)

    return await tqdm_asyncio.gather(*pending)

query_response = await async_inference(document_df, num_queries=3)
query_response

100%|██████████| 18992/18992 [19:57<00:00, 15.86it/s]


[(0,
  {'queries': ['What are the top stocks to invest in for 2024?',
    'How has Nvidia stock performed over the years?',
    'What lessons can investors learn from holding Apple and Netflix stocks?']}),
 (1,
  {'queries': ['What are the latest analyst recommendations for Apple stock?',
    "Is Apple's stock a good investment right now?",
    'How does Zacks Rank compare to average brokerage recommendations for stocks?']}),
 (2,
  {'queries': ['Why did Apple shares drop 0.9% recently?',
    'What caused Prologis shares to fall 1.2%?',
    "What is the impact of Nippon Steel's acquisition on United States Steel Corporation shares?"]}),
 (3,
  {'queries': ["What is NVIDIA's stock price surge in 2023?",
    "How will generative AI impact NVIDIA's growth in 2024?",
    'Which companies are in the $1 trillion market cap club alongside NVIDIA?']}),
 (4,
  {'queries': ['What are the latest pre-market indicators for NASDAQ 100?',
    'Which stocks are most active in pre-market trading right 

In [21]:
# Persist the generated queries so the LLM step does not need to be repeated.
with open("data/query_response.pkl", mode="wb") as pickle_file:
    pickle.dump(query_response, pickle_file)

In [4]:
# Restore the cached query responses from disk.
# NOTE: unpickling can execute arbitrary code - only load files this notebook produced.
with open("data/query_response.pkl", mode="rb") as pickle_file:
    query_response = pickle.load(pickle_file)

## 3. Qrels Matching

In [5]:
def create_gold_qrels(query_response: List[Tuple[int, Dict[str, Any]]], document_df: pd.DataFrame) -> pd.DataFrame:
    """Build the positive (label = 1) qrels table from generated queries.

    Args:
        query_response: ``(docno, payload)`` pairs where ``payload["queries"]``
            is the list of queries generated for that document.
        document_df: Corpus with at least ``docno`` and ``Article`` columns.

    Returns:
        DataFrame with columns ``docno, qid, query, document, label``. ``qid``
        is a running counter over every generated query; rows whose ``docno``
        is dropped by ``drop_duplicates(subset="Article")`` are removed
        afterwards, so surviving qids may have gaps.

    Raises:
        KeyError: if a ``docno`` in ``query_response`` is absent from
            ``document_df``.
    """
    columns = ["docno", "qid", "query", "document", "label"]

    # Resolve articles by their docno *value*. Indexing the Article column with
    # the docno only works while the frame's index happens to equal docno.
    article_by_docno = dict(zip(document_df["docno"], document_df["Article"]))

    records = []
    qid = 0
    for docno, payload in query_response:
        article = article_by_docno[docno]
        for query in payload["queries"]:
            records.append(
                {"docno": docno, "qid": qid, "query": query, "document": article, "label": 1}
            )
            # Incremented for every query, including ones filtered out below.
            qid += 1

    qrels = pd.DataFrame(records, columns=columns)

    # Keep only queries whose source document survives article de-duplication.
    non_dup_docno = document_df.drop_duplicates(subset="Article")["docno"].values
    return qrels[qrels["docno"].isin(non_dup_docno)]

# Positive qrels: every generated query paired with its source article.
gold_qrels_df = create_gold_qrels(query_response, document_df)

# De-duplicated corpus (first occurrence of each article text) and its docnos.
non_duplicate_doc = document_df.drop_duplicates(subset=["Article"])
non_dup_docno = non_duplicate_doc["docno"].values

display(gold_qrels_df.head(5))
gold_qrels_df.shape

Unnamed: 0,docno,qid,query,document,label
0,0,0,What are the top stocks to invest in for 2024?,"After an absolute disaster of a year in 2022, ...",1
1,0,1,How has Nvidia stock performed over the years?,"After an absolute disaster of a year in 2022, ...",1
2,0,2,What lessons can investors learn from holding ...,"After an absolute disaster of a year in 2022, ...",1
3,1,3,What are the latest analyst recommendations fo...,"When deciding whether to buy, sell, or hold a ...",1
4,1,4,Is Apple's stock a good investment right now?,"When deciding whether to buy, sell, or hold a ...",1


(50754, 5)

In [6]:
def select_documents_by_index(doc_scores: pd.DataFrame, 
                              index_percentile: int, 
                              top_n: int = 3) -> pd.DataFrame:
    """Slice ``top_n`` consecutive rows starting at a percentile position.

    The start row is ``int(len(doc_scores) * index_percentile / 100)``, capped
    at the last row, so the slice may hold fewer than ``top_n`` rows near the
    end of the frame.

    Args:
        doc_scores: Frame to slice positionally (row order is taken as given).
        index_percentile: Position in the frame expressed as a percentage.
        top_n: Maximum number of rows to return.

    Returns:
        The positional slice ``doc_scores.iloc[start:start + top_n]``.
    """
    row_count = len(doc_scores)
    start = min(int(row_count * (index_percentile / 100.0)), row_count - 1)
    stop = start + top_n
    return doc_scores.iloc[start:stop]


def match_negative_query(scorer: BM25Okapi,
                         query: str,
                         documents: List[str],
                         index_percentiles: Optional[List[int]] = None,
                         debug=False,
                         qid=None) -> Dict[str, Any]:
    """Pick negative documents for ``query`` from fixed positions of its BM25 ranking.

    The query is whitespace-tokenized and scored against the corpus held by
    ``scorer``. Documents are sorted by descending score, and for each
    requested percentile three consecutive documents starting at that position
    of the ranking are collected.

    Args:
        scorer: BM25 scorer; ``documents`` must be aligned with the corpus it
            was built on so that scores and texts line up.
        query: Query text.
        documents: Document texts, one per score returned by ``scorer``.
        index_percentiles: Positions in the ranking (as percentages) to sample
            from. Defaults to ``[90, 95, 99]``.
        debug: When true, print each selected slice.
        qid: Optional query id echoed back in the result.

    Returns:
        ``{"negative_documents": [...]}``, with an additional leading ``"qid"``
        entry when ``qid`` is given.
    """
    # None instead of a list literal avoids a shared mutable default argument.
    if index_percentiles is None:
        index_percentiles = [90, 95, 99]

    tokenized_query = query.split()
    scores = scorer.get_scores(tokenized_query)

    doc_scores = pd.DataFrame({"document": documents, "score": scores})
    sorted_doc_scores = doc_scores.sort_values(by="score", ascending=False).reset_index(drop=True)

    # Keyed by percentile, so a repeated percentile contributes its slice once.
    selected_docs_by_percentile = {}
    for percentile in index_percentiles:
        selected_docs = select_documents_by_index(sorted_doc_scores, index_percentile=percentile, top_n=3)
        if debug:
            print(selected_docs)
        selected_docs_by_percentile[percentile] = selected_docs["document"].values

    # Flatten once, in percentile insertion order.
    negative_documents = [doc for docs in selected_docs_by_percentile.values() for doc in docs]

    if qid is None:
        return {"negative_documents": negative_documents}

    return {"qid": qid, "negative_documents": negative_documents}

async def bm25_qrels_matcher(gold_qrels_df: pd.DataFrame, documents: List[str]) -> List[Dict[str, Any]]:
    """Select BM25-based negative documents for every query in ``gold_qrels_df``.

    A BM25 index is built over the whitespace-tokenized ``documents``; each
    query is then passed to ``match_negative_query`` via the running loop's
    ``run_in_executor`` with percentiles ``[90, 95, 99]``.

    Args:
        gold_qrels_df: Frame with ``query`` and ``qid`` columns.
        documents: Corpus texts to index and draw negatives from. This argument
            is what gets indexed; the function no longer swaps in the
            module-level ``non_duplicate_doc`` frame.

    Returns:
        A list of ``{"qid": ..., "negative_documents": [...]}`` dicts in
        completion order, which is not necessarily the row order of
        ``gold_qrels_df``; use ``qid`` to match results back.
    """
    tokenized_documents = [doc.split() for doc in documents]
    bm25_scorer = BM25Okapi(tokenized_documents)

    loop = asyncio.get_running_loop()

    tasks = [
        loop.run_in_executor(
            None,
            match_negative_query,
            bm25_scorer,
            gold_qrels_df["query"].iloc[idx],
            documents,
            [90, 95, 99],
            False,
            gold_qrels_df["qid"].iloc[idx],
        )
        for idx in range(len(gold_qrels_df))
    ]

    responses = []
    for response in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing"):
        responses.append(await response)

    return responses


In [7]:
documents = non_duplicate_doc["Article"].values

qrels_response = await bm25_qrels_matcher(gold_qrels_df=gold_qrels_df, documents=documents)

Processing: 100%|██████████| 50754/50754 [1:12:37<00:00, 11.65it/s]


In [10]:
qrels = {"qid": [], "query": [], "docno": [], "document": [], "label": []}

# Build both lookups once. Filtering the DataFrames with a boolean mask inside
# the loop rescans every row for each response and each negative document.
# drop_duplicates keeps the first match, mirroring the previous `.iloc[0]` /
# `.values[0]` selection.
gold_by_qid = (
    gold_qrels_df.drop_duplicates(subset="qid")
    .set_index("qid")[["query", "docno", "document"]]
    .to_dict(orient="index")
)
unique_articles = non_duplicate_doc.drop_duplicates(subset="Article")
docno_by_article = dict(zip(unique_articles["Article"], unique_articles["docno"]))

for response in tqdm(qrels_response, total=len(qrels_response)):
    qid = response["qid"]

    # Positive row: the query paired with the article it was generated from.
    gold_qrel = gold_by_qid[qid]
    qrels["qid"].append(qid)
    qrels["query"].append(gold_qrel["query"])
    qrels["docno"].append(gold_qrel["docno"])
    qrels["document"].append(gold_qrel["document"])
    qrels["label"].append(1)

    # Negative rows: the BM25-selected documents for the same query.
    for doc in response["negative_documents"]:
        docno = docno_by_article[doc]

        qrels["qid"].append(qid)
        qrels["query"].append(gold_qrel["query"])
        qrels["docno"].append(docno)
        qrels["document"].append(doc)
        qrels["label"].append(0)

qrels_df = pd.DataFrame(qrels)

100%|██████████| 50754/50754 [13:08<00:00, 64.34it/s] 


In [11]:
# Inspect the combined qrels table: per qid, one label-1 row followed by its label-0 negatives.
qrels_df

Unnamed: 0,qid,query,docno,document,label
0,1,How has Nvidia stock performed over the years?,0,"After an absolute disaster of a year in 2022, ...",1
1,1,How has Nvidia stock performed over the years?,13854,Looking today at week-over-week shares outstan...,0
2,1,How has Nvidia stock performed over the years?,13917,Looking today at week-over-week shares outstan...,0
3,1,How has Nvidia stock performed over the years?,1304,Looking today at week-over-week shares outstan...,0
4,1,How has Nvidia stock performed over the years?,18893,Shares of Elbit Systems Ltd. (ESLT) declined 2...,0
...,...,...,...,...,...
507535,56967,What is Wedbush's latest recommendation for Zs...,13507,Yelp Inc. YELP shares gained 4.5% during Thurs...,0
507536,56967,What is Wedbush's latest recommendation for Zs...,9921,Nordson Corp.’s (NDSN) shares surged 3.9% afte...,0
507537,56967,What is Wedbush's latest recommendation for Zs...,11524,Nov 20 (Reuters) - Wall Street's main indexes ...,0
507538,56967,What is Wedbush's latest recommendation for Zs...,16031,Consumer stocks were gaining late Thursday aft...,0


## 4. Dataset Splitting

In [None]:
# Seed NumPy's global RNG so the sampled splits are repeatable.
# The three np.random.choice calls below consume that RNG in order, so
# reordering them changes which qids end up in which split.
np.random.seed(42)

# Number of queries (distinct qids) drawn for each split.
TRAIN_SIZE = 6000
VAL_SIZE = 2000
TEST_SIZE = 2000

qids = qrels_df["qid"].unique()

# Draw each split without replacement, removing already-chosen qids from the
# pool before the next draw so the three splits are disjoint.
train_qids = np.random.choice(qids, TRAIN_SIZE, replace=False)
remaining_qids = np.setdiff1d(qids, train_qids)

val_qids = np.random.choice(remaining_qids, VAL_SIZE, replace=False)
remaining_qids = np.setdiff1d(remaining_qids, val_qids)

test_qids = np.random.choice(remaining_qids, TEST_SIZE, replace=False)

# Each split keeps every row (positive and negatives) of its selected qids.
# NOTE(review): the split is by qid, and several queries are generated per
# article, so queries from the same article can land in different splits -
# confirm this is acceptable for evaluation.
train_qrels_df = qrels_df[qrels_df["qid"].isin(train_qids)]
val_qrels_df = qrels_df[qrels_df["qid"].isin(val_qids)]
test_qrels_df = qrels_df[qrels_df["qid"].isin(test_qids)]

train_qrels_df.shape, val_qrels_df.shape, test_qrels_df.shape

((60000, 5), (20000, 5), (20000, 5))

In [None]:
# Write each split to its own parquet file, without the DataFrame index.
split_outputs = {
    "data/train_qrels.parquet": train_qrels_df,
    "data/val_qrels.parquet": val_qrels_df,
    "data/test_qrels.parquet": test_qrels_df,
}
for parquet_path, split_frame in split_outputs.items():
    split_frame.to_parquet(parquet_path, index=False)