Loading the data from the `data/` directory (relative to the notebook)

In [2]:
import pandas as pd
import os

Stock Price data

In [3]:
# Read the HPE price history CSV (path is relative to the notebook's working
# directory) and preview the first rows.
hpe_stock = pd.read_csv("data/HPE_stock.csv")
hpe_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Company
0,2023-01-03,16.09,16.139999,15.83,16.059999,15.367215,18233900,Hewlett Packard Enterprise
1,2023-01-04,16.15,16.525,16.120001,16.42,15.711686,15865500,Hewlett Packard Enterprise
2,2023-01-05,16.290001,16.495001,16.184999,16.450001,15.740394,13099500,Hewlett Packard Enterprise
3,2023-01-06,16.68,17.209999,16.67,17.110001,16.371922,15242500,Hewlett Packard Enterprise
4,2023-01-09,17.1,17.25,16.889999,16.959999,16.228392,19294700,Hewlett Packard Enterprise


In [4]:
import numpy as np
from copy import deepcopy

# Work on a copy so the raw frame loaded from disk is never mutated.
df = hpe_stock.copy()

# The 20-day rolling window over the close is shared by the SMA and the
# Bollinger Bands, so compute its mean and standard deviation once.
close = df['Close']
rolling_mean_20 = close.rolling(window=20).mean()
rolling_std_20 = close.rolling(window=20).std()

# Simple Moving Average (SMA)
df['SMA_20'] = rolling_mean_20

# Exponential Moving Average (EMA)
df['EMA_12'] = close.ewm(span=12, adjust=False).mean()
df['EMA_26'] = close.ewm(span=26, adjust=False).mean()

# Relative Strength Index (RSI) over a 14-row window.
# np.where maps the leading NaN of `delta` to 0, so the first full window
# already yields a value.
delta = close.diff(1)
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)
# Re-attach df's index: a bare pd.Series(gain) gets a fresh 0..n-1 index and
# would silently misalign on assignment if df's index were not the default.
avg_gain = pd.Series(gain, index=df.index).rolling(window=14).mean()
avg_loss = pd.Series(loss, index=df.index).rolling(window=14).mean()
rs = avg_gain / avg_loss
df['RSI'] = 100 - (100 / (1 + rs))

# MACD and its 9-period signal line
df['MACD'] = df['EMA_12'] - df['EMA_26']
df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

# Bollinger Bands: middle band is the 20-day mean, outer bands are +/- 2 std.
df['Middle_Band'] = rolling_mean_20
df['Upper_Band'] = df['Middle_Band'] + 2*rolling_std_20
df['Lower_Band'] = df['Middle_Band'] - 2*rolling_std_20

# On-Balance Volume (OBV): running sum of volume signed by the close's direction.
df['OBV'] = (np.sign(close.diff()) * df['Volume']).fillna(0).cumsum()

# Volume Moving Average
df['Volume_MA'] = df['Volume'].rolling(window=20).mean()

# Normalize selected indicators
df['SMA_20_norm'] = df['SMA_20'] / close
df['RSI_norm'] = df['RSI'] / 100
df['MACD_norm'] = df['MACD'] / close

# Feature matrix, one row per trading day; warm-up NaNs become 0.
# Despite the name, this is an ordinary dense (n_rows, 4) NumPy array.
# NOTE(review): OBV is left in raw volume units while the other three features
# are ratios, so OBV will dominate any distance computed on these rows.
# Consider scaling it before indexing.
sparse_vectors = df[['SMA_20_norm', 'RSI_norm', 'MACD_norm', 'OBV']].fillna(0).values

print("Sparse Vectors:\n", sparse_vectors.shape)


Sparse Vectors:
 (333, 4)


In [5]:
# Shape of the indicator feature matrix: (rows, features).
sparse_vectors.shape

(333, 4)

In [6]:
import numpy as np
from scipy.sparse import csr_matrix

# Wrap the indicator array in SciPy's CSR container; only non-zero entries are stored.
# NOTE(review): `sparse_matrix` is not referenced by any later cell visible here;
# the Qdrant upload uses `sparse_vectors` directly.
sparse_matrix = csr_matrix(sparse_vectors)

# Show sparse matrix
print(sparse_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1298 stored elements and shape (333, 4)>
  Coords	Values
  (1, 2)	0.0017489645111771653
  (1, 3)	15865500.0
  (2, 2)	0.00323913659553549
  (2, 3)	28965000.0
  (3, 2)	0.007280703900160117
  (3, 3)	44207500.0
  (4, 2)	0.009849088042238139
  (4, 3)	24912800.0
  (5, 2)	0.01209768869051712
  (5, 3)	41690400.0
  (6, 2)	0.013181281356237924
  (6, 3)	25687600.0
  (7, 2)	0.011410792678221388
  (7, 3)	3241800.0
  (8, 2)	0.010075456988319614
  (8, 3)	15633900.0
  (9, 2)	0.008113857584141853
  (9, 3)	1444400.0
  (10, 2)	0.005611536845850715
  (10, 3)	-12185000.0
  (11, 2)	0.002217655343961456
  (11, 3)	-24718300.0
  (12, 2)	0.00016322217039208827
  (12, 3)	-12004600.0
  (13, 1)	0.5173502327022645
  :	:
  (326, 3)	-52407400.0
  (327, 0)	1.043672033724372
  (327, 1)	0.13440780824755522
  (327, 2)	-0.004151766632197229
  (327, 3)	-52407400.0
  (328, 0)	1.035262842325699
  (328, 1)	0.20467780812933525
  (328, 2)	-0.005118413324478753
  (328

Set up Qdrant for the sparse (indicator) vector store

In [7]:
import qdrant_client
from qdrant_client.models import VectorParams, Distance
from qdrant_client.http.models import PointStruct, VectorParams
from qdrant_client import QdrantClient



# Initialize Qdrant client (expects a Qdrant server reachable on localhost:6333;
# pass api_key=... as well when connecting to a secured instance).
client = QdrantClient(url="http://localhost:6333")

# Start from an empty "financial_data" collection on every run.
# recreate_collection() is deprecated in qdrant-client, so drop and create
# explicitly instead.
if client.collection_exists("financial_data"):
    client.delete_collection(collection_name="financial_data")
client.create_collection(
    collection_name="financial_data",
    vectors_config=VectorParams(
        # One dimension per feature column: SMA_20_norm, RSI_norm, MACD_norm, OBV
        size=sparse_vectors.shape[1],
        distance=Distance.COSINE,  # Distance metric (can be COSINE, EUCLID, etc.)
    ),
)

# Build every point up front and send them in one request instead of one
# HTTP round-trip per row. Ids are 1-based row numbers.
stock_points = [
    PointStruct(
        id=row_number,
        vector=vector.tolist(),
        payload={
            "date": date,
            # RSI is NaN during the indicator warm-up rows; NaN is not valid
            # JSON, so store an explicit null instead.
            "RSI": None if pd.isna(rsi) else rsi,
        },
    )
    for row_number, (vector, date, rsi) in enumerate(
        zip(sparse_vectors, df['Date'].tolist(), df['RSI_norm'].tolist()),
        start=1,
    )
]
client.upsert(collection_name="financial_data", points=stock_points);

  from .autonotebook import tqdm as notebook_tqdm
  client.recreate_collection(


Financial News Data

In [8]:
# Read the news articles CSV (path is relative to the notebook's working
# directory) and preview the first rows.
hpe_news = pd.read_csv("data/hpe_financial_news.csv")
hpe_news.head()


Unnamed: 0,title,date,text,source
0,"The stock market should be great, not on a kni...",2024-08-11,"When I look at Thursday's rally, I am beginnin...",https://www.cnbc.com/technology/


In [9]:
## Creating Dense Vectors for News Articles
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources (a no-op once they are already on disk).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Sentence-embedding model used for every dense vector in this notebook.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# One text per news row, in frame order.
news_articles = hpe_news["text"].tolist()

# Encode all articles in one call; the result has one row per article.
dense_vectors = model.encode(news_articles)

dense_vectors.shape


[nltk_data] Downloading package punkt to /home/saranath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saranath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(1, 384)

In [11]:
from qdrant_client import QdrantClient
# Start from an empty "news_sentiment" collection on every run.
# recreate_collection() is deprecated in qdrant-client, so drop and create
# explicitly instead.
if client.collection_exists("news_sentiment"):
    client.delete_collection(collection_name="news_sentiment")
client.create_collection(
    collection_name="news_sentiment",
    vectors_config=VectorParams(
        size=dense_vectors.shape[1],  # Width of the sentence embeddings
        distance=Distance.COSINE,  # Distance metric (can be COSINE, EUCLID, etc.)
    ),
)

# Build every point up front and send them in one request instead of one
# HTTP round-trip per article. Ids are 1-based row numbers.
news_points = [
    PointStruct(
        id=row_number,
        vector=vector.tolist(),
        payload={"title": title, "date": date},
    )
    for row_number, (vector, title, date) in enumerate(
        zip(dense_vectors, hpe_news["title"].tolist(), hpe_news["date"].tolist()),
        start=1,
    )
]
client.upsert(collection_name="news_sentiment", points=news_points);

  client.recreate_collection(


You can query the sparse vectors to retrieve real-time market indicators.



In [12]:
# Reminder of the indicator matrix shape; query vectors must have the same width.
sparse_vectors.shape

(333, 4)

In [13]:
# Probe vector in the stored feature order (SMA_20_norm, RSI_norm, MACD_norm, OBV).
query_vector = np.array([1, 1, 0, 0])
# Fetch the 4 nearest stored rows from the indicator collection.
results_sparse = client.search(
    collection_name="financial_data",
    query_vector=query_vector.tolist(),
    limit=4,  
)

print("Market Indicators Results:", results_sparse)

Market Indicators Results: [ScoredPoint(id=309, version=308, score=4.977789e-07, payload={'date': '2024-03-26', 'RSI': 0.3532608160449554}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=313, version=312, score=4.7790604e-07, payload={'date': '2024-04-02', 'RSI': 0.448071426550075}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=22, version=21, score=4.209826e-07, payload={'date': '2023-02-02', 'RSI': 0.5235602747783329}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=311, version=310, score=4.1530433e-07, payload={'date': '2024-03-28', 'RSI': 0.4563757659002917}, vector=None, shard_key=None, order_value=None)]


Similarly, dense vectors can be queried to analyze financial news sentiment.

In [14]:
# Embed the free-text question with the same model used for the articles.
# Note: this rebinds `query_vector`, which previously held the indicator probe.
query_vector = model.encode("HPE performance in Q3")

# Fetch up to 4 nearest articles from the news collection.
results_dense = client.search(
    collection_name="news_sentiment",
    query_vector=query_vector.tolist(),
    limit=4
)

print("News Sentiment Results:", results_dense)


News Sentiment Results: [ScoredPoint(id=1, version=0, score=0.05374437, payload={'title': "The stock market should be great, not on a knife's edge. But that's the opportunity", 'date': '2024-08-11'}, vector=None, shard_key=None, order_value=None)]


In [15]:
# Bundle the two result sets under fixed keys (no re-ranking or scoring happens here)
def combine_results(results_sparse,results_dense):
    """Return both search results in one dict.

    Parameters:
        results_sparse: Hits from the market-indicator query.
        results_dense: Hits from the news query.

    Returns:
        dict: ``{"market_indicators": results_sparse, "news_sentiment": results_dense}``.
              The inputs are stored as-is, not copied.
    """
    return dict(
        market_indicators=results_sparse,
        news_sentiment=results_dense,
    )

def analyze_combined_results(combined_results):
    """
    Summarize the combined results from the indicator and news queries.

    Parameters:
        combined_results (dict): Search results keyed by source.
                                 Example structure:
                                 {
                                     "market_indicators": [ ... ],
                                     "news_sentiment": [ ... ]
                                 }
                                 Each hit must expose ``payload`` (a dict),
                                 ``score``, ``id`` and ``vector`` attributes.
                                 A missing or ``None`` entry is treated as
                                 "no hits".

    Returns:
        dict: ``market_insights`` and ``news_insights`` (one flat dict per hit,
              in input order) plus ``combined_summary``, a sentence describing
              the first hit of each list. If a list is empty, its part of the
              summary says that no results were returned instead of raising.
    """

    # Extract results; `or []` also covers a key explicitly set to None.
    market_results = combined_results.get('market_indicators') or []
    news_results = combined_results.get('news_sentiment') or []

    # Flatten market-indicator hits
    market_insights = [
        {
            "date": hit.payload.get('date', "N/A"),
            "score": hit.score,  # Relevance score
            "id": hit.id,
            "indicator_vector": hit.vector,  # None unless the search requested vectors
            "RSI": hit.payload.get("RSI", "N/A"),
        }
        for hit in market_results
    ]

    # Flatten news hits
    news_insights = [
        {
            "headline": hit.payload.get('title', 'N/A'),
            "score": hit.score,  # Relevance score
            "id": hit.id,
            "date": hit.payload.get('date', 'N/A'),
            "sentiment_vector": hit.vector,  # None unless the search requested vectors
        }
        for hit in news_results
    ]

    # Build each sentence separately so an empty result list cannot raise
    # IndexError. Values are pulled into locals first: reusing the enclosing
    # quote character inside an f-string is a SyntaxError before Python 3.12.
    if market_insights:
        top_market = market_insights[0]
        market_date = top_market['date']
        market_score = top_market['score']
        market_rsi = top_market['RSI']
        market_sentence = (
            f"Top market indicator on {market_date} with relevance score {market_score}"
            f" with an RSI of about {market_rsi}."
        )
    else:
        market_sentence = "No market indicator results were returned."

    if news_insights:
        top_news = news_insights[0]
        news_headline = top_news['headline']
        news_date = top_news['date']
        news_score = top_news['score']
        news_sentence = (
            f"Associated news headline: '{news_headline}' dated {news_date}"
            f" with sentiment score {news_score}."
        )
    else:
        news_sentence = "No associated news results were returned."

    # Combine insights for a summary
    analysis_summary = {
        "market_insights": market_insights,
        "news_insights": news_insights,
        "combined_summary": f"{market_sentence} {news_sentence}",
    }

    return analysis_summary




Building a Real-Time Trading Signal Generator Using Qdrant

This system can generate real-time trading signals by querying both sparse and dense vectors.

In [16]:
# Query both collections and summarize the top hits
def generate_trading_signal(stock_query, sentiment_query):
    """Search indicators and news, then summarize the best match of each.

    Parameters:
        stock_query: Array-like with ``.tolist()``; searched against "financial_data".
        sentiment_query: Array-like with ``.tolist()``; searched against "news_sentiment".

    Returns:
        dict: The summary produced by ``analyze_combined_results``.
    """
    # Up to five hits per collection; the indicator search runs first.
    hits_by_source = combine_results(
        client.search(collection_name="financial_data", query_vector=stock_query.tolist(), limit=5),
        client.search(collection_name="news_sentiment", query_vector=sentiment_query.tolist(), limit=5),
    )
    return analyze_combined_results(hits_by_source)

# Example use
# Indicator probe in feature order (SMA_20_norm, RSI_norm, MACD_norm, OBV).
stock_query = np.array([1, 1, 0, 0])
# Free-text news query, embedded with the same model as the stored articles.
sentiment_query = model.encode("HPE stock rises after Q3")

signal = generate_trading_signal(stock_query, sentiment_query)
print("Generated Trading Signal:", signal.get("combined_summary"))


Generated Trading Signal: Top market indicator on 2024-03-26 with relevance score 4.977789e-07 with an RSI of about 0.3532608160449554. Associated news headline: 'The stock market should be great, not on a knife's edge. But that's the opportunity' dated 2024-08-11 with sentiment score -0.043091506.


In [17]:
# Show the summary sentence on its own (rendered as the cell's output value).
signal.get("combined_summary")

"Top market indicator on 2024-03-26 with relevance score 4.977789e-07 with an RSI of about 0.3532608160449554. Associated news headline: 'The stock market should be great, not on a knife's edge. But that's the opportunity' dated 2024-08-11 with sentiment score -0.043091506."

Hybrid Search in Financial Analytics
Using Qdrant’s Hybrid Search Capabilities
Qdrant supports hybrid search, which combines both sparse and dense vectors for nuanced insights.

### Hybrid Search by Combining Sparse and Dense Together through Qdrant

In [18]:
from tqdm import tqdm

# Register the dense embedding model that client.add() applies to raw documents.
client.set_model("sentence-transformers/all-MiniLM-L6-v2")
# comment this line to use dense vectors only
client.set_sparse_model("prithivida/Splade_PP_en_v1")

# Create the collection only once; re-running the cell reuses the existing one.
if not client.collection_exists("hybrid_search"):
    client.create_collection(
        collection_name="hybrid_search",
        vectors_config=client.get_fastembed_vector_params(),
        # comment this line to use dense vectors only
        sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
    )

documents = hpe_news["text"].tolist()
# One payload dict per document, in the same order as `documents`.
# list(DataFrame) would only yield the column names, so build row records.
metadata = hpe_news[["title", "text", "date"]].to_dict(orient="records")

client.add(
    collection_name="hybrid_search",
    documents=documents,
    metadata=metadata,  # stored as each point's payload alongside the document
    parallel=0,  # Use all available CPU cores to encode data.
    # Requires wrapping code into if __name__ == '__main__' block
    ids=tqdm(range(len(documents))),  # ids are the row positions; tqdm shows progress
)


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 65948.18it/s]


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 80043.97it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 88115.63it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 51150.05it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 13197.94it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 16045.54it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 16513.01it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 23224.27it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 80350.65it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 78545.02it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22381.56it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 29959.31it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 36535.75it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 13460.54it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 13824.34it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 17909.07

[0]

In [20]:
from fastapi import FastAPI

# The file where HybridSearcher is stored
from hybrid_search import HybridSearcher

# NOTE(review): `app` is created but no route is registered on it in this cell;
# `search_db` below is called directly as a plain function.
app = FastAPI()

# Create a neural searcher instance
# (this HybridSearcher comes from the external `hybrid_search` module)
hybrid_searcher = HybridSearcher(collection_name="hybrid_search")

def search_db(q: str):
    """Run a text query through the module-level `hybrid_searcher` and wrap the hits under "result"."""
    return {"result": hybrid_searcher.search(text=q)}

search_db("HPE is performing well")["result"]

[{'document': "Apple will overhaul its iPad family next year to make it less confusing for customers and to recharge slumping sales, according to Bloomberg. New versions of the iPad Pro and iPad Air are reportedly coming as soon as March.\n\nApple didn't release new iPads in 2023, and sales fell 3% from fiscal 2022 to fiscal 2023, after declining 8% the year before. IPad revenue in the company's fiscal fourth quarter fell 10% year over year after Apple warned of double-digit declines in its iPad and Mac segments.\n\nApple reportedly wants to make it easier to differentiate between the iPad models and also plans big upgrades. Apple currently sells two sizes of the iPad Pro, the iPad Air, the 10th generation iPad, the ninth generation iPad and the iPad mini. Accessories, such as keyboards and Apple Pencils, as well as chargers, vary depending on which model you're buying. It makes choosing an iPad more confusing than with any of Apple's other products.\n\nThe plan is to make the iPad Pro

### Hybrid Search by Individual Collections from Sparse and Dense

In [21]:
from qdrant_client import QdrantClient


class HybridSearcher:
    """Search one Qdrant collection, or a (news, indicators) pair of collections.

    Parameters:
        collection_name: Either a single collection name (str), or a sequence
            of two names ordered ``[news_collection, indicator_collection]``.
        qdrant_client: Optional client to use. Defaults to the notebook-level
            ``client`` so existing ``HybridSearcher(collection_name=...)``
            calls keep working.
    """

    def __init__(self, collection_name, qdrant_client=None):
        self.collection_name = collection_name
        # Fall back to the notebook-level client only when none is injected.
        self.qdrant_client = client if qdrant_client is None else qdrant_client

    def search(self, vector):
        """Return the top-5 hits for ``vector``.

        Parameters:
            vector: For a single collection, an array-like with ``.tolist()``.
                For a collection pair, a sequence of two such arrays in the
                same order as ``collection_name``.

        Returns:
            Single collection: list of similarity scores, best first.
            Collection pair: tuple ``(indicators_df, news_df)``, where
            ``indicators_df`` has columns ``date``/``RSI`` (from the second
            collection) and ``news_df`` has ``date``/``title`` (from the first).
        """
        if isinstance(self.collection_name, str):
            # Any single collection yields plain scores. The original only
            # handled two hard-coded names and crashed for every other one.
            hits = self.qdrant_client.search(
                collection_name=self.collection_name,
                query_vector=vector.tolist(),
                limit=5,
            )
            return [hit.score for hit in hits]

        # Pair mode: element 0 is the news collection, element 1 the indicators.
        news_hits = self.qdrant_client.search(
            collection_name=self.collection_name[0],
            query_vector=vector[0].tolist(),
            limit=5,
        )
        indicator_hits = self.qdrant_client.search(
            collection_name=self.collection_name[1],
            query_vector=vector[1].tolist(),
            limit=5,
        )

        # Keep only the payload fields of interest from each collection.
        indicator_rows = [[hit.payload.get("date"), hit.payload.get("RSI")] for hit in indicator_hits]
        news_rows = [[hit.payload.get("date"), hit.payload.get("title")] for hit in news_hits]

        return (
            pd.DataFrame(indicator_rows, columns=["date", "RSI"]),
            pd.DataFrame(news_rows, columns=["date", "title"]),
        )

# Query both collections at once. The list order matters: news collection
# first, indicator collection second, and `vector` must follow the same order.
# Note: this rebinds `hybrid_searcher` to the class defined in this cell.
hybrid_searcher = HybridSearcher(collection_name=["news_sentiment","financial_data"])
dense_vector = model.encode("Apple is performing well")
sparse_vector = np.array([1,1,0,0])
vector = [dense_vector,sparse_vector]
# Result is a tuple of two DataFrames: (date/RSI rows, date/title rows).
metadata = hybrid_searcher.search(vector)
metadata

(         date       RSI
 0  2024-03-26  0.353261
 1  2024-04-02  0.448071
 2  2023-02-02  0.523560
 3  2024-03-28  0.456376
 4  2023-02-15  0.583733,
          date                                              title
 0  2024-08-11  The stock market should be great, not on a kni...)

In [23]:
# Dense query result: similarity scores of the nearest news articles.
hybrid_searcher = HybridSearcher(collection_name="news_sentiment")
dense_vector = model.encode("Apple is performing well")
dense_results = hybrid_searcher.search(dense_vector)
print(dense_results)

# Indicator query result: similarity scores of the nearest indicator rows.
hybrid_searcher = HybridSearcher(collection_name="financial_data")
sparse_vector = np.array([1,1,0,0])
sparse_results = hybrid_searcher.search(sparse_vector)
print(sparse_results)


[0.08641739]
[4.977789e-07, 4.7790604e-07, 4.209826e-07, 4.1530433e-07, 3.4968178e-07]


Implement advanced techniques like Reciprocal Rank Fusion (RRF) for combining results:



In [24]:
def reciprocal_rank_fusion(results):
    """Return the sum of 1/rank for rank = 1..N, where N is the number of items in ``results``.

    Parameters:
        results: Any iterable; only how many items it yields matters.

    Returns:
        ``0`` for an empty iterable, otherwise ``1/1 + 1/2 + ... + 1/N`` as a float.

    NOTE(review): the items themselves are never inspected, so this does not
    fuse rankings per document as Reciprocal Rank Fusion normally does
    (a per-document sum of 1 / (k + rank) across the result lists). A true RRF
    would have to return per-document scores, which changes the return type.
    Confirm the intended behavior before relying on this value.
    """
    return sum(1 / position for position, _ in enumerate(results, start=1))

# Two result lists are passed, so the value printed is 1/1 + 1/2.
rrf_score = reciprocal_rank_fusion([sparse_results, dense_results])
print(rrf_score)

1.5
