Loading the data from /data directory

In [1]:
import pandas as pd
import os

Stock Price data

In [2]:
apple_stock = pd.read_csv("data/apple_stock.csv")
msft_stock = pd.read_csv("data/msft_stock.csv")
apple_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Company
0,2023-01-03,130.279999,130.899994,124.169998,125.07,123.904617,112117500,Apple
1,2023-01-04,126.889999,128.660004,125.080002,126.360001,125.18261,89113600,Apple
2,2023-01-05,127.129997,127.769997,124.760002,125.019997,123.855103,80962700,Apple
3,2023-01-06,126.010002,130.289993,124.889999,129.619995,128.412231,87754700,Apple
4,2023-01-09,130.470001,133.410004,129.889999,130.149994,128.937286,70790800,Apple


In [3]:
import numpy as np
from copy import deepcopy

df = deepcopy(apple_stock)

# Simple Moving Average (SMA)
df['SMA_20'] = df['Close'].rolling(window=20).mean()

# Exponential Moving Average (EMA)
df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()

# Relative Strength Index (RSI)
delta = df['Close'].diff(1)
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)
avg_gain = pd.Series(gain).rolling(window=14).mean()
avg_loss = pd.Series(loss).rolling(window=14).mean()
rs = avg_gain / avg_loss
df['RSI'] = 100 - (100 / (1 + rs))

# MACD
df['MACD'] = df['EMA_12'] - df['EMA_26']
df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

# Bollinger Bands
df['Middle_Band'] = df['Close'].rolling(window=20).mean()
df['Upper_Band'] = df['Middle_Band'] + 2*df['Close'].rolling(window=20).std()
df['Lower_Band'] = df['Middle_Band'] - 2*df['Close'].rolling(window=20).std()

# On-Balance Volume (OBV)
df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()

# Volume Moving Average
df['Volume_MA'] = df['Volume'].rolling(window=20).mean()
# Normalize selected indicators
df['SMA_20_norm'] = df['SMA_20'] / df['Close']
df['RSI_norm'] = df['RSI'] / 100
df['MACD_norm'] = df['MACD'] / df['Close']

# Create sparse vectors using selected indicators
sparse_vectors = df[['SMA_20_norm', 'RSI_norm', 'MACD_norm', 'OBV']].fillna(0).values

print("Sparse Vectors:\n", sparse_vectors.shape)


Sparse Vectors:
 (333, 4)


In [4]:
sparse_vectors.shape

(333, 4)

In [5]:
import numpy as np
from scipy.sparse import csr_matrix

sparse_matrix = csr_matrix(sparse_vectors)

# Show sparse matrix
print(sparse_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1298 stored elements and shape (333, 4)>
  Coords	Values
  (1, 2)	0.0008143879031531835
  (1, 3)	89113600.0
  (2, 2)	0.0006036061334825662
  (2, 3)	8150900.0
  (3, 2)	0.003240651325841453
  (3, 3)	95905600.0
  (4, 2)	0.005589885737773369
  (4, 3)	166696400.0
  (5, 2)	0.007698281297126299
  (5, 3)	230592600.0
  (6, 2)	0.010739299080735329
  (6, 3)	300051500.0
  (7, 2)	0.013084215264993454
  (7, 3)	228671900.0
  (8, 2)	0.015418448479918965
  (8, 3)	286481600.0
  (9, 2)	0.017717620003478856
  (9, 3)	350128200.0
  (10, 2)	0.019096084414488183
  (10, 3)	280455400.0
  (11, 2)	0.01991007667189441
  (11, 3)	338735800.0
  (12, 2)	0.02144858170240243
  (12, 3)	418959400.0
  (13, 1)	0.8942968974176718
  :	:
  (326, 3)	542428400.0
  (327, 0)	1.0167046445049432
  (327, 1)	0.45098037082409465
  (327, 2)	-0.012230970254624383
  (327, 3)	591966200.0
  (328, 0)	1.0037480449169194
  (328, 1)	0.5034082629378515
  (328, 2)	-0.010991507458494849

Setup Qdrant for Sparse Vector store

In [6]:
import qdrant_client
from qdrant_client.models import VectorParams, Distance
from qdrant_client.http.models import PointStruct, VectorParams
from qdrant_client import QdrantClient



# Initialize Qdrant client
client = QdrantClient(url="http://localhost:6333")#, api_key=qdrant_api)

# Create a collection named "financial_data"
client.recreate_collection(
    collection_name="financial_data",
    vectors_config=VectorParams(
        size=sparse_vectors.shape[1],  # Dimensionality of the vector (SMA_20_norm, RSI_norm, MACD_norm)
        distance=Distance.COSINE  # Distance metric (can be COSINE, EUCLID, etc.)
    )
)

for i, vector in enumerate(sparse_vectors):
    client.upsert(
        collection_name="financial_data",
        points=[
            {
                "id": i+1,
                "vector": vector.tolist(),
                "payload": {"date": df['Date'].iloc[i],
                            "RSI" : df["RSI_norm"].iloc[i]}
            }
        ]
    )

  from .autonotebook import tqdm as notebook_tqdm
  client.recreate_collection(


Financial News Data

In [7]:
microsoft_news = pd.read_csv("data/microsoft_financial_news.csv")
apple_news = pd.read_csv("data/apple_financial_news.csv")
apple_news.head()


Unnamed: 0,title,date,text,source
0,Apple announces new MacBook Air laptops with i...,2024-03-04,Apple on Monday announced new versions of its ...,https://www.cnbc.com/technology/
1,Here's what Meta CEO Mark Zuckerberg has to sa...,2024-02-14,Meta CEO Mark Zuckerberg demonstrates an Oculu...,https://www.cnbc.com/technology/
2,Apple's Vision Pro virtual reality headset lau...,2024-02-02,The first customer walks out of the Apple Stor...,https://www.cnbc.com/technology/
3,Apple Vision Pro review: This is the future of...,2024-01-30,In this article AAPL Follow your favorite stoc...,https://www.cnbc.com/technology/
4,"Apple $3,499 Vision Pro headset now available ...",2024-01-19,"Preorders for Apple 's $3,499 Vision Pro heads...",https://www.cnbc.com/technology/


In [8]:
## Creating Dense Vectors for News Articles
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Load the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

news_articles = list(apple_news.text.values)

# Generate dense vectors for each article
dense_vectors = model.encode(news_articles)

dense_vectors.shape


[nltk_data] Downloading package punkt to /home/saranath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saranath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(14, 384)

In [9]:
from qdrant_client import QdrantClient
# Create a collection named "financial_data"
client.recreate_collection(
    collection_name="news_sentiment",
    vectors_config=VectorParams(
        size=dense_vectors.shape[1],  # Dimensionality of the vector (SMA_20_norm, RSI_norm, MACD_norm)
        distance=Distance.COSINE  # Distance metric (can be COSINE, EUCLID, etc.)
    )
)

for i, vector in enumerate(dense_vectors):
    client.upsert(
        collection_name="news_sentiment",
        points=[
            {
                "id": i+1,
                "vector": vector.tolist(),
                "payload":{"title":apple_news.title.iloc[i],
                           "date":apple_news.date.iloc[i]}
            }
        ]
    )

  client.recreate_collection(


You can query the sparse vectors to retrieve real-time market indicators.



In [10]:
sparse_vectors.shape

(333, 4)

In [11]:
query_vector = np.array([1, 1, 0, 0])
results_sparse = client.search(
    collection_name="financial_data",
    query_vector=query_vector.tolist(),
    limit=4,  
)

print("Market Indicators Results:", results_sparse)

Market Indicators Results: [ScoredPoint(id=326, version=325, score=2.0630735e-09, payload={'date': '2024-04-19', 'RSI': 0.4106571959163894}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=20, version=19, score=1.9640336e-09, payload={'date': '2023-01-31', 'RSI': 0.8029493541256731}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=327, version=326, score=1.916854e-09, payload={'date': '2024-04-22', 'RSI': 0.44604314177506993}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=325, version=324, score=1.8119324e-09, payload={'date': '2024-04-18', 'RSI': 0.4194484246240115}, vector=None, shard_key=None, order_value=None)]


Similarly, dense vectors can be queried to analyze financial news sentiment.

In [12]:
query_vector = model.encode("Apple stock rises due to new product launch")

results_dense = client.search(
    collection_name="news_sentiment",
    query_vector=query_vector.tolist(),
    limit=4
)

print("News Sentiment Results:", results_dense)


News Sentiment Results: [ScoredPoint(id=1, version=0, score=0.513553, payload={'title': 'Apple announces new MacBook Air laptops with its latest M3 chip', 'date': '2024-03-04'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=6, version=5, score=0.46926737, payload={'title': 'Apple reportedly plans big overhaul to iPad family to make it less confusing', 'date': '2023-12-11'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=8, version=7, score=0.36333683, payload={'title': 'Apple iPhone 14 gets another free year of satellite Emergency SOS', 'date': '2023-11-15'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=4, version=3, score=0.35279945, payload={'title': 'Apple Vision Pro review: This is the future of computing and entertainment', 'date': '2024-01-30'}, vector=None, shard_key=None, order_value=None)]


In [13]:
# Combine results based on relevance or score
def combine_results(results_sparse,results_dense):
    combined_results = {
        "market_indicators": results_sparse,  # From sparse vector query
        "news_sentiment": results_dense  # From dense vector query
    }

    #print("Combined Results:", combined_results)
    return combined_results

def analyze_combined_results(combined_results):
    """
    Analyzes the combined results from sparse and dense vector queries.

    Parameters:
        combined_results (dict): Dictionary containing search results from sparse and dense vector queries.
                                 Example structure:
                                 {
                                     "market_indicators": [ ... ],
                                     "news_sentiment": [ ... ]
                                 }

    Returns:
        dict: A summary of the analysis, including key insights.
    """

    # Extract results
    market_results = combined_results.get('market_indicators', [])
    news_results = combined_results.get('news_sentiment', [])

    # Analyze market indicators
    market_insights = []
    for result in market_results:
        market_insights.append({
            "date": result.payload.get('date',"N/A"),
            "score": result.score,  # Relevance score
            "id": result.id,
            "indicator_vector": result.vector,  # The sparse vector itself
            "RSI": result.payload.get("RSI","N/A")
        })

    # Analyze news sentiment
    news_insights = []
    for result in news_results:
        news_insights.append({
            "headline": result.payload.get('title', 'N/A'),
            "score": result.score,  # Relevance score
            "id": result.id,
            "date": result.payload.get('date', 'N/A'),
            "sentiment_vector": result.vector  # The dense vector itself
        })

    # Combine insights for a summary
    analysis_summary = {
        "market_insights": market_insights,
        "news_insights": news_insights,
        "combined_summary": f"Top market indicator on {market_insights[0]['date']} with relevance score {market_insights[0]['score']} with an RSI of about {market_insights[0]["RSI"]}."
                            f" Associated news headline: '{news_insights[0]['headline']}' dated {news_insights[0]['date']} with sentiment score {news_insights[0]['score']}."
    }

    return analysis_summary




Building a Real-Time Trading Signal Generator Using Qdrant

This system can generate real-time trading signals by querying both sparse and dense vectors.

In [14]:
# Example function to generate a trading signal
def generate_trading_signal(stock_query, sentiment_query):
    market_results = client.search(collection_name="financial_data", query_vector=stock_query.tolist(), limit=5)
    sentiment_results = client.search(collection_name="news_sentiment", query_vector=sentiment_query.tolist(), limit=5)
    
    # Combine or analyze results to generate a signal
    signal = combine_results(market_results, sentiment_results)
    analysis_summary = analyze_combined_results(signal)
    return analysis_summary

# Example use
stock_query = np.array([1, 1, 0, 0])
sentiment_query = model.encode("Apple stock rises due to new product launch")

signal = generate_trading_signal(stock_query, sentiment_query)
print("Generated Trading Signal:", signal.get("combined_summary"))


Generated Trading Signal: Top market indicator on 2024-04-19 with relevance score 2.0630735e-09 with an RSI of about 0.4106571959163894. Associated news headline: 'Apple announces new MacBook Air laptops with its latest M3 chip' dated 2024-03-04 with sentiment score 0.513553.


In [15]:
signal.get("combined_summary")

"Top market indicator on 2024-04-19 with relevance score 2.0630735e-09 with an RSI of about 0.4106571959163894. Associated news headline: 'Apple announces new MacBook Air laptops with its latest M3 chip' dated 2024-03-04 with sentiment score 0.513553."

Hybrid Search in Financial Analytics
Using Qdrant’s Hybrid Search Capabilities
Qdrant supports hybrid search, which combines both sparse and dense vectors for nuanced insights.

### Hybrid Search by Combining Sparse and Dense Together through Qdrant

In [16]:
from tqdm import tqdm

client.set_model("sentence-transformers/all-MiniLM-L6-v2")
# comment this line to use dense vectors only
client.set_sparse_model("prithivida/Splade_PP_en_v1")

if not client.collection_exists("hybrid_search"):
    client.create_collection(
        collection_name="hybrid_search",
        vectors_config=client.get_fastembed_vector_params(),
        # comment this line to use dense vectors only
        sparse_vectors_config=client.get_fastembed_sparse_vector_params(),  
    )
documents = list(apple_news["text"].values)
metadata = list(apple_news[["title","text","date"]])

client.add(
    collection_name="hybrid_search",
    documents=documents,
    parallel=0,  # Use all available CPU cores to encode data. 
    ids=tqdm(range(len(documents)))
    # Requires wrapping code into if __name__ == '__main__' block
)


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 68985.26it/s]


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 77385.68it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 18236.10it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 38339.16it/s]

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 13943.83it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 11618.57it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12242.57it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 57456.22it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9425.40it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 68089.35it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 45889.54it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 43419.30it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 73326.99it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 45392.90it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 18251.98it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 21076.90

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [17]:
from fastapi import FastAPI

# The file where HybridSearcher is stored
from hybrid_search import HybridSearcher

app = FastAPI()

# Create a neural searcher instance
hybrid_searcher = HybridSearcher(collection_name="hybrid_search")

def search_db(q: str):
    return {"result": hybrid_searcher.search(text=q)}

search_db("Apple is performing well")["result"]

[{'document': 'In this article AAPL Follow your favorite stocks CREATE FREE ACCOUNT\n\nwatch now\n\nIt\'s night. I\'m at a lake near Oregon\'s Mount Hood, sitting on the beach. Jazz music is playing as I write. I\'m not in the real world. Well, I sort of am. I\'m wearing Apple\' s new Vision Pro headset, which looks like a fancy pair of glowing ski goggles. Apple\'s long-awaited headset, which starts at $3,500, launches in the U.S. on Friday. It\'s the company\'s first major new gadget to hit the market since the Apple Watch debuted in April 2015. I\'ve been testing it for nearly a week. While it has some shortcomings, it\'s easily the most fun new product I\'ve tried out in years. Analysts don\'t expect the Vision Pro to drive massive amounts of revenue initially. UBS anticipates Apple will ship about 400,000 headsets, leading to a "relatively immaterial" $1.4 billion in revenue this year. However, I\'m convinced that if Apple eventually sells cheaper versions, we\'ll see millions of 

### Hybrid Search by Individual Collections from Sparse and Dense

In [18]:
from qdrant_client import QdrantClient


class HybridSearcher:
    def __init__(self, collection_name):
        self.collection_name = collection_name
        # initialize Qdrant client
        self.qdrant_client = client
    def search(self, vector):
        if type(self.collection_name) == str:
            search_result = client.search(collection_name=self.collection_name, query_vector=vector.tolist(), limit=5)
        else:
            search_result = client.search(collection_name=self.collection_name[0], query_vector=vector[0].tolist(), limit=5)
            search_result2 = client.search(collection_name=self.collection_name[1], query_vector=vector[1].tolist(), limit=5)

        # `search_result` contains found vector ids with similarity scores 
        # along with the stored payload
        
        # Select and return metadata
        if self.collection_name == "news_sentiment":
            metadata = [hit.score for hit in search_result]
        elif self.collection_name == "financial_data": 
            metadata = [hit.score for hit in search_result]
        else:
            metadata1 = [[hit.payload.get("date"), hit.payload.get("RSI")] for hit in search_result2]
            metadata2 = [[hit.payload.get("date"), hit.payload.get("title")] for hit in search_result]

            metadata = pd.DataFrame(metadata1,columns=["date","RSI"]),pd.DataFrame(metadata2,columns=["date","title"])
        return metadata

hybrid_searcher = HybridSearcher(collection_name=["news_sentiment","financial_data"])
dense_vector = model.encode("Apple is performing well")
sparse_vector = np.array([1,1,0,0])
vector = [dense_vector,sparse_vector]
metadata = hybrid_searcher.search(vector)
metadata

(         date       RSI
 0  2024-04-19  0.410657
 1  2023-01-31  0.802949
 2  2024-04-22  0.446043
 3  2024-04-18  0.419448
 4  2024-04-23  0.450980,
          date                                              title
 0  2023-12-11  Apple reportedly plans big overhaul to iPad fa...
 1  2024-03-04  Apple announces new MacBook Air laptops with i...
 2  2023-11-15  Apple iPhone 14 gets another free year of sate...
 3  2024-01-19  Apple $3,499 Vision Pro headset now available ...
 4  2024-01-30  Apple Vision Pro review: This is the future of...)

In [19]:
# Dense Query Result
hybrid_searcher = HybridSearcher(collection_name="news_sentiment")
dense_vector = model.encode("Apple is performing well")
dense_results = hybrid_searcher.search(dense_vector)
print(dense_results)

hybrid_searcher = HybridSearcher(collection_name="financial_data")
sparse_vector = np.array([1,1,0,0])
sparse_results = hybrid_searcher.search(sparse_vector)
print(sparse_results)


[0.4623789, 0.45850566, 0.45688626, 0.32975656, 0.321362]
[2.0630735e-09, 1.9640336e-09, 1.916854e-09, 1.8119324e-09, 1.7531575e-09]


Implement advanced techniques like Reciprocal Rank Fusion (RRF) for combining results:



In [20]:
def reciprocal_rank_fusion(results):
    combined_rank = 0
    for rank, result in enumerate(results, start=1):
        combined_rank += 1 / rank
    return combined_rank

rrf_score = reciprocal_rank_fusion([sparse_results, dense_results])
print(rrf_score)

1.5
