In [5]:
import spacy
import networkx as nx
from sentence_transformers import SentenceTransformer
from collections import defaultdict

In [8]:
# Load NLP model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient embedding model

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
class MiniRAGGraph:
    def __init__(self):
        self.graph = nx.Graph()
        self.entity_chunk_map = defaultdict(set)  # Track which entities belong to which chunk
    
    def add_chunk(self, chunk_id, text):
        """Add text chunk as a node."""
        self.graph.add_node(chunk_id, type="chunk", text=text, embedding=embed_model.encode(text))
    
    def add_entity(self, entity_text, chunk_id):
        """Add entity node and link it to the corresponding chunk."""
        entity_text = entity_text.lower()
        if entity_text not in self.graph:
            self.graph.add_node(entity_text, type="entity", embedding=embed_model.encode(entity_text))
        
        # Link entity to chunk
        self.graph.add_edge(entity_text, chunk_id, relation="mentions")
        self.entity_chunk_map[entity_text].add(chunk_id)
    
    def extract_entities(self, text, chunk_id):
        """Extract entities from text and add them to the graph."""
        doc = nlp(text)
        for ent in doc.ents:
            self.add_entity(ent.text, chunk_id)
    
    def add_entity_entity_edges(self):
        """Create edges between entities if they co-occur in the same chunk."""
        for chunk_id in self.graph.nodes:
            if self.graph.nodes[chunk_id].get("type") == "chunk":
                connected_entities = [n for n in self.graph.neighbors(chunk_id) if self.graph.nodes[n].get("type") == "entity"]
                for i in range(len(connected_entities)):
                    for j in range(i + 1, len(connected_entities)):
                        self.graph.add_edge(connected_entities[i], connected_entities[j], relation="co-occurs")

    def construct_graph(self, text_chunks):
        """Build the entire heterogeneous graph from text chunks."""
        for i, chunk in enumerate(text_chunks):
            chunk_id = f"chunk_{i}"
            self.add_chunk(chunk_id, chunk)
            self.extract_entities(chunk, chunk_id)
        
        self.add_entity_entity_edges()

In [14]:
# Example Usage
text_chunks = [
    "The Master Fund generated a 7.6% gain in the Fourth Quarter, with contributions from equity longs, risk arbitrage, and corporate credit offset modestly by equity shorts and hedges",
    "Third Point’s Chief Compliance Officer, William Song, left the firm earlier in February.",
    "Chief Marketing Officer Jenny Wood left the firm at the end of 2023, marking the culmination of an orderly transition following her decision to leave earlier in the year."
]
rag_graph = MiniRAGGraph()
rag_graph.construct_graph(text_chunks)

In [18]:
[print(i) for i in rag_graph.graph.edges(data=True)]

('chunk_0', 'the master fund', {'relation': 'mentions'})
('chunk_0', '7.6%', {'relation': 'mentions'})
('chunk_0', 'the fourth quarter', {'relation': 'mentions'})
('the master fund', '7.6%', {'relation': 'co-occurs'})
('the master fund', 'the fourth quarter', {'relation': 'co-occurs'})
('7.6%', 'the fourth quarter', {'relation': 'co-occurs'})
('chunk_1', 'third point’s', {'relation': 'mentions'})
('chunk_1', 'william song', {'relation': 'mentions'})
('chunk_1', 'february', {'relation': 'mentions'})
('third point’s', 'william song', {'relation': 'co-occurs'})
('third point’s', 'february', {'relation': 'co-occurs'})
('william song', 'february', {'relation': 'co-occurs'})
('chunk_2', 'jenny wood', {'relation': 'mentions'})
('chunk_2', 'the end of 2023', {'relation': 'mentions'})
('chunk_2', 'earlier in the year', {'relation': 'mentions'})
('jenny wood', 'the end of 2023', {'relation': 'co-occurs'})
('jenny wood', 'earlier in the year', {'relation': 'co-occurs'})
('the end of 2023', 'earli

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [16]:
rag_graph.graph.nodes(data=True)

NodeDataView({'chunk_0': {'type': 'chunk', 'text': 'The Master Fund generated a 7.6% gain in the Fourth Quarter, with contributions from equity longs, risk arbitrage, and corporate credit offset modestly by equity shorts and hedges', 'embedding': array([ 6.24931417e-02, -2.32484676e-02, -4.62568216e-02,  6.07725531e-02,
        1.03526197e-01,  1.52742201e-02, -8.57703574e-03,  6.72909478e-03,
        1.83169115e-02,  1.17052738e-02, -2.82588731e-02,  4.01610509e-02,
       -5.60715748e-03, -7.12699518e-02, -5.78971542e-02, -3.92884947e-03,
       -3.88442278e-02, -2.29247101e-02,  3.42992544e-02,  3.65375094e-02,
       -7.89846107e-02, -5.03293313e-02, -2.37232284e-03,  4.51938994e-02,
        9.13157687e-02, -5.25180511e-02,  3.00289225e-03,  1.31479725e-02,
       -2.92160432e-03, -7.35904500e-02,  3.81264165e-02,  7.02231377e-02,
        4.09961231e-02,  3.62100825e-03, -3.21506672e-02,  5.94636798e-02,
        2.43651234e-02,  3.07800435e-02, -3.11074425e-02,  2.02401611e-03,
   