In [None]:
# pip installs
!pip install rdflib
!pip install datasets
!pip install transformers
!pip install torch
!pip install elasticsearch
!pip install datasets
!pip install pandas tqdm fastapi pydantic chromadb huggingface_hub
!pip install bitsandbytes
!pip install SPARQLWrapper
!pip install rdflib-sqlalchemy
!pip install networkx pyvis


Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (f

# creating Knowledge Graph


To create a knowledge graph from a Relational Database we did the following:

Ontology Design: Developed an ontology with entities like User, Location, Profile, Session, Content, Reaction, and ReactionType, defining properties and relationships (e.g., hasName, hasLocation, hasInterests) to represent real-world semantics.

Data Loading: Imported data from CSV files representing entities, ensuring integrity and limiting size for demonstration.

Knowledge Graph Construction: Converted dataset records into RDF triples using the ontology, assigning unique URIs to entities and relationships. Established inter-entity links (e.g., users to profiles, content, and reactions).

Visualization: Transformed the RDF graph into a NetworkX DiGraph for visualization, using color-coded nodes for entities, annotated edges for relationships, and a spring layout. Saved the visualized graph as an image.

Serialization: Exported the knowledge graph in Turtle (.ttl) format for reuse and further analysis.








In [None]:
import pandas as pd
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, XSD
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime
import re

class SocialNetworkKG:
    def __init__(self):
        self.g = Graph()
        # Define namespaces
        self.ns = Namespace("http://example.org/social/")
        self.schema = Namespace("http://schema.org/")

        # Bind namespaces for prettier output
        self.g.bind("social", self.ns)
        self.g.bind("schema", self.schema)

    @staticmethod
    def sanitize_uri(text):
        """Convert text to URI-safe format"""
        # Replace spaces with underscores and remove special characters
        safe_text = re.sub(r'[^a-zA-Z0-9_-]', '', text.replace(' ', '_'))
        return safe_text

    def create_ontology(self):
        """Create ontology for all entities"""
        # Classes
        classes = ['User', 'Location', 'Profile', 'Session', 'Content',
                  'Reaction', 'ReactionType']
        for class_name in classes:
            self.g.add((self.ns[class_name], RDF.type, RDFS.Class))

        # Properties for User
        self.g.add((self.ns['hasName'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasEmail'], RDF.type, RDF.Property))

        # Properties for Location
        self.g.add((self.ns['hasAddress'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasLocation'], RDF.type, RDF.Property))

        # Properties for Profile
        self.g.add((self.ns['hasInterests'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasAge'], RDF.type, RDF.Property))

        # Properties for Session
        self.g.add((self.ns['hasDevice'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasDuration'], RDF.type, RDF.Property))

        # Properties for Content
        self.g.add((self.ns['hasType'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasCategory'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasURL'], RDF.type, RDF.Property))

        # Properties for Reaction
        self.g.add((self.ns['hasReactionType'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasDateTime'], RDF.type, RDF.Property))

        # Properties for ReactionType
        self.g.add((self.ns['hasSentiment'], RDF.type, RDF.Property))
        self.g.add((self.ns['hasScore'], RDF.type, RDF.Property))

    def load_data(self, data_files):
        """Load data from CSV files"""
        try:
            self.dataframes = {}
            for key, file_path in data_files.items():
                self.dataframes[key] = pd.read_csv(file_path).head(50)
                print(f"Loaded {len(self.dataframes[key])} {key} records")
        except Exception as e:
            print(f"Error loading CSV files: {str(e)}")
            raise

    def create_knowledge_graph(self):
        """Convert all dataframes to knowledge graph"""
        # Add Users
        if 'users' in self.dataframes:
            for _, user in self.dataframes['users'].iterrows():
                user_uri = URIRef(self.ns[f"user_{user['User ID']}"])
                self.g.add((user_uri, RDF.type, self.ns['User']))
                self.g.add((user_uri, self.ns['hasName'], Literal(user['Name'], datatype=XSD.string)))
                self.g.add((user_uri, self.ns['hasEmail'], Literal(user['Email'], datatype=XSD.string)))

        # Add Locations
        if 'locations' in self.dataframes:
            for _, location in self.dataframes['locations'].iterrows():
                location_uri = URIRef(self.ns[f"location_{location['User ID']}"])
                user_uri = URIRef(self.ns[f"user_{location['User ID']}"])
                self.g.add((location_uri, RDF.type, self.ns['Location']))
                self.g.add((location_uri, self.ns['hasAddress'], Literal(location['Address'], datatype=XSD.string)))
                self.g.add((user_uri, self.ns['hasLocation'], location_uri))

        # Add Profiles
        if 'profiles' in self.dataframes:
            for _, profile in self.dataframes['profiles'].iterrows():
                user_uri = URIRef(self.ns[f"user_{profile['User ID']}"])
                profile_uri = URIRef(self.ns[f"profile_{profile['User ID']}"])
                self.g.add((profile_uri, RDF.type, self.ns['Profile']))
                self.g.add((profile_uri, self.ns['hasAge'], Literal(profile['Age'], datatype=XSD.integer)))
                interests = profile['Interests'] if isinstance(profile['Interests'], str) else str(profile['Interests'])
                self.g.add((profile_uri, self.ns['hasInterests'], Literal(interests, datatype=XSD.string)))
                self.g.add((user_uri, self.ns['hasProfile'], profile_uri))

        # Add Sessions
        if 'sessions' in self.dataframes:
            for _, session in self.dataframes['sessions'].iterrows():
                session_uri = URIRef(self.ns[f"session_{session['User ID']}_{session.name}"])
                user_uri = URIRef(self.ns[f"user_{session['User ID']}"])
                self.g.add((session_uri, RDF.type, self.ns['Session']))
                self.g.add((session_uri, self.ns['hasDevice'], Literal(session['Device'], datatype=XSD.string)))
                self.g.add((session_uri, self.ns['hasDuration'], Literal(session['Duration'], datatype=XSD.float)))
                self.g.add((user_uri, self.ns['hasSession'], session_uri))

        # Add Content
        if 'content' in self.dataframes:
            for _, content in self.dataframes['content'].iterrows():
                content_uri = URIRef(self.ns[f"content_{content['Content ID']}"])
                user_uri = URIRef(self.ns[f"user_{content['User ID']}"])
                self.g.add((content_uri, RDF.type, self.ns['Content']))
                self.g.add((content_uri, self.ns['hasType'], Literal(content['Type'], datatype=XSD.string)))
                self.g.add((content_uri, self.ns['hasCategory'], Literal(content['Category'], datatype=XSD.string)))
                self.g.add((content_uri, self.ns['hasURL'], Literal(content['URL'], datatype=XSD.string)))
                self.g.add((user_uri, self.ns['hasContent'], content_uri))

        # Add Reactions
        if 'reactions' in self.dataframes:
            for _, reaction in self.dataframes['reactions'].iterrows():
                reaction_uri = URIRef(self.ns[f"reaction_{reaction['Content ID']}_{reaction['User ID']}"])
                user_uri = URIRef(self.ns[f"user_{reaction['User ID']}"])
                content_uri = URIRef(self.ns[f"content_{reaction['Content ID']}"])
                self.g.add((reaction_uri, RDF.type, self.ns['Reaction']))
                reaction_type_safe = self.sanitize_uri(str(reaction['Type']))
                self.g.add((reaction_uri, self.ns['hasType'], Literal(reaction_type_safe, datatype=XSD.string)))
                self.g.add((reaction_uri, self.ns['hasDateTime'], Literal(reaction['Datetime'], datatype=XSD.dateTime)))
                self.g.add((user_uri, self.ns['hasReaction'], reaction_uri))
                self.g.add((content_uri, self.ns['hasReaction'], reaction_uri))

        # Add ReactionTypes
        if 'reaction_types' in self.dataframes:
            for _, r_type in self.dataframes['reaction_types'].iterrows():
                # Sanitize the type string for URI
                safe_type = self.sanitize_uri(str(r_type['Type']))
                type_uri = URIRef(self.ns[f"reactionType_{safe_type}"])
                self.g.add((type_uri, RDF.type, self.ns['ReactionType']))
                self.g.add((type_uri, self.ns['hasSentiment'], Literal(r_type['Sentiment'], datatype=XSD.string)))
                self.g.add((type_uri, self.ns['hasScore'], Literal(r_type['Score'], datatype=XSD.integer)))


    def visualize(self, output_file='social_network_graph.png'):
        """Visualize the knowledge graph"""
        G = nx.DiGraph()

        # Add edges to NetworkX graph with different colors for different relationship types
        edge_colors = []
        for s, p, o in self.g:
            s_label = str(s).split('/')[-1]
            p_label = str(p).split('/')[-1]

            if isinstance(o, Literal):
                o_label = str(o)[:30] + '...' if len(str(o)) > 30 else str(o)
            else:
                o_label = str(o).split('/')[-1]

            # Add nodes with types
            node_types = ['user', 'location', 'profile', 'session', 'content', 'reaction', 'reactionType']
            for node_type in node_types:
                if node_type in s_label.lower():
                    G.add_node(s_label, node_type=node_type)
                    break
            else:
                G.add_node(s_label)

            G.add_node(o_label)
            G.add_edge(s_label, o_label, label=p_label)

        # Create visualization with improved layout
        plt.figure(figsize=(20, 15))
        pos = nx.spring_layout(G, k=2, iterations=50)

        # Draw nodes with different colors for different types
        node_colors = {
            'user': 'lightblue',
            'location': 'lightgreen',
            'profile': 'lightpink',
            'session': 'lightyellow',
            'content': 'lightgray',
            'reaction': 'coral',
            'reactionType': 'lavender'
        }

        for node_type, color in node_colors.items():
            nodes = [n for n, attr in G.nodes(data=True) if attr.get('node_type') == node_type]
            if nodes:
                nx.draw_networkx_nodes(G, pos, nodelist=nodes, node_color=color,
                                     node_size=2000, label=node_type.capitalize())

        # Draw remaining nodes
        other_nodes = [n for n in G.nodes() if 'node_type' not in G.nodes[n]]
        if other_nodes:
            nx.draw_networkx_nodes(G, pos, nodelist=other_nodes,
                                 node_color='white', node_size=1000)

        # Draw edges and labels
        nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, arrowsize=20)
        nx.draw_networkx_labels(G, pos, font_size=8)

        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=6)

        plt.title("Social Network Knowledge Graph")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.axis('off')
        plt.tight_layout()

        # Save visualization
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Visualization saved as {output_file}")

    def save_graph(self, output_file='social_network_kg.ttl'):
        """Save the knowledge graph to a TTL file"""
        self.g.serialize(destination=output_file, format="turtle")
        print(f"Knowledge graph saved as {output_file}")

# Example usage
def create_social_network_kg(data_files):
    """Create and visualize knowledge graph from CSV files"""
    kg = SocialNetworkKG()
    kg.create_ontology()
    kg.load_data(data_files)
    kg.create_knowledge_graph()
    kg.visualize()
    kg.save_graph()
    return kg

# Example usage:
data_files = {
    'users': '/content/User - User (1).csv.csv',
    'locations': '/content/Location - Location (1).csv.csv',
    'profiles': '/content/Profile - Profile (1).csv (1).csv',
    'sessions': '/content/Session - Session (1).csv.csv',
    'content': '/content/Content - Content (1).csv.csv',
    'reactions': '/content/Reactions - Reactions (1).csv.csv',
    'reaction_types': '/content/ReactionTypes (1) - ReactionTypes (1).csv.csv'
}

kg = create_social_network_kg(data_files)

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2163, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 27, in parse_datetime
    raise ISO8601Error(
isodate.isoerror.ISO8601Error: ISO 8601 time designator 'T' missing. Unable to parse datetime string '2021-04-22 15:17:15'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2,

Loaded 50 users records
Loaded 50 locations records
Loaded 50 profiles records
Loaded 50 sessions records
Loaded 50 content records
Loaded 50 reactions records
Loaded 16 reaction_types records


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2163, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 27, in parse_datetime
    raise ISO8601Error(
isodate.isoerror.ISO8601Error: ISO 8601 time designator 'T' missing. Unable to parse datetime string '2021-01-06 19:13:01'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2,

Visualization saved as social_network_graph.png
Knowledge graph saved as social_network_kg.ttl


# SPARQL WAY

a. Knowledge Graph Setup

Loaded the TTL file and fixed any formatting issues (e.g., datetime inconsistencies).
Defined and bound custom namespaces to ensure the knowledge graph could interpret domain-specific entities.

b. Query Type Identification

Designed predefined query templates for common queries (e.g., user info, content, age distribution).
Used a SentenceTransformer to encode natural language patterns for each query type.
Compared user questions against these encoded patterns to determine the most likely query type.

c. Entity and Parameter Extraction

Extracted user-specific entities (e.g., names or ages) from questions using regex and semantic similarity techniques.
Mapped extracted names to valid users in the knowledge graph by handling typos and partial matches.

d. Query Execution

Based on the identified query type, constructed and executed SPARQL queries on the knowledge graph.
Processed query results into meaningful and human-readable responses.

e. Semantic Question Matching

Applied embeddings to match user questions with predefined templates, enabling the system to generalize beyond rigid keyword-based queries.

In [None]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple, Dict
import re
from datetime import datetime

class EnhancedKnowledgeGraphQA:
    def __init__(self, ttl_file: str):
        """Initialize the QA system with a TTL file"""
        self.g = Graph()

        # Custom parsing with datetime handling
        try:
            self.g.parse(ttl_file, format="turtle")
        except Exception as e:
            # If parsing fails, try pre-processing the file to fix datetime format
            self._fix_and_load_ttl(ttl_file)

        self.ns = Namespace("http://example.org/social/")
        self.g.bind("social", self.ns)
        self.g.bind("rdf", RDF)
        self.g.bind("rdfs", RDFS)
        self.g.bind("xsd", XSD)

        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.valid_users = self.list_users()

        self.query_templates = self._create_query_templates()
        self.pattern_embeddings = self._encode_patterns()

    def _fix_and_load_ttl(self, ttl_file: str):
        """Fix datetime format in TTL file and load it"""
        with open(ttl_file, 'r') as f:
            content = f.read()

        # Replace datetime format to match ISO 8601
        pattern = r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})'
        content = re.sub(pattern, lambda m: m.group(1).replace(' ', 'T'), content)

        # Parse the fixed content
        self.g.parse(data=content, format="turtle")

    def _create_query_templates(self) -> Dict[str, str]:
        """Create SPARQL query templates for different question types"""
        return {
            "user_info": """
                PREFIX social: <http://example.org/social/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

                SELECT ?name ?email ?age ?interests
                WHERE {
                    ?user rdf:type social:User ;
                          social:hasName ?name ;
                          social:hasEmail ?email ;
                          social:hasProfile ?profile .
                    ?profile social:hasAge ?age ;
                            social:hasInterests ?interests .
                    FILTER(LCASE(str(?name)) = LCASE("%s"))
                }
            """,

            "user_content": """
                PREFIX social: <http://example.org/social/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

                SELECT ?type ?category ?url
                WHERE {
                    ?user rdf:type social:User ;
                          social:hasName ?name ;
                          social:hasContent ?content .
                    ?content social:hasType ?type ;
                            social:hasCategory ?category ;
                            social:hasURL ?url .
                    FILTER(LCASE(str(?name)) = LCASE("%s"))
                }
            """,

            "user_reactions": """
                PREFIX social: <http://example.org/social/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

                SELECT ?content_type ?reaction_type ?datetime
                WHERE {
                    ?user rdf:type social:User ;
                          social:hasName ?name ;
                          social:hasReaction ?reaction .
                    ?reaction social:hasType ?reaction_type ;
                             social:hasDateTime ?datetime .
                    ?content social:hasReaction ?reaction ;
                            social:hasType ?content_type .
                    FILTER(LCASE(str(?name)) = LCASE("%s"))
                }
            """,

            "age_based_interests": """
                PREFIX social: <http://example.org/social/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

                SELECT DISTINCT ?interests
                WHERE {
                    ?user rdf:type social:User ;
                          social:hasProfile ?profile .
                    ?profile social:hasAge ?age ;
                            social:hasInterests ?interests .
                    FILTER(?age = %d)
                }
            """,

            "common_interests": """
                PREFIX social: <http://example.org/social/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

                SELECT ?interests (COUNT(DISTINCT ?user) as ?count)
                WHERE {
                    ?user rdf:type social:User ;
                          social:hasProfile ?profile .
                    ?profile social:hasInterests ?interests .
                }
                GROUP BY ?interests
                ORDER BY DESC(?count)
            """,

            "age_distribution": """
                PREFIX social: <http://example.org/social/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

                SELECT ?age (COUNT(DISTINCT ?user) as ?count)
                WHERE {
                    ?user rdf:type social:User ;
                          social:hasProfile ?profile .
                    ?profile social:hasAge ?age .
                }
                GROUP BY ?age
                ORDER BY ?age
            """
        }

    def _encode_patterns(self) -> Dict[str, np.ndarray]:
        """Create embeddings for question patterns"""
        patterns = {
            "user_info": [
                "What is the information about user",
                "Tell me about user",
                "What are the details of user",
                "Show me user profile",
                "Who is"
            ],
            "user_content": [
                "What content has user posted",
                "Show me posts by user",
                "What has user shared",
                "List content from user"
            ],
            "user_reactions": [
                "What are the reactions by user",
                "Show me how user reacted",
                "What has user liked or commented on",
                "List user's reactions"
            ],
            "age_based_interests": [
                "What are people interested in at age",
                "Show me interests of users who are",
                "What do people like at age",
                "What interests do year olds have"
            ],
            "common_interests": [
                "What are the most common interests",
                "Show popular interests",
                "What do users like most",
                "Most common interests"
            ],
            "age_distribution": [
                "How many users are there of each age",
                "Show age distribution",
                "Age breakdown of users",
                "User ages statistics"
            ]
        }

        return {
            query_type: self.encoder.encode(patterns[query_type]).mean(axis=0)
            for query_type in patterns
        }

    def _extract_user_name(self, question: str) -> str:
        """Extract user name from question"""
        question_lower = question.lower()

        # Check for exact matches
        for user in self.valid_users:
            if user.lower() in question_lower:
                return user

        # Use regex patterns
        patterns = [
            r"(?:user|about|by)\s+([A-Za-z\s]+?)(?:\s|$)",
            r"(?:who is|where is)\s+([A-Za-z\s]+?)(?:\s|$)",
            r"([A-Za-z\s]+?)(?:'s|\sis)",
        ]

        for pattern in patterns:
            match = re.search(pattern, question, re.IGNORECASE)
            if match:
                extracted_name = match.group(1).strip()
                closest_match = self._find_closest_user(extracted_name)
                if closest_match:
                    return closest_match

        return None

    def _extract_age(self, question: str) -> int:
        """Extract age from question"""
        age_patterns = [
            r"(\d+)\s*(?:year|years|yr|yrs)?\s*old",
            r"age\s*(?:of)?\s*(\d+)",
            r"(\d+)\s*(?:year|years|yr|yrs)?"
        ]

        for pattern in age_patterns:
            match = re.search(pattern, question, re.IGNORECASE)
            if match:
                return int(match.group(1))
        return None

    def _find_closest_user(self, name: str) -> str:
        """Find the closest matching valid user"""
        if not name:
            return None

        name_lower = name.lower()
        for user in self.valid_users:
            if user.lower() == name_lower:
                return user

        for user in self.valid_users:
            if name_lower in user.lower() or user.lower() in name_lower:
                return user

        return None

    def _get_query_type(self, question: str) -> str:
        """Determine the type of query based on semantic similarity"""
        question_embedding = self.encoder.encode(question)

        similarities = {
            query_type: np.dot(question_embedding, pattern_embedding)
            for query_type, pattern_embedding in self.pattern_embeddings.items()
        }

        return max(similarities.items(), key=lambda x: x[1])[0]

    def _process_results(self, results, query_type: str, params: Dict = None) -> str:
        """Process SPARQL query results into natural language response"""
        results_list = list(results)
        if not results_list:
            return f"No results found for this query."

        if query_type == "user_info":
            row = results_list[0]
            return (f"{params['user_name']}'s profile:\n"
                   f"- Email: {row['email']}\n"
                   f"- Age: {row['age']}\n"
                   f"- Interests: {row['interests']}")

        elif query_type == "age_based_interests":
            interests = set([str(row['interests']) for row in results_list])
            age = params['age']
            if not interests:
                return f"No interests found for {age} year olds."
            return f"Users who are {age} years old are interested in: {', '.join(interests)}"

        elif query_type == "common_interests":
            interests = [f"{row['interests']} ({row['count']} users)" for row in results_list]
            return "Most common interests:\n- " + "\n- ".join(interests)

        elif query_type == "age_distribution":
            distribution = [f"Age {row['age']}: {row['count']} users" for row in results_list]
            return "Age distribution:\n- " + "\n- ".join(distribution)

        return "Unable to process the results."

    def answer_question(self, question: str) -> str:
        """Answer a natural language question about the knowledge graph"""
        query_type = self._get_query_type(question)

        if query_type in ["user_info", "user_content", "user_reactions"]:
            user_name = self._extract_user_name(question)
            if not user_name:
                return ("I couldn't identify a user name in your question. Please specify a user.\n"
                       f"Valid users are: {', '.join(self.valid_users)}")
            query = self.query_templates[query_type] % user_name
            results = self.g.query(query)
            return self._process_results(results, query_type, {'user_name': user_name})

        elif query_type == "age_based_interests":
            age = self._extract_age(question)
            if not age:
                return "Please specify an age in your question."
            query = self.query_templates[query_type] % age
            results = self.g.query(query)
            return self._process_results(results, query_type, {'age': age})

        else:  # common_interests, age_distribution
            query = self.query_templates[query_type]
            results = self.g.query(query)
            return self._process_results(results, query_type)

    def list_users(self) -> List[str]:
        """List all users in the knowledge graph"""
        query = """
            PREFIX social: <http://example.org/social/>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

            SELECT DISTINCT ?name
            WHERE {
                ?user rdf:type social:User ;
                      social:hasName ?name .
            }
        """
        results = self.g.query(query)
        return [str(row['name']) for row in results]

    def suggest_questions(self) -> List[str]:
        """Suggest sample questions"""
        users = self.valid_users
        if not users:
            return []

        sample_user = users[0]
        return [
            f"What is the information about user {sample_user}?",
            f"Show me posts by user {sample_user}",
            "What are people interested in at age 14?",
            "What are the most common interests?",
            "Show me the age distribution of users"
        ]

# Example usage
if __name__ == "__main__":
    qa_system = EnhancedKnowledgeGraphQA("social_network_kg.ttl")

    print("Available users:", qa_system.valid_users)

    print("\nSuggested questions:")
    for question in qa_system.suggest_questions():
        print(f"- {question}")

    while True:
        question = input("\nEnter your question (or 'quit' to exit): ")
        if question.lower() == 'quit':
            break
        print(f"\nA: {qa_system.answer_question(question)}")

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2163, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 27, in parse_datetime
    raise ISO8601Error(
isodate.isoerror.ISO8601Error: ISO 8601 time designator 'T' missing. Unable to parse datetime string '2021-04-11 14:29:59'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2,

Available users: ['Deborah Eastes', 'Violet Thomas', 'Jose Pankratz', 'Julia Rivera', 'Daniel Beede', 'Leonard Lavigne', 'Louise Laramore', 'Chester Smith', 'Kerry Flores', 'Christopher Gipson', 'Kathy Johnson', 'Hazel Neville', 'Debbie Stanton', 'William Christensen', 'Betty Ellis', 'Mary Judge', 'Shawn Landau', 'Cristi Miller', 'Gary Hill', 'Suzanne Campbell', 'Bonnie Castillo', 'Diane Hall', 'Lesley Franich', 'Larry Mcgee', 'Kathryn Henderson', 'Gavin Anderson', 'Javier Johnson', 'Kristi Mccarthy', 'Patrick Ponce', 'Michael Deering', 'Yen Bruton', 'Jorge Oglesby', 'Brandon Portillo', 'Amanda Price', 'Kari Jones', 'Richard Bain', 'Rose Cook', 'Judy Cobbley', 'Marie Laflam', 'Justin Hooper', 'Timothy Martin', 'Rhoda Lopez', 'Raymond Whitley', 'Randal Davis', 'Elizabeth Holmes', 'Eva Brown', 'Charles Pennington', 'David Pujols', 'Jo Pearson', 'Douglas Colon']

Suggested questions:
- What is the information about user Deborah Eastes?
- Show me posts by user Deborah Eastes
- What are peo

### Conclusion of the above method


The method performs well but has notable limitations. It requires the questions to explicitly define their type, relying heavily on hardcoded semantic matching. This approach demands accounting for all possible scenarios, making the code lengthy, tedious, and prone to redundancy.

Currently, the questions are modeled specifically around users' age and interests. For example, the system accurately handles queries such as:


*Input: What are people interested in at age 14?*

*Output: Users who are 14 years old are interested in: ['education', 'studying', 'education', 'culture'], ['culture', 'public speaking'].*


However, the system struggles with questions related to other aspects of the database, such as Reactions or ReactionTypes. This highlights the brute-force nature of the approach, which, while functional for predefined scenarios, is overly simplistic and lacks scalability for more complex or diverse queries.










# Using TTL File in an LLM Pipeline

This systematic approach integrates KG reasoning, semantic embedding, and lightweight LLMs to deliver contextually accurate answers to user queries.

**Knowledge Graph Construction:**

Built a knowledge graph (KG) from structured data using RDF triples, with unique URIs for nodes and relationships.
Represented entities (e.g., users, content) and their connections (e.g., reactions, profiles).

**Embedding Initialization:**

Utilized the SentenceTransformer model to encode graph nodes and triples into embeddings for semantic similarity calculations.
Generated textual descriptions for nodes and triples to create meaningful embeddings.

**Relevant Triple Retrieval:**

Calculated semantic similarity between the user's question and graph triples to identify the most relevant facts.

**Entity and Subgraph Extraction:**

Extracted entities mentioned in the question and expanded them into a subgraph of related nodes and edges, with a configurable depth.

**Context Creation:**

Compiled a contextual summary by combining the most relevant triples and subgraph facts, structured for interpretability.

**Response Generation:**

Employed a lightweight OPT-350M model to generate answers based on the context and question, integrating the KG's exact data values.

**Explainability:**

Provided explanations by outlining relevant triples, identified entities, and the subgraph used in reasoning, ensuring transparency in decision-making.



In [None]:
import numpy as np
from rdflib import Graph, Namespace, Literal, URIRef, RDF, RDFS
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Tuple, Set
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import re
from collections import defaultdict

class KGRAGSystem:
    def __init__(self, kg: SocialNetworkKG):
        self.kg = kg
        # Initialize embedding model
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Initialize LLM using a free model
        self.generator = pipeline('text-generation',
                                model='facebook/opt-350m',  # Using OPT model which is freely available
                                max_length=512)
        # Store embeddings for graph nodes and relationships
        self.node_embeddings = {}
        self.triple_embeddings = {}
        self.triple_texts = {}
        self._initialize_embeddings()

    def _initialize_embeddings(self):
        """Initialize embeddings for all nodes and relationships in the graph"""
        print("Initializing embeddings...")

        # Create text representations and embeddings for nodes
        for subject in self.kg.g.subjects():
            if isinstance(subject, URIRef):
                node_text = self._create_node_text(subject)
                self.node_embeddings[str(subject)] = self.embed_model.encode(node_text)

        # Create text representations and embeddings for triples
        for s, p, o in self.kg.g:
            triple_text = self._create_triple_text(s, p, o)
            triple_key = (str(s), str(p), str(o))
            self.triple_texts[triple_key] = triple_text
            self.triple_embeddings[triple_key] = self.embed_model.encode(triple_text)

    def _create_node_text(self, node: URIRef) -> str:
        """Create a textual representation of a node"""
        node_type = None
        properties = []

        # Get node type
        for _, _, o in self.kg.g.triples((node, RDF.type, None)):
            node_type = str(o).split('/')[-1]

        # Get node properties
        for _, p, o in self.kg.g.triples((node, None, None)):
            if p != RDF.type:
                prop_name = str(p).split('/')[-1]
                prop_value = str(o)
                properties.append(f"{prop_name}: {prop_value}")

        # Create readable text
        node_id = str(node).split('/')[-1]
        text = f"{node_type} {node_id} with properties: {'; '.join(properties)}"
        return text

    def _create_triple_text(self, s: URIRef, p: URIRef, o) -> str:
        """Create a textual representation of a triple"""
        subj = str(s).split('/')[-1]
        pred = str(p).split('/')[-1]
        obj = str(o).split('/')[-1] if isinstance(o, URIRef) else str(o)
        return f"{subj} {pred} {obj}"

    def _get_relevant_triples(self, question: str, k: int = 5) -> List[Tuple[str, float]]:
        """Retrieve relevant triples based on question similarity"""
        question_embedding = self.embed_model.encode(question)

        # Calculate similarities with all triples
        similarities = []
        for triple_key, triple_embedding in self.triple_embeddings.items():
            similarity = np.dot(question_embedding, triple_embedding)
            similarities.append((self.triple_texts[triple_key], similarity))

        # Return top-k most similar triples
        return sorted(similarities, key=lambda x: x[1], reverse=True)[:k]

    def _extract_entities(self, question: str) -> Set[str]:
        """Extract entity mentions from the question"""
        entities = set()

        # Extract nodes that match parts of the question
        for node in self.node_embeddings.keys():
            node_name = str(node).split('/')[-1].lower()
            if node_name in question.lower():
                entities.add(node)

        return entities

    def _get_subgraph(self, entities: Set[str], depth: int = 2) -> Set[Tuple]:
        """Extract a relevant subgraph around mentioned entities"""
        subgraph = set()
        current_entities = entities

        for _ in range(depth):
            new_entities = set()
            for entity in current_entities:
                # Add outgoing edges
                for s, p, o in self.kg.g.triples((URIRef(entity), None, None)):
                    subgraph.add((str(s), str(p), str(o)))
                    if isinstance(o, URIRef):
                        new_entities.add(str(o))

                # Add incoming edges
                for s, p, o in self.kg.g.triples((None, None, URIRef(entity))):
                    subgraph.add((str(s), str(p), str(o)))
                    if isinstance(s, URIRef):
                        new_entities.add(str(s))

            current_entities = new_entities

        return subgraph

    def _create_context(self, relevant_triples: List[Tuple[str, float]], subgraph: Set[Tuple]) -> str:
        """Create a context string from relevant triples and subgraph"""
        context = "Knowledge Graph Information:\n\n"

        # Add relevant triples
        context += "Most relevant facts:\n"
        for triple, score in relevant_triples:
            context += f"- {triple}\n"

        # Add subgraph information
        context += "\nRelated information:\n"
        grouped_info = defaultdict(list)
        for s, p, o in subgraph:
            s_name = s.split('/')[-1]
            p_name = p.split('/')[-1]
            o_name = o.split('/')[-1] if 'http' in o else o
            grouped_info[s_name].append(f"{p_name}: {o_name}")

        for entity, facts in grouped_info.items():
            context += f"\n{entity}:\n"
            for fact in facts:
                context += f"- {fact}\n"

        return context

    def _generate_response(self, question: str, context: str) -> str:
        """Generate a response using the LLM"""
        prompt = f"""Based on the following knowledge graph information, please answer the question.
        Try to be specific and use the exact values from the knowledge graph when possible.

        Context:
        {context}

        Question: {question}

        Answer: """

        # Generate response using the pipeline
        response = self.generator(prompt, max_length=512, num_return_sequences=1)[0]['generated_text']

        # Extract the answer part
        answer = response.split("Answer:")[-1].strip()
        return answer

    def answer_question(self, question: str) -> str:
        """Main method to answer questions using RAG approach"""
        # Get relevant triples based on question similarity
        relevant_triples = self._get_relevant_triples(question)

        # Extract mentioned entities
        entities = self._extract_entities(question)

        # Get relevant subgraph
        subgraph = self._get_subgraph(entities)

        # Create context from relevant information
        context = self._create_context(relevant_triples, subgraph)

        # Generate response using LLM
        answer = self._generate_response(question, context)

        return answer

    def get_explanations(self, question: str) -> Dict:
        """Get explanations about the reasoning process"""
        relevant_triples = self._get_relevant_triples(question)
        entities = self._extract_entities(question)
        subgraph = self._get_subgraph(entities)

        return {
            "relevant_triples": [triple for triple, _ in relevant_triples],
            "identified_entities": list(entities),
            "subgraph_size": len(subgraph)
        }


In [None]:
# Initialize the system
rag_system = KGRAGSystem(kg)

questions = [
    "What are the top interests for 14 year olds?",
    "What are the top reaction types?",
    "What are the top locations?"
]

# Loop through the list of questions
for question in questions:
    answer = rag_system.answer_question(question)
    print(f"Question: {question}")
    explanations = rag_system.get_explanations(question)
    print("\nReasoning process:")
    print(f"Relevant triples: {explanations['relevant_triples']}")
    print(f"Identified entities: {explanations['identified_entities']}")
    print(f"Subgraph size: {explanations['subgraph_size']}")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Initializing embeddings...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Question: What are the top interests for 14 year olds?

Reasoning process:
Relevant triples: ["profile_da97952a-2eba-46c4-a19e-3df3c40a1524 hasInterests ['animals', 'travel', 'science', 'studying']", "profile_63e07d14-7bed-44b8-9cf0-4a7e9b2a99ea hasInterests ['dogs', 'studying', 'public speaking']", "profile_9e9c6089-692f-406c-afee-50f62c127e9d hasInterests ['tennis', 'travel', 'science', 'education']", "profile_9dcb5fd8-9e7f-40d3-bd1d-edf33ecf6388 hasInterests ['food', 'animals', 'science', 'soccer']", "profile_002cd824-10f2-447c-8d1c-940325a1cdf4 hasInterests ['food', 'soccer', 'public speaking', 'public speaking']"]
Identified entities: []
Subgraph size: 0
Question: What are the top reaction types?

Reasoning process:
Relevant triples: ['reaction_97522e57-d9ab-4bd6-97bf-c24d952602d2_1932a904-86ba-4438-bb52-b7e6516a4019 22-rdf-syntax-ns#type Reaction', 'reaction_97522e57-d9ab-4bd6-97bf-c24d952602d2_e74edcea-5db4-4412-a4ce-eb7c5adc314a 22-rdf-syntax-ns#type Reaction', 'reaction_97522e

### Conclusion

The method above brings us closer to achieving our goal of building a QnA model using an LLM on a relational database. This model not only provides answers but also explains the reasoning behind them. However, a drawback is that while it prints relevant triples, the user must manually analyze these to derive the answer. On the positive side, it can answer questions related to all the relations in the database without needing explicit definitions or additional coding.

# Using Llama model

The process was divided into three main components:

Social Network Knowledge Graph (KG): We created a class to load and parse RDF triples from a .ttl file, representing social network data. The graph stores relationships between subjects, predicates, and objects.

RDF Data Processing and Storage: We implemented a processor that extracts triples from the RDF graph and stores them in ChromaDB, a vector database. This allows efficient retrieval of relevant context for answering questions. We used a sentence embedding model ("all-MiniLM-L6-v2") to convert RDF triples into document embeddings for storage.

Q&A System: We developed a Q&A system that uses the stored RDF triples to generate answers. For each query, the system retrieves the most relevant triples from ChromaDB, then passes them as context to a pre-trained causal language model (Llama). The model generates an answer by conditioning on both the query and the context.

In summary, we combined RDF data storage, vector search, and large language model capabilities to create an efficient and flexible system capable of answering questions based on relational data without the need for explicit coding or definitions for each query.

In [None]:
import numpy as np
from rdflib import Graph
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import chromadb
from chromadb.utils import embedding_functions
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("QA System")

# Your Hugging Face token
HF_AUTH_TOKEN = "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"

# Load the model and tokenizer
model_name = "unsloth/llama-3-8b-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_AUTH_TOKEN)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=HF_AUTH_TOKEN)

# Social Network KG class for parsing RDF triples
class SocialNetworkKG:
    def __init__(self, ttl_path: str):
        self.g = Graph()
        self.g.parse(ttl_path, format="turtle")

    def triples(self):
        return self.g

# RDF Data Processor class for extracting and storing RDF triples
class RDFDataProcessor:
    def __init__(self, ttl_path: str):
        """Initialize RDF processor and ChromaDB integration."""
        self.graph = SocialNetworkKG(ttl_path)
        self.chroma_client = chromadb.Client()
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        try:
            self.collection = self.chroma_client.get_collection(
                name="rdf_triples",
                embedding_function=self.embedding_function
            )
        except:
            self.collection = self.chroma_client.create_collection(
                name="rdf_triples",
                embedding_function=self.embedding_function
            )

    def extract_and_store_triples(self):
        """Extract triples from RDF graph and store them in ChromaDB."""
        if self.collection.count() > 0:
            logger.info("Collection already contains data.")
            return

        documents, metadatas, ids = [], [], []
        for idx, (s, p, o) in enumerate(self.graph.triples()):
            text = f"{str(s).split('/')[-1]} {str(p).split('/')[-1]} {str(o)}"
            metadata = {'subject': str(s), 'predicate': str(p), 'object': str(o)}
            documents.append(text)
            metadatas.append(metadata)
            ids.append(f"triple_{idx}")

            if len(documents) >= 1000:
                self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
                documents, metadatas, ids = [], [], []

        if documents:
            self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
        logger.info(f"Stored {self.collection.count()} triples.")

# Q&A System class for generating answers
class QASystem:
    def __init__(self, rdf_processor: RDFDataProcessor):
        self.rdf_processor = rdf_processor
        self.embed_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.tokenizer, self.model = self._initialize_model()

    def _initialize_model(self):
        return tokenizer, model

    def get_relevant_context(self, query: str, k: int = 5):
        results = self.rdf_processor.collection.query(
            query_texts=[query],
            n_results=k,
            include=["documents", "metadatas", "distances"]
        )
        return [
            {"text": doc, "metadata": meta, "distance": dist}
            for doc, meta, dist in zip(
                results["documents"][0], results["metadatas"][0], results["distances"][0]
            )
        ]

    def generate_answer(self, query: str):
        context = self.get_relevant_context(query)
        context_text = "\n".join([f"{c['text']} (Relevance: {1 - c['distance']:.2f})" for c in context])

        prompt = f"""Context:\n{context_text}\n\nQuestion: {query}\nAnswer:"""
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(inputs.input_ids, max_length=400)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

if __name__ == "__main__":
    # File path to the .ttl file
    ttl_path = "/content/social_network_kg.ttl"

    # Initialize the RDF processor
    rdf_processor = RDFDataProcessor(ttl_path)
    rdf_processor.extract_and_store_triples()

    # Initialize the Q&A system
    qa_system = QASystem(rdf_processor)

    # Example queries
    while True:
        query = input("Enter your query (or type 'exit' to quit): ")
        if query.lower() == 'exit':
            break
        answer = qa_system.generate_answer(query)
        print(f"Answer: {answer}")




tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2163, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 27, in parse_datetime
    raise ISO8601Error(
isodate.isoerror.ISO8601Error: ISO 8601 time designator 'T' missing. Unable to parse datetime string '2021-04-11 14:29:59'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/isodate/isodatetime.py", line 25, in parse_datetime
    datestring, timestring = datetimestring.split("T")
ValueError: not enough values to unpack (expected 2,

Enter your query (or type 'exit' to quit): what are the top interests of 14 year olds?




Answer: Context:
profile_da97952a-2eba-46c4-a19e-3df3c40a1524 hasInterests ['animals', 'travel','science','studying'] (Relevance: -0.23)
profile_63e07d14-7bed-44b8-9cf0-4a7e9b2a99ea hasInterests ['dogs','studying', 'public speaking'] (Relevance: -0.26)
profile_5ff89fb3-b364-494f-b62d-07097b2ffa12 hasInterests ['food', 'healthy eating', 'culture', 'education'] (Relevance: -0.28)
profile_9e9c6089-692f-406c-afee-50f62c127e9d hasInterests ['tennis', 'travel','science', 'education'] (Relevance: -0.29)
profile_002cd824-10f2-447c-8d1c-940325a1cdf4 hasInterests ['food','soccer', 'public speaking', 'public speaking'] (Relevance: -0.29)

Question: what are the top interests of 14 year olds?
Answer: ['soccer', 'tennis', 'food', 'public speaking']
Enter your query (or type 'exit' to quit): what are the top locations?
Answer: Context:
location_da97952a-2eba-46c4-a19e-3df3c40a1524 22-rdf-syntax-ns#type http://example.org/social/Location (Relevance: -0.33)
location_30a7de62-d6a1-47d6-963f-0724adf18c4

### Conclusion

The approach outlined above brings us closer to achieving a robust QnA model utilizing a large language model (LLM) for relational database querying. This model can effectively provide answers to questions by leveraging relevant triples stored in ChromaDB, which are retrieved based on their semantic relevance to the query. A key advantage is that the model can answer questions related to any relation in the database without requiring predefined rules or additional coding. It dynamically generates responses based on the context provided by the RDF data and the query.

However, a notable drawback is that the model struggles with fully capturing the context of certain queries. For instance, when asked about locations, instead of providing actual places, the model returned a generic statement like "the location id," which lacks specificity. This highlights the model's current limitation in fully understanding and generating contextually accurate answers. While it retrieves relevant triples, the interpretation of certain aspects of the data, could be more nuanced, requiring further refinement in future iterations to improve context understanding.

Overall, this method effectively bridges the gap between structured relational data and natural language understanding, offering a powerful tool for querying and interpreting relational databases with an LLM.