In [3]:
#pip install pydantic==1.10.2
!pip install --upgrade pydantic


Collecting pydantic
  Using cached pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Using cached pydantic-2.9.2-py3-none-any.whl (434 kB)
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.2
    Uninstalling pydantic-1.10.2:
      Successfully uninstalled pydantic-1.10.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-groq 0.1.10 requires langchain-core<0.3.0,>=0.2.39, but you have langchain-core 0.3.6 which is incompatible.
langchain-experimental 0.0.65 requires langchain-community<0.3.0,>=0.2.16, but you have langchain-community 0.3.1 which is incompatible.
langchain-experimental 0.0.65 requires langchain-core<0.3.0,>=0.2.38, but you have langchain-core 0.3.6 which is incompatible.[0m[31m
[0mSuccessfully installed pydantic-2.9.2


In [27]:
from requests_aws4auth import AWS4Auth
import boto3
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import OpenSearch, RequestsHttpConnection
from langchain_aws import ChatBedrock
import pandas as pd
import json
import re
import time 

In [39]:
# Canned evaluation questions, keyed by their 1-based position rendered as a string.
QUERIES = {
    str(position): question
    for position, question in enumerate(
        (
            'Which transformer is needing an immediate attention?',
            'For transformers 2217, 1235, 1685 and 2484 provide comparative details for transformer health based on Oil quality testing ( dielectric breakdown voltage, IFT, acidity, moisture content, furan analysis) , DGA ( duval, rogers ratio, key gas method), Power factor (IPF/IR), temperature monitoring (top oil temperature , hot spot temperature, cooling efficiency) , load/overload history ( loading prof, percentage overload), partial discharge , electrical testing ( FRA, WRM, Turns ratio, capacitance and dissipation factor) , TAIM, LEDT, dornenburgs method, bushing health (capacitance and power factor, DGA for bushings), THI (HI, risk of failure), LTC. Ensure you provide details on parameter, significance, calculation details, root cause analysis details, actions and measures that must be taken , advice to curb further damage for provided transformer details in a tabular format',
            'what is the duval triangle 1,2,3 value for transformer 2484',
            'How is my transformer fleet - 2217, 1235, 1685 and 2484 doing?',
        ),
        start=1,
    )
}
# Logical index label -> name of the OpenSearch index that is queried.
INDEX_CONFIG = dict(Hackathon_index='innovator_hack_index')

# Keyword Parsing

In [29]:
import boto3
from langchain_aws import ChatBedrock
import re

def load_model(model_id="amazon.titan-text-premier-v1:0", region_name="us-east-1", temperature=0, top_p=1):
    """Build a retry-wrapped Bedrock chat model.

    Called with no arguments this returns the same configuration as before;
    the parameters only expose values that used to be hard-coded.

    Args:
        model_id: Bedrock model identifier handed to ``ChatBedrock``.
        region_name: AWS region used for the ``bedrock-runtime`` client.
        temperature: Sampling temperature forwarded in ``model_kwargs``.
        top_p: Nucleus-sampling value forwarded in ``model_kwargs``.

    Returns:
        The object returned by ``ChatBedrock(...).with_retry()``.
    """
    # A new Bedrock runtime client is created on every call.
    bedrock_runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name=region_name,
    )
    model_kwargs = {
        "temperature": temperature,
        "top_p": top_p,
    }
    return ChatBedrock(
        client=bedrock_runtime,
        model_id=model_id,
        model_kwargs=model_kwargs,
    ).with_retry()

In [30]:
def parse_keywords(content):
    """Extract the two keyword lists from the keyword-extraction model reply.

    The reply is expected to contain ``Numerical Keywords: [...]`` and
    ``Text Keywords: [...]``. A missing section, or an empty ``[]``, yields an
    empty list for that section.

    Args:
        content: Raw text returned by the model.

    Returns:
        A ``(must_keywords, should_keywords)`` tuple of lists of stripped,
        non-empty strings (numerical keywords first, text keywords second).
    """
    def _bracketed_items(label):
        # "Keywords?" also accepts a singular "Keyword:" label in the reply.
        found = re.search(rf'{label} Keywords?:\s*\[(.*?)\]', content)
        if not found:
            return []
        # "".split(',') is [''], so an empty "[]" must not turn into a list
        # holding one blank keyword; blank entries are dropped.
        return [item.strip() for item in found.group(1).split(',') if item.strip()]

    return _bracketed_items('Numerical'), _bracketed_items('Text')

def extract_keywords(query):
    """Ask the chat model to split a user query into numerical and text keywords.

    The prompt tells the model to answer as
    ``Numerical Keywords: [...]; Text Keywords: [...]``; the examples use the
    same plural labels so they agree with that format line and with the
    pattern ``parse_keywords`` searches for.

    Args:
        query: The user's natural-language question.

    Returns:
        Whatever ``parse_keywords`` returns for the model reply, i.e. a
        ``(numerical_keywords, text_keywords)`` tuple of lists.
    """
    prompt = f"""
    Extract the keywords from the following query: "{query}"
    Keywords should include specific identifiers like transformer numbers, names, or other entities relevant to the query.
    Transformer should never be a keyword.
    If the keyword is clearly a transformer ID and it doesn't start with 'TXID_', add this to its prefix
    The output should have format of: Numerical Keywords: [...]; Text Keywords: [...]
    
    <example>
    Input: provide comparative analysis of transformers - 2217, 1235, 1685 and 2484 based on their parameters?
    Output: Numerical Keywords: [TXID_2217, TXID_1235, TXID_1685, TXID_2484]; Text Keywords: …
    
    Input: what is the duval triangle 1,2,3 value for transformer 2484
    Output: Numerical Keywords: [TXID_2484]; Text Keywords: [duval triangle]
    </example>
    """

    # A fresh model (and Bedrock client) is built for every extraction.
    model = load_model()
    response = model.invoke(prompt)

    content = response.content
    # Echo the raw reply so the extracted keywords are visible in the cell output.
    print(content)

    return parse_keywords(content)


# Searching Logic

In [31]:
def keyword_search(opensearch_client, index_name, query, k=2):
    """Run one phrase search per required keyword and pool the hits.

    Keywords come from ``extract_keywords(query)``. Every required ("must")
    keyword becomes its own sub-search in a single ``msearch`` request: the
    keyword must appear as a phrase in ``content``, and the optional
    ("should") keywords are attached as ``match`` clauses.

    Args:
        opensearch_client: Client exposing ``msearch(body=...)``.
        index_name: Index each sub-search is sent to.
        query: The user's natural-language question.
        k: Maximum number of hits requested per required keyword.

    Returns:
        A flat list of hit dicts in sub-search order; ``[]`` when no required
        keyword was extracted.
    """
    must_keywords, should_keywords = extract_keywords(query)

    # Blank entries (e.g. produced when the model answers with an empty "[]")
    # would become clauses on an empty string, so drop them first.
    must_keywords = [keyword for keyword in must_keywords if keyword]
    should_keywords = [keyword for keyword in should_keywords if keyword]

    if not must_keywords:
        # With no required keyword there is no sub-search to send, so skip
        # the msearch round trip instead of posting an empty body.
        return []

    should_clauses = [
        {'match': {'content': should_keyword}}
        for should_keyword in should_keywords
    ]

    msearch_body = []
    for keyword in must_keywords:
        # msearch bodies alternate a header line and a search definition.
        msearch_body.append({'index': index_name})
        msearch_body.append({
            'size': k,
            'query': {
                'bool': {
                    'must': [
                        {'match_phrase': {'content': keyword}}
                    ],
                    'should': should_clauses,
                    'minimum_should_match': 0
                }
            }
        })

    response = opensearch_client.msearch(body=msearch_body)

    all_results = []
    for res in response['responses']:
        # Sub-responses without a hits section are skipped.
        if 'hits' in res and 'hits' in res['hits']:
            all_results.extend(res['hits']['hits'])

    return all_results



def vector_search(opensearch_client, index_name, embedding_model, query, k=2):
    """Embed the query and return its k nearest neighbours from the index.

    Args:
        opensearch_client: Client exposing ``search(index=..., body=...)``.
        index_name: Index to query.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.
        query: Text to embed.
        k: Used both as the k-NN ``k`` and as the response ``size``.

    Returns:
        The ``hits.hits`` list of the search response.
    """
    # The query is embedded as a one-element batch; keep the single vector.
    embeddings = embedding_model.embed_documents([query])
    knn_clause = {'vector': embeddings[0], 'k': k}
    search_body = {
        'size': k,
        'query': {'knn': {'embedding': knn_clause}},
    }
    reply = opensearch_client.search(index=index_name, body=search_body)
    return reply['hits']['hits']

def print_query_result(query, results, search_type):
    """Render search hits as a newline-separated text block for the LLM prompt.

    Every piece goes on its own line; without separators the fields of a
    record run into each other and cannot be told apart.

    Args:
        query: The user's question, echoed in the header line.
        results: Iterable of hits. A dict hit is read through its
            ``'_source'`` entry; anything else is used as the record itself.
            Each record needs a ``'source'`` entry and a ``'content'`` entry
            holding a JSON object.
        search_type: Label used in the header and per-hit lines.

    Returns:
        The rendered text. Only the "No results found." block is also printed;
        a non-empty rendering is returned without being printed.
    """
    divider = "--------------------------------"
    lines = [f"# Query: {query} ({search_type} search)", divider]

    if not results:
        lines += ["No results found.", divider]
        search_results = "\n".join(lines) + "\n"
        print(search_results)
        return search_results

    for i, result in enumerate(results):
        metadata = result['_source'] if isinstance(result, dict) else result
        lines.append(f"# {search_type} search result {i+1} (relevant document chunk):")
        lines.append(f"Source: {metadata['source']}")
        lines.append("Content:")
        # 'content' holds the record serialized as a JSON object.
        row_content = json.loads(metadata['content'])
        lines.extend(f"'{key}': {value}" for key, value in row_content.items())
        lines.append(divider)

    return "\n".join(lines) + "\n"

In [32]:
def test_queries(index_name, query, opensearch_client, embedding_model, search_type):
    """Run one retrieval strategy for a query and wrap the rendered hits.

    Args:
        index_name: Index to search.
        query: The user's question.
        opensearch_client: Client passed through to the search helpers.
        embedding_model: Embedding model used by the vector search.
        search_type: ``'keyword'``, ``'vector'`` or ``'hybrid'``. Any other
            value prints a notice and wraps an empty result.

    Returns:
        The string produced by ``format_output`` for the rendered hits.
    """
    def _by_keyword():
        return keyword_search(opensearch_client, index_name, query)

    def _by_vector():
        return vector_search(opensearch_client, index_name, embedding_model, query)

    # (search_type value, banner printed before searching, label, search callable)
    strategies = (
        ('keyword', f"\nTesting query {query} - Keyword Search:", "Keyword", _by_keyword),
        ('vector', f"\nTesting query {query} - Vector Search:", "Vector", _by_vector),
        # Hybrid is keyword hits followed by vector hits, concatenated as-is.
        ('hybrid', f"\nTesting query {query} - Hybrid Search:", "Hybrid", lambda: _by_keyword() + _by_vector()),
    )

    rendered = ""
    for mode, banner, label, run_search in strategies:
        if search_type == mode:
            print(banner)
            rendered = print_query_result(query, run_search(), label)
            break
    else:
        print(f"Unknown search type: {search_type}")

    return format_output(query, [rendered])

def format_output(query, results):
    """Wrap the query and each result text in the tags used for the LLM prompt.

    Args:
        query: The user's question.
        results: Iterable of already-rendered result strings.

    Returns:
        A ``<query>`` line followed by one ``<documents>`` section per result,
        with a dashed divider line between consecutive sections.
    """
    header = f"<query> # Query: {query} </query>\n"
    sections = (f"<documents>\n{chunk}\n</documents>" for chunk in results)
    return header + "\n--------------------------------\n".join(sections)


In [33]:
# Initialize OpenSearch client
def init_opensearch_client(host, port, region, service):
    """Create an OpenSearch client whose requests are signed with AWS SigV4.

    Args:
        host: Endpoint host name (no scheme).
        port: Endpoint port.
        region: AWS region used for request signing.
        service: AWS service name used for request signing.

    Returns:
        An ``OpenSearch`` client using TLS with certificate verification.

    Raises:
        RuntimeError: If boto3 cannot find any AWS credentials.
    """
    credentials = boto3.Session().get_credentials()
    if credentials is None:
        # get_credentials() returns None when no credentials are configured;
        # fail here with a clear message instead of an AttributeError below.
        raise RuntimeError(
            "No AWS credentials found; configure credentials before creating the OpenSearch client."
        )
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)
    return OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        # NOTE(review): presumably seconds, which would make this 50 minutes - confirm intended.
        timeout=3000
    )


# Connection settings for the OpenSearch endpoint; region and service name are
# also what init_opensearch_client uses to sign requests.
opensearch_host = 'iellhhrn6kean028im78.us-east-1.aoss.amazonaws.com'
opensearch_port = 443
opensearch_region = 'us-east-1'
opensearch_service = 'aoss'
# Name of the index that every search in this notebook targets.
index_name = INDEX_CONFIG['Hackathon_index']
# Not referenced by any code visible in this notebook.
# NOTE(review): presumably the embedding vector size - confirm or remove.
dimension = 1024

# Module-level clients shared by the cells below; both are created eagerly when this cell runs.
opensearch_client = init_opensearch_client(opensearch_host, opensearch_port, opensearch_region, opensearch_service)
# The embedding client reuses the OpenSearch region for its bedrock-runtime client.
embedding_model = BedrockEmbeddings(client=boto3.client("bedrock-runtime", region_name=opensearch_region), model_id="amazon.titan-embed-text-v2:0")

In [34]:
# Smoke-test retrieval: run query '4' through the hybrid path and keep the formatted context string.
search_result = test_queries(index_name, QUERIES['4'], opensearch_client, embedding_model, 'hybrid')


Testing query How is my transformer fleet - 2217, 1235, 1685 and 2484 doing? - Hybrid Search:
Numerical Keywords: [TXID_2217, TXID_1235, TXID_1685, TXID_2484]; Text Keywords: []


# Call Titan LLM Model for contextual output 

In [35]:
def generate_response(model, user_prompt, system_prompt):
    """Send a System/Human/AI formatted prompt to the model.

    Args:
        model: Object exposing ``invoke(input=...)``.
        user_prompt: Text placed in the ``Human:`` turn.
        system_prompt: Text placed in the ``System:`` turn.

    Returns:
        Whatever ``model.invoke`` returns, unmodified.
    """
    # Three turns separated by blank lines; the trailing "AI:" cues the reply.
    turns = (f"System: {system_prompt}", f"Human: {user_prompt}", "AI:")
    return model.invoke(input="\n\n".join(turns))

def clean_response(raw_response):
    """Return the response's ``content`` text without surrounding whitespace.

    Args:
        raw_response: Object with a string ``content`` attribute.

    Returns:
        ``raw_response.content`` with leading and trailing whitespace removed.
    """
    return raw_response.content.strip()

In [36]:
# End-to-end run for query '1': retrieve context with hybrid search, then ask the chat model.
model = load_model()

query = QUERIES['1']
print(f"Running query  {query}")
# test_queries returns the retrieved context already wrapped in <query>/<documents> tags.
search_result=test_queries(index_name, query, opensearch_client, embedding_model, 'hybrid')
# The formatted retrieval output is used verbatim as the human turn of the prompt.
user_prompt = search_result
# System prompt restricting the model to transformer topics and to the supplied data.
system_prompt = """You are a specialized assistant trained to provide information only related to power transformers. You have access to detailed operational, environmental, and performance data for each transformer. If a transformer ID is mentioned, you should provide accurate and factual information related only to that transformer and not generalize across other transformers. You are expected to avoid answering any queries outside the scope of power transformer maintenance, operation, and health.

Instructions:

1. Use only the information from the dataset, and avoid relying on external knowledge for answering questions.
2. Always respond with precise data when the transformer ID is mentioned, only referring to the specified transformer.
3. When multiple transformer IDs are mentioned, compare them side by side using parameters from the dataset (e.g., hydrogen level, temperature, performance metrics, etc.).
4. Flag any transformer needing attention based on unusual parameter readings (e.g., high hydrogen, low operating time, specific alerts).
5. Reject any queries outside the scope of transformer health, operation, or maintenance.
6. Provide detailed responses.

**DO NOT HALLUCINATE.**
"""


response = generate_response(model, user_prompt, system_prompt)

# Strip surrounding whitespace from the model text before showing it.
cleaned_response = clean_response(response)
print(cleaned_response)

Running query  Which transformer is needing an immediate attention?

Testing query Which transformer is needing an immediate attention? - Hybrid Search:
Numerical Keywords: []; Text Keywords: [attention]
Based on the available data, there are no transformers that require immediate attention. All transformers are operating within normal parameters, and there are no alerts or incidents reported. However, it is important to note that regular maintenance and monitoring are crucial to ensure the continued safe and efficient operation of these transformers.


In [37]:
# Show the question next to the model's answer; both names come from the previous cell.
print(query)
print(cleaned_response)

Which transformer is needing an immediate attention?
Based on the available data, there are no transformers that require immediate attention. All transformers are operating within normal parameters, and there are no alerts or incidents reported. However, it is important to note that regular maintenance and monitoring are crucial to ensure the continued safe and efficient operation of these transformers.


In [38]:
from requests_aws4auth import AWS4Auth
import boto3
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import OpenSearch, RequestsHttpConnection
from langchain_aws import ChatBedrock
import pandas as pd
import json
import re
import time 

# Canned evaluation questions, keyed by their 1-based position rendered as a string.
QUERIES = {
    str(position): question
    for position, question in enumerate(
        (
            'Which transformer is needing an immediate attention?',
            'For transformers 2217, 1235, 1685 and 2484 provide comparative details for transformer health based on Oil quality testing ( dielectric breakdown voltage, IFT, acidity, moisture content, furan analysis) , DGA ( duval, rogers ratio, key gas method), Power factor (IPF/IR), temperature monitoring (top oil temperature , hot spot temperature, cooling efficiency) , load/overload history ( loading prof, percentage overload), partial discharge , electrical testing ( FRA, WRM, Turns ratio, capacitance and dissipation factor) , TAIM, LEDT, dornenburgs method, bushing health (capacitance and power factor, DGA for bushings), THI (HI, risk of failure), LTC. Ensure you provide details on parameter, significance, calculation details, root cause analysis details, actions and measures that must be taken , advice to curb further damage for provided transformer details in a tabular format',
            'what is the duval triangle 1,2,3 value for transformer 2484',
            'How is my transformer fleet - 2217, 1235, 1685 and 2484 doing?',
        ),
        start=1,
    )
}
# Logical index label -> name of the OpenSearch index that is queried.
INDEX_CONFIG = dict(Hackathon_index='innovator_hack_index')

# Connection settings for the OpenSearch endpoint; these repeat the values set in an earlier cell.
opensearch_host = 'iellhhrn6kean028im78.us-east-1.aoss.amazonaws.com'
opensearch_port = 443
opensearch_region = 'us-east-1'
opensearch_service = 'aoss'
# Name of the index that every search in this cell targets.
index_name = INDEX_CONFIG['Hackathon_index']
# Not referenced by any code visible in this notebook.
# NOTE(review): presumably the embedding vector size - confirm or remove.
dimension = 1024

# Initialize OpenSearch client
def init_opensearch_client(host, port, region, service):
    """Create an OpenSearch client whose requests are signed with AWS SigV4.

    Args:
        host: Endpoint host name (no scheme).
        port: Endpoint port.
        region: AWS region used for request signing.
        service: AWS service name used for request signing.

    Returns:
        An ``OpenSearch`` client using TLS with certificate verification.

    Raises:
        RuntimeError: If boto3 cannot find any AWS credentials.
    """
    credentials = boto3.Session().get_credentials()
    if credentials is None:
        # get_credentials() returns None when no credentials are configured;
        # fail here with a clear message instead of an AttributeError below.
        raise RuntimeError(
            "No AWS credentials found; configure credentials before creating the OpenSearch client."
        )
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)
    return OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        # NOTE(review): presumably seconds, which would make this 50 minutes - confirm intended.
        timeout=3000
    )
# Module-level clients shared by the functions below; both are created eagerly when this cell runs.
opensearch_client = init_opensearch_client(opensearch_host, opensearch_port, opensearch_region, opensearch_service)
# The embedding client reuses the OpenSearch region for its bedrock-runtime client.
embedding_model = BedrockEmbeddings(client=boto3.client("bedrock-runtime", region_name=opensearch_region), model_id="amazon.titan-embed-text-v2:0")

def load_model(model_id="amazon.titan-text-premier-v1:0", region_name="us-east-1", temperature=0, top_p=None):
    """Build a retry-wrapped Bedrock chat model.

    Called with no arguments this returns the same configuration as before
    (temperature 0, no ``top_p`` entry); the parameters only expose values
    that used to be hard-coded.

    Args:
        model_id: Bedrock model identifier handed to ``ChatBedrock``.
        region_name: AWS region used for the ``bedrock-runtime`` client.
        temperature: Sampling temperature forwarded in ``model_kwargs``.
        top_p: Optional nucleus-sampling value; left out of ``model_kwargs``
            when ``None``.

    Returns:
        The object returned by ``ChatBedrock(...).with_retry()``.
    """
    # A new Bedrock runtime client is created on every call.
    bedrock_runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name=region_name,
    )
    model_kwargs = {"temperature": temperature}
    if top_p is not None:
        model_kwargs["top_p"] = top_p
    return ChatBedrock(
        client=bedrock_runtime,
        model_id=model_id,
        model_kwargs=model_kwargs,
    ).with_retry()

def parse_keywords(content):
    """Extract the two keyword lists from a keyword-extraction model reply.

    The reply is expected to contain ``Numerical Keywords: [...]`` and
    ``Text Keywords: [...]``. A missing section, or an empty ``[]``, yields an
    empty list for that section.

    Args:
        content: Raw text returned by the model.

    Returns:
        A ``(must_keywords, should_keywords)`` tuple of lists of stripped,
        non-empty strings (numerical keywords first, text keywords second).
    """
    def _bracketed_items(label):
        # "Keywords?" also accepts a singular "Keyword:" label in the reply.
        found = re.search(rf'{label} Keywords?:\s*\[(.*?)\]', content)
        if not found:
            return []
        # "".split(',') is [''], so an empty "[]" must not turn into a list
        # holding one blank keyword; blank entries are dropped.
        return [item.strip() for item in found.group(1).split(',') if item.strip()]

    return _bracketed_items('Numerical'), _bracketed_items('Text')

def extract_keywords(query, min_id_digits=1):
    """Derive transformer IDs and domain keywords from a query without an LLM.

    Args:
        query: The user's natural-language question.
        min_id_digits: Shortest digit run accepted as a transformer ID. The
            default of 1 keeps every standalone number, so list-like text such
            as "1,2,3" also produces IDs.
            NOTE(review): the IDs visible in this notebook have four digits;
            raising this would skip such stray numbers - confirm the ID scheme.

    Returns:
        ``(must_keywords, should_keywords)``: unique ``TXID_<digits>`` strings
        in order of first appearance, and the domain keywords found in the
        query in vocabulary order.
    """
    # Domain vocabulary checked against the query (case-insensitively).
    health_keywords = [
        "oil quality", "dielectric breakdown voltage", "IFT", "acidity", "moisture content",
        "furan analysis", "DGA", "duval", "rogers ratio", "key gas", "power factor",
        "IPF", "IR", "temperature monitoring", "top oil temperature", "hot spot temperature",
        "cooling efficiency", "load history", "overload history", "partial discharge",
        "electrical testing", "FRA", "WRM", "turns ratio", "capacitance",
        "dissipation factor", "TAIM", "LEDT", "dornenburg method", "bushing health",
        "THI", "risk of failure", "LTC", "transformer", "attention", "failure", "immediate attention"
    ]

    # Standalone numbers, plus numbers already written with a TXID_ prefix.
    # Without the optional prefix the "_" before the digits suppresses the
    # word boundary and prefixed IDs would be missed.
    id_numbers = re.findall(r'\b(?:TXID_)?(\d+)\b', query, flags=re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    must_keywords = list(dict.fromkeys(
        'TXID_' + digits for digits in id_numbers if len(digits) >= min_id_digits
    ))

    should_keywords = []
    for keyword in health_keywords:
        # Whole-word matching: a plain substring test reports "THI" for "this"
        # and "IR" for "their". Acronyms must match exactly; other terms may
        # carry a plural ending so "transformers" still yields "transformer".
        plural = '' if keyword.isupper() else '(?:s|es)?'
        pattern = rf'(?<!\w){re.escape(keyword)}{plural}(?!\w)'
        if re.search(pattern, query, flags=re.IGNORECASE):
            should_keywords.append(keyword)

    return must_keywords, should_keywords

def keyword_search(opensearch_client, index_name, query, k=2):
    """Run one sub-search per keyword through ``msearch`` and pool the hits.

    When required ("must") keywords exist, each becomes a phrase query with
    the optional ("should") keywords attached as ``match`` clauses. Otherwise
    each optional keyword becomes its own phrase query in a ``should`` clause.

    Args:
        opensearch_client: Client exposing ``msearch(body=...)``.
        index_name: Index each sub-search is sent to.
        query: The user's natural-language question.
        k: Maximum number of hits requested per sub-search.

    Returns:
        A flat list of hit dicts in sub-search order; ``[]`` when no keyword
        was extracted.
    """
    must_keywords, should_keywords = extract_keywords(query)

    # Debugging aid: show what was extracted.
    print("Must Keywords:", must_keywords)
    print("Should Keywords:", should_keywords)

    if not (must_keywords or should_keywords):
        print("No keywords found for the query.")
        return []

    if must_keywords:
        optional_clauses = [
            {'match': {'content': should_keyword}}
            for should_keyword in should_keywords
        ]
        bool_queries = [
            {
                'must': [{'match_phrase': {'content': keyword}}],
                'should': optional_clauses,
                'minimum_should_match': 0,
            }
            for keyword in must_keywords
        ]
    else:
        bool_queries = [
            {
                'should': [{'match_phrase': {'content': keyword}}],
                'minimum_should_match': 0,
            }
            for keyword in should_keywords
        ]

    # msearch bodies alternate a header line and a search definition.
    msearch_body = []
    for bool_query in bool_queries:
        msearch_body.append({'index': index_name})
        msearch_body.append({'size': k, 'query': {'bool': bool_query}})

    if not msearch_body:
        print("No search body constructed, returning empty result.")
        return []

    response = opensearch_client.msearch(body=msearch_body)

    all_results = []
    for sub_response in response['responses']:
        # Sub-responses without a hits section are skipped.
        if 'hits' in sub_response and 'hits' in sub_response['hits']:
            all_results.extend(sub_response['hits']['hits'])

    return all_results

def vector_search(opensearch_client, index_name, embedding_model, query, k=2, similarity_threshold=0.75):
    """Embed the query, fetch its k nearest neighbours and drop low-scoring hits.

    Args:
        opensearch_client: Client exposing ``search(index=..., body=...)``.
        index_name: Index to query.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.
        query: Text to embed.
        k: Used both as the k-NN ``k`` and as the response ``size``.
        similarity_threshold: Hits whose ``_score`` is below this are removed.

    Returns:
        The hits of the response whose ``_score`` is at least the threshold,
        in response order.
    """
    # The query is embedded as a one-element batch; keep the single vector.
    embeddings = embedding_model.embed_documents([query])
    search_body = {
        'size': k,
        'query': {
            'knn': {
                'embedding': {'vector': embeddings[0], 'k': k},
            },
        },
    }
    reply = opensearch_client.search(index=index_name, body=search_body)

    kept_hits = []
    for hit in reply['hits']['hits']:
        if hit['_score'] >= similarity_threshold:
            kept_hits.append(hit)
    return kept_hits

def hybrid_search(opensearch_client, index_name, embedding_model, query, k=2):
    """Merge keyword and vector hits, best score first, one entry per document.

    Args:
        opensearch_client: Client passed to both search helpers.
        index_name: Index to search.
        embedding_model: Embedding model used by the vector search.
        query: The user's natural-language question.
        k: Per-search hit limit forwarded to both helpers.

    Returns:
        Hits from both searches sorted by descending ``_score``. When the same
        document (same ``_index`` and ``_id``) appears more than once, only
        its highest-scoring copy is kept.
    """
    keyword_results = keyword_search(opensearch_client, index_name, query, k)
    vector_results = vector_search(opensearch_client, index_name, embedding_model, query, k, similarity_threshold=0.75)

    # NOTE(review): keyword and k-NN scores come from different scoring
    # functions; sorting them together assumes they are comparable - confirm.
    ranked = sorted(keyword_results + vector_results, key=lambda hit: hit['_score'], reverse=True)

    # A document found by both searches (or by several keyword sub-searches)
    # would otherwise be repeated in the context handed to the LLM.
    seen_documents = set()
    unique_results = []
    for hit in ranked:
        document_key = (hit.get('_index'), hit.get('_id'))
        if document_key[1] is not None:
            if document_key in seen_documents:
                continue
            seen_documents.add(document_key)
        # Hits without an _id cannot be identified, so they are always kept.
        unique_results.append(hit)

    return unique_results

def print_query_result(query, results, search_type):
    """Render search hits as text, print the text and return it.

    Args:
        query: The user's question, echoed in the header line.
        results: Iterable of hits. A dict hit is read through its
            ``'_source'`` entry; anything else is used as the record itself.
            A record's ``'content'`` entry, when present, is parsed as a JSON
            object and listed key by key; otherwise the record's own items
            are listed. A missing ``'source'`` is shown as ``Unknown``.
        search_type: Label used in the header and per-hit lines.

    Returns:
        The rendered text (also written to stdout).
    """
    divider = "--------------------------------\n"
    parts = [f"# Query: {query} ({search_type} search)\n", divider]

    if not results:
        parts += ["No results found.\n", divider]
    else:
        for i, result in enumerate(results):
            metadata = result['_source'] if isinstance(result, dict) else result
            parts.append(f"# {search_type} search result {i+1} (relevant document chunk):\n")
            parts.append(f"Source: {metadata.get('source', 'Unknown')}\n")
            parts.append("Content:\n")
            row_content = json.loads(metadata['content']) if 'content' in metadata else metadata
            parts.extend(f"'{key}': {value}\n" for key, value in row_content.items())
            parts.append(divider)

    search_results = "".join(parts)
    print(search_results)
    return search_results


def format_output(query, results):
    """Wrap the query and each result text in the tags used for the LLM prompt.

    Args:
        query: The user's question.
        results: Iterable of already-rendered result strings.

    Returns:
        A ``<query>`` line followed by one ``<documents>`` section per result,
        with a dashed divider line between consecutive sections.
    """
    header = f"<query> # Query: {query} </query>\n"
    sections = (f"<documents>\n{chunk}\n</documents>" for chunk in results)
    return header + "\n--------------------------------\n".join(sections)


def test_queries(index_name, query, opensearch_client, embedding_model, search_type):
    """Run one retrieval strategy for a query and wrap the rendered hits.

    Args:
        index_name: Index to search.
        query: The user's question.
        opensearch_client: Client passed through to the search helpers.
        embedding_model: Embedding model used by vector and hybrid search.
        search_type: ``'keyword'``, ``'vector'`` or ``'hybrid'``. Any other
            value prints a notice and wraps an empty result.

    Returns:
        The string produced by ``format_output`` for the rendered hits.
    """
    # (search_type value, banner printed before searching, label, search callable)
    strategies = (
        ('keyword', f"\nTesting query {query} - Keyword Search:", "Keyword",
         lambda: keyword_search(opensearch_client, index_name, query)),
        ('vector', f"\nTesting query {query} - Vector Search:", "Vector",
         lambda: vector_search(opensearch_client, index_name, embedding_model, query)),
        ('hybrid', f"\nTesting query {query} - Hybrid Search:", "Hybrid",
         lambda: hybrid_search(opensearch_client, index_name, embedding_model, query)),
    )

    rendered = ""
    for mode, banner, label, run_search in strategies:
        if search_type == mode:
            print(banner)
            rendered = print_query_result(query, run_search(), label)
            break
    else:
        print(f"Unknown search type: {search_type}")

    return format_output(query, [rendered])

# Retrieve context for the question with hybrid search; the result is the formatted
# <query>/<documents> string returned by test_queries, not the raw hits.
search_result = test_queries(index_name, "Which transformer is needing an immediate attention?", opensearch_client, embedding_model, 'hybrid')

# Load the system prompt text from prompt_cot.txt (path is relative to the working directory).
with open("prompt_cot.txt", 'r') as file:
        system_prompt = file.read()

def generate_response(model, user_prompt, system_prompt):
    """Fill the System/Human/AI template and send it to the model.

    Args:
        model: Object exposing ``invoke(input=...)``.
        user_prompt: Retrieved context inserted into the ``Human:`` turn.
        system_prompt: Text inserted into the ``System:`` turn.

    Returns:
        Whatever ``model.invoke`` returns, unmodified.
    """
    # The template keeps its in-function indentation as part of the prompt text.
    filled_template = f"""
    System: {system_prompt}
    
    Human: Based on the search results from both keyword and semantic search, here is the relevant information: {user_prompt}.
    
    AI: Please provide a summary of the transformer health and status, including any critical issues or necessary actions.
    """
    return model.invoke(input=filled_template)


def clean_response(raw_response):
    """Return the response's ``content`` text without surrounding whitespace.

    Args:
        raw_response: Object with a string ``content`` attribute.

    Returns:
        ``raw_response.content`` with leading and trailing whitespace removed.
    """
    return raw_response.content.strip()

model = load_model()

query = "Which transformer is needing an immediate attention?"

# Validate search results.
# test_queries() always returns a non-empty string (format_output adds the <query>
# wrapper even when nothing was retrieved), so a plain truthiness check never
# fires. An empty retrieval is recognised by the "No results found." line that
# print_query_result writes in that case.
if not search_result or "No results found." in search_result:
    print("No valid results found from the knowledge base.")
else:
    user_prompt = search_result
    response = generate_response(model, user_prompt, system_prompt)
    cleaned_response = clean_response(response)
    print(cleaned_response)


Testing query Which transformer is needing an immediate attention? - Hybrid Search:
Must Keywords: []
Should Keywords: ['transformer', 'attention', 'immediate attention']
# Query: Which transformer is needing an immediate attention? (Hybrid search)
--------------------------------
# Hybrid search result 1 (relevant document chunk):
Source: health_index_augdata.csv
Content:
'Hydrogen': 106
'Oxigen': 581
'Nitrogen': 75200
'Methane': 47
'CO': 760
'CO2': 4070
'Ethylene': 11
'Ethane': 30
'Acethylene': 0
'DBDS': 183.0
'Power factor': 0.16
'Interfacial V': 43
'Dielectric rigidity': 29
'Water content': 5
'Health index': 58.3
'Life expectation': 6.1
'CO_H2_ratio': 7.169811320754717
'CH4_H2_ratio': 0.4433962264150943
'C2H4_H2_ratio': 0.1037735849056603
'C2H2_H2_ratio': 0.0
'H2_N2_ratio': 0.0014095744680851
'O2_N2_ratio': 0.0077260638297872
'H2_CO2_ratio': 0.026044226044226
'TransformerID': TxID_4748
'InstallationDate': 2013-02-23
'MaintenanceSchedule': 2022-10-02
'ReplacementHistory': No major 