In [1]:
import os
import openai
import requests
import re
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph
from langchain.document_loaders import TextLoader
from neo4j import GraphDatabase
from langchain.vectorstores import Neo4jVector

In [2]:
# Project root: start at the parent of the working directory and, when that path
# does not already mention the project folder, step down into it.
PARENT_PATH = Path.cwd().parent
PARENT_PATH = (
    PARENT_PATH
    if 'publishingchatgptpocweb' in str(PARENT_PATH)
    else PARENT_PATH.joinpath('publishingchatgptpocweb')
)

# All data lives under <project root>/data.
DATA_DIRECTORY = PARENT_PATH.joinpath('data')

# Sub-directories read by the loaders defined further down in this notebook.
JATS_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('raw')
ARTICLES_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('processed', 'articles')
CONCEPTS_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('processed', 'concepts')

In [3]:
# Take the OpenAI key from the 'Open__AI__API__Secret' environment variable
# (None when the variable is unset) and hand it to the openai client module.
OPEN_AI_API_SECRET = os.environ.get('Open__AI__API__Secret')
openai.api_key = OPEN_AI_API_SECRET

In [17]:
def update_article_metadata(document):
    """Copy the labelled header values of an article into ``document.metadata``.

    The function looks for ``Label: value`` lines in ``document.page_content``
    and stores what it finds under these metadata keys:

    * ``title``            <- ``Title:``
    * ``pan``              <- ``PAN:``
    * ``source``           <- ``Article Link/URL/Source:``
    * ``isbn``             <- ``ISBN:``
    * ``publishing_date``  <- ``Year:`` / ``Month:`` / ``Day:`` combined as
      ``YYYY-MM-DD`` (only when all three are present)

    ``document_type`` is always set to ``'article'``.  Values are stripped of
    surrounding whitespace, and a label whose value is empty is skipped so the
    existing metadata entry (if any) is left untouched.

    Args:
        document: object exposing ``page_content`` (str) and ``metadata`` (dict).

    Returns:
        The same ``document`` object, mutated in place.
    """
    content = document.page_content

    def _field(label, value_pattern=r'.*'):
        # `[^\S\n]*` skips spaces/tabs but never a newline.  The previous `\s*`
        # crossed line breaks, so an empty field (e.g. "Title:" followed directly
        # by "PAN: ...") captured the *next* line as its value.
        match = re.search(
            rf'{re.escape(label)}:[^\S\n]*({value_pattern})[^\S\n]*(?:\n|$)',
            content,
        )
        return match.group(1).strip() if match else ''

    document.metadata['document_type'] = 'article'

    # (metadata key, label in the article text)
    text_fields = (
        ('title', 'Title'),
        ('pan', 'PAN'),
        ('source', 'Article Link/URL/Source'),
        ('isbn', 'ISBN'),
    )
    for metadata_key, label in text_fields:
        value = _field(label)
        if value:
            document.metadata[metadata_key] = value

    # The date is assembled from its numeric parts; the free-text
    # "Publishing Date:" line was searched before but its result was never used.
    day = _field('Day', r'\d{1,2}')
    month = _field('Month', r'\d{1,2}')
    year = _field('Year', r'\d{4}')
    if day and month and year:
        document.metadata['publishing_date'] = f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    return document

def update_concepts_metadata(document):
    """Copy a thesaurus concept's name and uri into ``document.metadata``.

    ``document_type`` is always set to ``'concept'``.  When the text contains a
    ``Thesaurus Concept:`` / ``Concept:`` header, the lines up to the next
    ``Broader Concept:`` / ``Narrower Concepts:`` / ``Related Concepts:`` header
    (or the end of the text) are searched for ``name:`` and ``uri:``:

    * ``name`` is stored under both ``name`` and ``title``
    * ``uri`` is stored under both ``uri`` and ``source``

    Values are stripped of surrounding whitespace; an empty value is skipped.

    Args:
        document: object exposing ``page_content`` (str) and ``metadata`` (dict).

    Returns:
        The same ``document`` object, mutated in place.
    """
    document.metadata['document_type'] = 'concept'

    # The section ends at the next relation header, or at the end of the text.
    # `\n?\Z` also accepts text that stops right after the last detail line
    # without a trailing newline, which the previous pattern rejected.
    concept_section = re.search(
        r'Thesaurus Concept:\n\s*Concept:\n(.*?)'
        r'(?:\n\s*(?:Broader Concept|Narrower Concepts|Related Concepts):|\n?\Z)',
        document.page_content,
        re.DOTALL,
    )
    if concept_section is None:
        return document

    concept_details = concept_section.group(1)

    def _value(label):
        # `[^\S\n]*` never crosses a line break, so an empty "name:" cannot pick
        # up the following "uri: ..." line as its value (the old `\s*` did).
        match = re.search(rf'{label}:[^\S\n]*(.*)', concept_details)
        return match.group(1).strip() if match else ''

    name = _value('name')
    if name:
        document.metadata['name'] = name
        document.metadata['title'] = name

    uri = _value('uri')
    if uri:
        document.metadata['uri'] = uri
        document.metadata['source'] = uri

    return document



def add_document_to_neo4j_vector_db(existing_index, data_directory, type='article'):
    """Load every ``.txt`` file under ``data_directory`` into ``existing_index``.

    Each file is read with ``TextLoader`` (UTF-8); the first loaded document is
    enriched with article or concept metadata according to ``type`` and the
    documents are passed to ``existing_index.add_documents``.  A file that fails
    is reported on stdout and skipped so the remaining files are still processed.

    Args:
        existing_index: vector index object exposing ``add_documents(documents)``.
        data_directory: directory walked recursively for ``.txt`` files.
        type: ``'article'``/``'articles'`` or ``'concept'``/``'concepts'``.  Any
            other value adds the documents without metadata enrichment.

    Returns:
        ``existing_index`` (the same object that was passed in).
    """
    # Singular and plural spellings are both accepted: the default value is
    # 'article', while the call sites in this notebook pass 'articles' and
    # 'concepts'.  Previously the default matched neither branch.
    metadata_updaters = {
        'article': update_article_metadata,
        'articles': update_article_metadata,
        'concept': update_concepts_metadata,
        'concepts': update_concepts_metadata,
    }
    update_metadata = metadata_updaters.get(type)

    for root_dir, _sub_dirs, files in os.walk(data_directory):
        for file_name in files:
            if not file_name.endswith('.txt'):
                continue
            try:
                text_file_path = os.path.join(root_dir, file_name)
                documents = TextLoader(text_file_path, encoding='utf8').load()
                if update_metadata is not None:
                    documents[0] = update_metadata(documents[0])
                existing_index.add_documents(documents)
            except Exception as error:
                # Best-effort load: report the cause and continue.  `Exception`
                # (not a bare except) lets Ctrl-C interrupt a long import.
                print('error adding document', file_name, error)
    return existing_index

def create_vector_data_from_processed_documents():
    """Build the Neo4j vector index and load the processed articles and concepts.

    Steps, in order:
      1. Load ``test.txt`` from ``JATS_DATA_DIRECTORY_PATH`` and pass it to
         ``Neo4jVector.from_documents`` with the node/property names used by
         this project.
      2. Open the same index with ``Neo4jVector.from_existing_index``.
      3. Add every processed article, then every processed concept, through
         ``add_document_to_neo4j_vector_db``.

    Connection settings come from the ``Local__Neo4J__*`` environment variables.
    Returns nothing.
    """
    # NOTE(review): `embeddings` is a module-level name that is not assigned in the
    # cells visible here - confirm it is defined before this function runs.
    seed_documents = TextLoader(os.path.join(JATS_DATA_DIRECTORY_PATH, 'test.txt')).load()

    # Both Neo4jVector calls talk to the same database and index.
    connection = dict(
        url=os.getenv('Local__Neo4J__URL'),
        username=os.getenv('Local__Neo4J__UserName'),
        password=os.getenv('Local__Neo4J__Password'),
        database=os.getenv('Local__Neo4J__Database'),
        index_name=os.getenv('Local__Neo4J__PrimaryIndexName'),
    )

    Neo4jVector.from_documents(
        seed_documents,
        embeddings,
        node_label="PublishingDataChunk",  # Chunk by default
        text_node_property="info",  # text by default
        embedding_node_property="vector",  # embedding by default
        create_id_index=True,  # True by default
        pre_delete_collection=False,  # False by default
        **connection,
    )

    index = Neo4jVector.from_existing_index(
        embeddings,
        text_node_property="info",
        **connection,
    )

    sources = (
        (ARTICLES_DATA_DIRECTORY_PATH, 'articles'),
        (CONCEPTS_DATA_DIRECTORY_PATH, 'concepts'),
    )
    for directory, kind in sources:
        index = add_document_to_neo4j_vector_db(index, directory, kind)






def create_relationships_in_neo4j_db(include_article_relationships=False):
    """Create relationships between the nodes already stored in Neo4j.

    Concept nodes (``document_type = 'concept'``) are read once; for each one the
    broader / narrower / related concept names are parsed from its ``info`` text
    and a relationship is merged from the named concept to the current one:

    * ``(broader)-[:BROADER_TERM_FOR]->(concept)``
    * ``(narrower)-[:NARROWER_TERM_FOR]->(concept)``
    * ``(related)-[:RELATED_TERM]->(concept)``

    Names that do not correspond to a known concept node are ignored.  Every
    attempt is reported on stdout as ``DONE`` or ``ERROR`` (with the exception),
    and a failed write does not stop the run.

    Args:
        include_article_relationships: when true, article nodes are also read
            and linked to the concepts listed in their vocabulary sections
            (``HAS_PREFERRED_TERM`` / ``HAS_ORGANISM_TERM`` /
            ``HAS_GEOGRAPHIC_TERM``).  Off by default, which matches the
            previous behaviour where that step was commented out.

    Returns:
        None.
    """
    # NOTE(review): neo4j_url / neo4j_username / neo4j_password / neo4j_database are
    # module-level names that are not assigned in the cells visible here - confirm
    # they are defined before this function is called.
    # The driver is used as a context manager so its connections are released even
    # when a query raises; previously it was never closed.
    with GraphDatabase.driver(neo4j_url, auth=(neo4j_username, neo4j_password)) as driver:
        with driver.session(database=neo4j_database) as session:
            all_concept_nodes = session.execute_read(get_nodes, 'concept')

            # Concept name -> node, for constant-time lookup of referenced concepts.
            concept_name_to_node = {
                record['n'].get('name'): record['n']
                for record in all_concept_nodes
                if record['n'].get('name')
            }

            if include_article_relationships:
                all_articles_nodes = session.execute_read(get_nodes, 'article')
                _link_articles_to_concepts(session, all_articles_nodes, concept_name_to_node)

            for record in all_concept_nodes:
                concept_node = record['n']
                concept_details = concept_node.get('info')
                # Skip nodes that cannot be processed instead of raising KeyError.
                if 'name' not in concept_node or concept_details is None:
                    continue

                broader, narrower, related = _parse_linked_concept_names(concept_details)
                _merge_concept_links(session, broader, concept_name_to_node, concept_node, 'BROADER_TERM_FOR')
                _merge_concept_links(session, narrower, concept_name_to_node, concept_node, 'NARROWER_TERM_FOR')
                _merge_concept_links(session, related, concept_name_to_node, concept_node, 'RELATED_TERM')

                print()
                print('---------------')
                print()


def _parse_linked_concept_names(concept_details):
    """Return ``(broader, narrower, related)`` name lists parsed from a concept text."""
    broader_match = re.search(
        r'Broader Concept:\s*\n*\s*name: (.*?)(?:\n|, uri|$)', concept_details
    )
    broader = [broader_match.group(1)] if broader_match else []

    # A section ends at the next relation header.  The broader header is singular
    # ("Broader Concept:"), so the plural-only terminator used before never matched
    # and a section placed ahead of it absorbed the broader concept's name.
    narrower_section = re.search(
        r'Narrower Concepts:([\s\S]*?)(?:Broader Concepts?:|Related Concepts:|$)',
        concept_details,
    )
    related_section = re.search(
        r'Related Concepts:([\s\S]*?)(?:Broader Concepts?:|Narrower Concepts:|$)',
        concept_details,
    )

    name_pattern = r'name: (.*?)(?:, uri|\n|$)'
    narrower = re.findall(name_pattern, narrower_section.group(1)) if narrower_section else []
    related = re.findall(name_pattern, related_section.group(1)) if related_section else []
    return broader, narrower, related


def _merge_concept_links(session, linked_names, concept_name_to_node, concept_node, relationship_type):
    """Merge ``(linked)-[relationship_type]->(concept_node)`` for every known linked name."""
    for linked_name in linked_names:
        linked_node = concept_name_to_node.get(linked_name)
        if not linked_node:
            continue
        try:
            session.execute_write(create_concept_relationship, linked_node, concept_node, relationship_type)
            print(linked_node['name'], relationship_type, concept_node['name'], 'DONE')
        except Exception as e:
            print(linked_node['name'], relationship_type, concept_node['name'], 'ERROR')
            print(e)


def _link_articles_to_concepts(session, article_records, concept_name_to_node):
    """Merge article -> concept relationships from each article's vocabulary sections."""
    vocab_type_to_relationship_name = {
        'preferred_terms': 'HAS_PREFERRED_TERM',
        'organism_terms': 'HAS_ORGANISM_TERM',
        'geographic_terms': 'HAS_GEOGRAPHIC_TERM',
    }
    for record in article_records:
        article_node = record['n']
        vocab_sections = extract_vocab_sections_from_article_content(article_node['info'])

        for vocab_type, concept_names in vocab_sections.items():
            relationship_name = vocab_type_to_relationship_name.get(vocab_type)
            for concept_name in concept_names:
                concept_node = concept_name_to_node.get(concept_name)
                if not (concept_node and relationship_name):
                    continue
                try:
                    session.execute_write(
                        create_article_concept_relationship, article_node, concept_node, relationship_name
                    )
                    print(article_node.get('pan'), relationship_name, concept_node['name'], 'DONE')
                except Exception as e:
                    print(article_node.get('pan'), relationship_name, concept_node['name'], 'ERROR')
                    print(e)

        print()
        print('---------------')
        print()


def get_nodes(tx, document_type):
    """Return every node whose ``document_type`` property equals ``document_type``.

    Args:
        tx: transaction object exposing ``run(query, **parameters)``.
        document_type: value compared against ``n.document_type``.

    Returns:
        The result of ``.data()`` on the query result; callers in this notebook
        read each node from the ``'n'`` key of every returned record.
    """
    # The value is sent as a Cypher parameter rather than interpolated into the
    # query text, so quotes in it cannot break or alter the statement.  Keyword
    # arguments to `tx.run` are query parameters, which is why the former
    # `database=...` argument (an unused parameter) is no longer passed.
    result = tx.run(
        "MATCH (n) WHERE n.document_type = $document_type RETURN n",
        document_type=document_type,
    )
    return result.data()

def create_article_concept_relationship(tx, start_node, end_node, relationship_type):
    """Merge ``(article)-[:relationship_type]->(concept)``.

    The article is matched on its ``pan`` property and the concept on its
    ``name`` property.

    Args:
        tx: transaction object exposing ``run(query, **parameters)``.
        start_node: mapping with a ``'pan'`` entry (the article).
        end_node: mapping with a ``'name'`` entry (the concept).
        relationship_type: relationship type; letters, digits and underscores only.

    Raises:
        ValueError: if ``relationship_type`` is not a plain identifier.
    """
    # Relationship types cannot be passed as Cypher parameters, so the value is
    # validated before being placed into the query text.
    if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', relationship_type):
        raise ValueError(f'invalid relationship type: {relationship_type!r}')

    # Property values are sent as parameters: a concept name containing an
    # apostrophe used to terminate the quoted literal and make the query fail.
    query = (
        "MATCH (a {pan: $pan}) "
        "MATCH (b {name: $name}) "
        f"MERGE (a)-[r:{relationship_type}]->(b)"
    )
    tx.run(query, pan=start_node['pan'], name=end_node['name'])

def create_concept_relationship(tx, start_node, end_node, relationship_type):
    """Merge ``(start)-[:relationship_type]->(end)`` between two concept nodes.

    Both nodes are matched on their ``name`` property.

    Args:
        tx: transaction object exposing ``run(query, **parameters)``.
        start_node: mapping with a ``'name'`` entry (relationship source).
        end_node: mapping with a ``'name'`` entry (relationship target).
        relationship_type: relationship type; letters, digits and underscores only.

    Raises:
        ValueError: if ``relationship_type`` is not a plain identifier.
    """
    # Relationship types cannot be passed as Cypher parameters, so the value is
    # validated before being placed into the query text.
    if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', relationship_type):
        raise ValueError(f'invalid relationship type: {relationship_type!r}')

    # Names are sent as parameters: a name containing an apostrophe used to
    # terminate the quoted literal and make the query fail.
    query = (
        "MATCH (a {name: $start_name}) "
        "MATCH (b {name: $end_name}) "
        f"MERGE (a)-[r:{relationship_type}]->(b)"
    )
    tx.run(query, start_name=start_node['name'], end_name=end_node['name'])

   
def extract_vocab_sections_from_article_content(article_content):
    """Collect the vocabulary terms listed in an article's text.

    Three sections are looked up by their header line: preferred terms,
    organism terms and geographic terms.  The lines found under a header are
    returned with leading ``-``/space characters and surrounding whitespace
    removed; blank lines are dropped.

    Args:
        article_content: full article text.

    Returns:
        dict with the keys ``'preferred_terms'``, ``'organism_terms'`` and
        ``'geographic_terms'``, each mapped to a list of term strings (empty
        when the section is not found).
    """
    # (result key, regex whose first group captures the section body)
    section_patterns = (
        (
            'preferred_terms',
            r'Preferred Terms \(Non-geographic, Non-organism contents from thesaurus\):\s*\n(.*?)(?:\s*(?:Organism Terms|Geographic Terms|$))',
        ),
        (
            'organism_terms',
            r'Organism Terms \(Organism names from thesaurus\):\s*\n(.*?)(?:\s*(?:Preferred Terms|Geographic Terms|$))',
        ),
        (
            'geographic_terms',
            r'Geographic Terms \(Geographic Tags indicating content about a place\):\s*\n(.*?)(?:\s*\n{2,}|$)',
        ),
    )

    extracted = {}
    for key, regex in section_patterns:
        found = re.search(regex, article_content, re.DOTALL)
        if found is None:
            extracted[key] = []
            continue
        body_lines = found.group(1).split('\n')
        extracted[key] = [line.lstrip('- ').strip() for line in body_lines if line.strip()]
    return extracted


In [None]:
# Vector-index build is left disabled in this cell; uncomment the next line to run it.
# create_vector_data_from_processed_documents()
# Create the relationships between the nodes stored in Neo4j (progress is printed below).
create_relationships_in_neo4j_db()


---------------


---------------

buildings BROADER_TERM_FOR abattoirs DONE
slaughter RELATED_TERM abattoirs DONE
meat production RELATED_TERM abattoirs DONE
meat hygiene RELATED_TERM abattoirs DONE

---------------

cattle BROADER_TERM_FOR Aberdeen-Angus DONE

---------------

stomach BROADER_TERM_FOR abomasum DONE

---------------

reproductive disorders BROADER_TERM_FOR abortion DONE
reproduction RELATED_TERM abortion DONE
infertility RELATED_TERM abortion DONE
fetal death RELATED_TERM abortion DONE
brucellosis RELATED_TERM abortion DONE
birth RELATED_TERM abortion DONE

---------------

optical properties BROADER_TERM_FOR absorbance DONE

---------------

sorption BROADER_TERM_FOR absorption DONE
adsorption RELATED_TERM absorption DONE
uptake RELATED_TERM absorption DONE
nutrient uptake RELATED_TERM absorption DONE
desorption RELATED_TERM absorption DONE

---------------

pesticides BROADER_TERM_FOR acaricides DONE
ectoparasiticides RELATED_TERM acaricides DONE
tick control RELAT

In [None]:
# driver = GraphDatabase.driver(neo4j_url, auth=(neo4j_username, neo4j_password))    
# with driver.session(database=neo4j_database) as session:
#     all_articles_nodes = session.execute_read(get_nodes,'article')
#     all_concept_nodes = session.execute_read(get_nodes,'concept')

In [None]:
# # Creating a mapping of concept names to nodes for efficient lookup
# concept_name_to_node = {node['n'].get('name'): node['n'] for node in all_concept_nodes if node['n'].get('name')}


# with driver.session(database=neo4j_database) as session:

#     # Mapping of vocab type to relationship name
#     vocab_type_to_relationship_name = {
#         'preferred_terms': 'HAS_PREFERRED_TERM',
#         'organism_terms': 'HAS_ORGANISM_TERM',
#         'geographic_terms': 'HAS_GEOGRAPHIC_TERM',
#     }
    
#     # Step 2: Create Article to Concept relationships
#     for article_node in all_articles_nodes:
#         article_node= article_node['n']
#         article_content = article_node['info']
        
#         vocab_sections = extract_vocab_sections_from_article_content(article_content)
       
#         for vocab_type, concepts in vocab_sections.items():
#             relationship_name = vocab_type_to_relationship_name.get(vocab_type)
#             for concept_name in concepts:
#                 concept_node = concept_name_to_node.get(concept_name)
#                 if concept_node and relationship_name:                    
#                     try:
#                         session.execute_write(create_article_concept_relationship, article_node, concept_node, relationship_name)
#                         print(article_node['pan'], relationship_name, concept_node['name'], 'DONE')                        
#                     except:
#                         print(article_node['pan'], relationship_name, concept_node['name'], 'ERROR')        
#         # if article_node['n']['pan']=='20203180098':
#         #     break
    
#         print()
#         print('---------------')
#         print()


#     # Step 3: Create Concept to Concept relationships
#     for concept_node in all_concept_nodes:
#         concept_details = concept_node['n']['info']  # Adjust field name based on your data structure
#         print(concept_node['n']['name'])
        
#         # Identify broader, narrower, and related concepts using regex
#         broader_concept_section = re.search(r'Broader Concept:\s*\n*\s*name: (.*?)(?:\n|, uri)', concept_details)
#         if broader_concept_section:
#             broader_concepts = [broader_concept_section.group(1)]
#         else:
#             broader_concepts = []
        
#         narrower_concept_section = re.search(r'Narrower Concepts:([\s\S]*?)(?:Broader Concepts:|Related Concepts:|$)', concept_details)
#         if narrower_concept_section:
#             narrower_concepts = re.findall(r'name: (.*?)(?:, uri|\n|$)', narrower_concept_section.group(1))
#         else:
#             narrower_concepts = []
        
#         related_concept_section = re.search(r'Related Concepts:([\s\S]*?)(?:Broader Concepts:|Narrower Concepts:|$)', concept_details)
#         if related_concept_section:
#             related_concepts = re.findall(r'name: (.*?)(?:, uri|\n|$)', related_concept_section.group(1))
#         else:
#             related_concepts = []
    
#         # if concept_node['n']['name']=='Ebolavirus':
#         #     break
        
#         # Create relationships with other concept nodes based on broader, narrower, and related concepts
#         for broader_concept_name in broader_concepts:
#             broader_concept_node = concept_name_to_node.get(broader_concept_name)
#             if broader_concept_node:
#                 try:
#                     session.execute_write(create_concept_relationship, broader_concept_node, concept_node, 'BROADER_TERM_FOR')
#                     print(broader_concept_node['name'], 'BROADER_TERM_FOR', concept_node['n']['name'], 'DONE')                    
#                 except:
#                     print(broader_concept_node['name'], 'BROADER_TERM_FOR', concept_node['n']['name'], 'ERROR')  
    
#         for narrower_concept_name in narrower_concepts:
#             narrower_concept_node = concept_name_to_node.get(narrower_concept_name)
#             if narrower_concept_node:
#                 try:
#                     session.execute_write(create_concept_relationship, narrower_concept_node, concept_node, 'NARROWER_TERM_FOR')
#                     print(narrower_concept_node['name'], 'NARROWER_TERM_FOR', concept_node['n']['name'], 'DONE')                    
#                 except:
#                     print(narrower_concept_node['name'], 'NARROWER_TERM_FOR', concept_node['n']['name'], 'ERROR')
    
#         for related_concept_name in related_concepts:
#             related_concept_node = concept_name_to_node.get(related_concept_name)
#             if related_concept_node:
#                 try:
#                     session.execute_write(create_concept_relationship, related_concept_node, concept_node, 'RELATED_TERM')
#                     print(related_concept_node['name'], 'RELATED_TERM', concept_node['n']['name'], 'DONE')                    
#                 except:
#                     print(related_concept_node['name'], 'RELATED_TERM', concept_node['n']['name'], 'ERROR')
    
#         print()
#         print('---------------')
#         print()