In [1]:
import os
import pandas as pd
import glob
from groq import Groq

## Normalization

In [None]:
# Read the Groq API key from the environment instead of embedding it in the notebook.
# NOTE(review): a literal key was previously committed on this line; treat it as leaked
# and revoke/rotate it. Export GROQ_API_KEY before starting the kernel.
relationship_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def generate_prompt_gene_normalize(gene_entity):
    """Build the prompt pair used to ask the model for an HGNC symbol.

    Args:
        gene_entity: The entity text; it is interpolated into the user prompt.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple of strings. The user prompt instructs
        the model to answer with the symbol only, or with "doesntlooklikegene".
    """
    # The prompt wording is runtime data sent to the model; it is kept verbatim.
    system_message = """
    You are a biologist who are very familiar with gene names and their official hgnc symbol.
    """
    user_message = f"""
    I will provide you with an entity that is likely a gene.
    Entity: {gene_entity}

    For this gene, output its official hgnc symbol only, with no additional texts!

    Important rules to follow:
    - If this entity does not look like a gene, then output "doesntlooklikegene" !! don't add new texts.
    - Some provided entities are the full name of a gene without space, so be careful.

    **Remember:**
    Output only the official hgnc symbol, don't add new texts!!
    Examples: (gene_entity=fcgammareceptor3a)
    - FCGR3A
    Examples: (gene_entity=tumorprogression)
    - doesntlooklikegene
    """
    return system_message, user_message

def get_gene_synonyms(gene_entity):
    """Query the Groq chat model for the symbol of ``gene_entity``.

    The prompts are produced by ``generate_prompt_gene_normalize`` and sent through the
    module-level ``relationship_client``. Only the first line of the reply is used:
    leading '-' and space characters are removed, surrounding whitespace is trimmed and
    the text is upper-cased (so the "doesntlooklikegene" sentinel also comes back in
    upper case).

    Args:
        gene_entity: Entity text to normalize.

    Returns:
        The cleaned, upper-cased first line of the model reply.
    """
    system_text, user_text = generate_prompt_gene_normalize(gene_entity)
    chat_messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    completion = relationship_client.chat.completions.create(
        messages=chat_messages,
        model="llama3-groq-70b-8192-tool-use-preview",
        max_tokens=8000,
        temperature=0,
    )
    reply_text = completion.choices[0].message.content
    # Only element 0 of the cleaned lines is ever returned, so clean just that line.
    first_line = reply_text.strip().split('\n')[0]
    return first_line.lstrip('- ').strip().upper()

def remove_single_character_entries(df):
    """Keep only rows whose 'Outgoing' and 'Incoming' values are longer than one character.

    Rows are dropped when either column holds a single character or an empty string.
    Missing values are dropped too, because their string length does not compare as
    greater than 1.

    Args:
        df: DataFrame with string columns 'Outgoing' and 'Incoming'.

    Returns:
        A new, independent DataFrame (original index labels preserved). Returning an
        explicit copy lets callers add columns to the result without pandas raising
        SettingWithCopyWarning or touching ``df``.
    """
    long_enough = (df['Outgoing'].str.len() > 1) & (df['Incoming'].str.len() > 1)
    return df[long_enough].copy()

def normalize_gene_dataframe(df):
    """Add 'Normalized_Outgoing' and 'Normalized_Incoming' columns to ``df``.

    For each row, the 'Outgoing' (resp. 'Incoming') value is passed to
    ``get_gene_synonyms`` when the matching 'outgoing_source' (resp. 'incoming_source')
    value is 'gene' or 'unknown'; otherwise the value is copied unchanged.

    Each distinct entity is looked up at most once per call, so repeated entities do
    not trigger repeated model requests. An empty frame is handled (the two columns
    are simply added empty).

    Args:
        df: DataFrame with columns 'Outgoing', 'Incoming', 'outgoing_source' and
            'incoming_source'. It is modified in place.

    Returns:
        The same ``df`` object, with the two normalized columns added.
    """
    gene_like_sources = ('gene', 'unknown')
    resolved = {}  # entity -> normalized symbol, shared by both columns for this call

    def conditional_normalize(value, source):
        if source not in gene_like_sources:
            return value
        if value not in resolved:
            resolved[value] = get_gene_synonyms(value)
        return resolved[value]

    # Iterating the columns directly (instead of df.apply(axis=1)) keeps the result a
    # plain list of scalars even when the frame has no rows.
    for column in ('Outgoing', 'Incoming'):
        sources = df[f'{column.lower()}_source']
        df[f'Normalized_{column}'] = [
            conditional_normalize(value, source)
            for value, source in zip(df[column], sources)
        ]
    return df


In [2]:
import pandas as pd
import glob
import os
from functools import lru_cache

# Read the Groq API key from the environment instead of embedding it in the notebook.
# NOTE(review): a literal key was previously committed on this line; treat it as leaked
# and revoke/rotate it. Export GROQ_API_KEY before starting the kernel.
relationship_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def generate_prompt_gene_normalize(gene_entity):
    """Return the ``(system_prompt, user_prompt)`` strings for gene-symbol normalization.

    ``gene_entity`` is inserted into the user prompt, which asks the model to reply
    with the official HGNC symbol only, or with "doesntlooklikegene" when the entity
    does not look like a gene.
    """
    # Prompt text is sent to the model as-is, so its wording is left untouched.
    persona_text = """
    You are a biologist who are very familiar with gene names and their official hgnc symbol.
    """
    request_text = f"""
    I will provide you with an entity that is likely a gene.
    Entity: {gene_entity}

    For this gene, output its official hgnc symbol only, with no additional texts!

    Important rules to follow:
    - If this entity does not look like a gene, then output "doesntlooklikegene" !! don't add new texts.
    - Some provided entities are the full name of a gene without space, so be careful.

    **Remember:**
    Output only the official hgnc symbol, don't add new texts!!
    Examples: (gene_entity=fcgammareceptor3a)
    - FCGR3A
    Examples: (gene_entity=tumorprogression)
    - doesntlooklikegene
    """
    return (persona_text, request_text)

@lru_cache(maxsize=None)
def get_gene_synonyms(gene_entity):
    """Return the model's symbol for ``gene_entity``, memoized per distinct argument.

    Prompts come from ``generate_prompt_gene_normalize`` and the request goes through
    the module-level ``relationship_client``. The first line of the reply is taken,
    stripped of leading '-' / space characters and surrounding whitespace, and
    upper-cased. Because of ``lru_cache``, a repeated ``gene_entity`` is answered from
    the in-memory cache without a new request.
    """
    sys_msg, usr_msg = generate_prompt_gene_normalize(gene_entity)
    request = dict(
        messages=[
            dict(role="system", content=sys_msg),
            dict(role="user", content=usr_msg),
        ],
        model="llama3-groq-70b-8192-tool-use-preview",
        max_tokens=8000,
        temperature=0,
    )
    completion = relationship_client.chat.completions.create(**request)
    raw_reply = completion.choices[0].message.content
    # Everything after the first newline is ignored.
    head, _, _ = raw_reply.strip().partition('\n')
    return head.lstrip('- ').strip().upper()

def remove_single_character_entries(df):
    """Return the rows of ``df`` where both 'Outgoing' and 'Incoming' exceed one character.

    A row is removed if either value is a single character or an empty string; rows
    with a missing value in either column are removed as well, since a missing length
    never compares as greater than 1.

    The result is an explicit copy, so adding columns to it later neither emits
    SettingWithCopyWarning nor affects the frame passed in.
    """
    outgoing_ok = df['Outgoing'].str.len() > 1
    incoming_ok = df['Incoming'].str.len() > 1
    return df.loc[outgoing_ok & incoming_ok].copy()

def normalize_gene_dataframe(df, unique_outgoing=None, unique_incoming=None):
    """Add 'Normalized_Outgoing' / 'Normalized_Incoming' columns via per-value lookup.

    A value -> normalized-value mapping is built once per column and applied with
    ``Series.map``. A value is sent to ``get_gene_synonyms`` when the source recorded
    on the FIRST row where it occurs ('outgoing_source' / 'incoming_source') is 'gene'
    or 'unknown'; otherwise it maps to itself.

    Args:
        df: DataFrame with columns 'Outgoing', 'Incoming', 'outgoing_source' and
            'incoming_source'. It is modified in place.
        unique_outgoing: Values of 'Outgoing' to normalize. Defaults to
            ``df['Outgoing'].unique()``.
        unique_incoming: Values of 'Incoming' to normalize. Defaults to
            ``df['Incoming'].unique()``.

    Returns:
        The same ``df`` object with the two normalized columns added. Column values
        missing from the supplied ``unique_*`` collections map to NaN.
    """
    gene_like_sources = ('gene', 'unknown')

    def build_mapping(column, unique_values):
        if unique_values is None:
            unique_values = df[column].unique()
        # One pass over the frame records the source of the first row for each value,
        # instead of rescanning the whole frame for every unique value.
        first_source = {}
        for value, source in zip(df[column], df[f'{column.lower()}_source']):
            first_source.setdefault(value, source)
        # Values that do not occur in the frame have no source and pass through as-is.
        return {
            value: (
                get_gene_synonyms(value)
                if first_source.get(value) in gene_like_sources
                else value
            )
            for value in unique_values
        }

    normalized_outgoing = build_mapping('Outgoing', unique_outgoing)
    normalized_incoming = build_mapping('Incoming', unique_incoming)

    df['Normalized_Outgoing'] = df['Outgoing'].map(normalized_outgoing)
    df['Normalized_Incoming'] = df['Incoming'].map(normalized_incoming)

    return df


In [None]:
# Normalize every "*relationship.csv" file in folder_path and write the result back to
# the same path (the input files are overwritten).
noname = 0  # number of files skipped because expected columns are missing
folder_path = "/Volumes/she4/knowledgegraph/result/extracted_entities_from_Macrophage_abstracts"
# glob returns paths in arbitrary order; sorting makes the run order stable so that
# start_index refers to the same files when an interrupted run is resumed.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*relationship.csv")))
start_index = 0  # 0-based offset into csv_files; raise it to resume part-way through
required_columns = ("Outgoing", "Incoming", "outgoing_source", "incoming_source")
for i, file_path in enumerate(csv_files[start_index:], start_index + 1):
    print(f"Processing file: {file_path}")
    df = pd.read_csv(file_path)
    # All four columns are read further down; skip the file instead of failing mid-batch.
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        noname += 1
        print(f"Skipping file, missing columns: {missing_columns}")
        continue
    df = remove_single_character_entries(df)

    # Get unique values
    unique_outgoing = df['Outgoing'].unique()
    unique_incoming = df['Incoming'].unique()

    # Normalize the dataframe
    normalized_df = normalize_gene_dataframe(df, unique_outgoing, unique_incoming)

    normalized_df.to_csv(file_path, index=False)
    print(f"Normalized data saved to: {file_path}")
    print(i)

print(f"Skipped files without the expected columns: {noname}")
print("All files processed.")