In [None]:
import GEOparse
import logging

import pandas as pd

In [None]:
# Run configuration constants.
# NOTE(review): presumably meant as the n_trials / sampling_size arguments of
# extract_predicates — confirm; no visible code reads these two names.
N_TRIALS = 10
SAMPLING_SIZE = 150

from random import choice, sample
from prompts.aspects import annotation_aspects_list

In [None]:
annotation_aspects_list.keys()

In [None]:
# Pick the aspect to work on, look up its details, and collect the names of
# every other aspect (those are the ones the prompts tell the model to avoid).
aspect = 'Control and Reference Samples'
aspect_details = annotation_aspects_list[aspect]
irrelevant_aspects = [
    other_aspect
    for other_aspect in annotation_aspects_list
    if other_aspect != aspect
]

In [None]:
logging.getLogger('GEOparse').setLevel(logging.WARNING)

In [None]:
datasets_df = pd.read_excel('Magalhaes_datasets.xlsx', header=1)

In [None]:
gse_id_list = datasets_df.query("Technology=='Microarray' & Study !='AgeMap'").Study.tolist()

In [None]:
# Fetch every selected GEO record with GEOparse.get_GEO (destdir "./data/").
# Loading is best-effort: a failure is reported together with the accession
# that caused it and the loop moves on to the next record.
gse_list = []
for gse_id in gse_id_list:
    try:
        gse_list.append(GEOparse.get_GEO(geo=gse_id, destdir="./data/"))
    except Exception as e:
        print(f"{gse_id}: {e}")

In [None]:
import os

from openai import OpenAI


def openai_generate(messages: list, model: str = 'gpt-4', **generation_params):
    """Send a chat conversation to the OpenAI chat-completions API.

    Args:
        messages: Chat messages, forwarded unchanged as ``messages``.
        model: Model name forwarded to the API (defaults to ``'gpt-4'``).
        **generation_params: Extra keyword arguments forwarded verbatim to
            ``chat.completions.create`` (for example ``temperature``).

    Returns:
        The whole completion object, not just the text; callers read the
        reply from ``.choices[0].message.content``.
    """
    request_kwargs = {"messages": messages, "model": model, **generation_params}
    # A new client is constructed for every call.
    return OpenAI().chat.completions.create(**request_kwargs)

In [None]:
def extract_metadata(gse, gsm):
    """Build a plain-text summary of selected series (GSE) and sample (GSM) metadata.

    Args:
        gse: Object whose ``metadata`` mapping holds the series-level fields
            (each field maps to a list of values).
        gsm: Object whose ``metadata`` mapping holds the sample-level fields.

    Returns:
        A newline-separated string with one ``Label: value`` line per field,
        in this order: GSE title, summary, keywords, overall design, then GSM
        organism, characteristics, description. Only the first stored value
        of a field is used; a field that is absent or holds no values is
        rendered as ``'N/A'``.
    """
    def first_value(metadata, field):
        # A missing key and a present-but-empty list are both reported as
        # 'N/A'; indexing [0] on an empty list would raise IndexError.
        values = metadata.get(field)
        return values[0] if values else 'N/A'

    # (output label, metadata field) pairs for the series.
    gse_lines = [
        ('GSE Title', 'title'),
        ('GSE Summary', 'summary'),
        ('GSE Keywords', 'keywords'),
        ('GSE Overall Design', 'overall_design'),
    ]
    # (output label, metadata field) pairs for the sample.
    # NOTE(review): the sample 'title' and 'source_name_ch1' fields are not
    # part of the output — confirm whether they should be added here.
    gsm_lines = [
        ('GSM Organism', 'organism_ch1'),
        ('GSM Characteristics', 'characteristics_ch1'),
        ('GSM Description', 'description'),
    ]

    rendered = [f"{label}: {first_value(gse.metadata, field)}" for label, field in gse_lines]
    rendered += [f"{label}: {first_value(gsm.metadata, field)}" for label, field in gsm_lines]
    return "\n".join(rendered)

In [None]:
gsm_sample.metadata

In [None]:
# Collect the distinct 'data_processing' values across all loaded samples.
# The collector defined in this notebook is collect_unique_gsm_field_values.
unique_values = collect_unique_gsm_field_values(key='data_processing')

In [None]:
print('\n###\n'.join(unique_values))

In [None]:
# Token count of the joined field values (the same text the previous cell prints).
# NOTE(review): the text argument was missing from this call; confirm this is the intended one.
num_tokens_from_string('\n###\n'.join(unique_values), 'gpt-4o')

In [None]:
from prompts.data_processing import data_processing_constitution

In [None]:
example_message = '''Raw data (CEL files) were processed using the robust multi-array average (RMA) algorithm and quantile normalization with the Affymetrix Power Tools, version 1.12.0, and platform-specific library files. Differential gene expression was analyzed using descriptive statistics (fold change) and Student’s T-Test method for pairwise comparisons. Genes were prioritized by statistical evidence. In order to create candidate lists for differential gene expression between conditions, we used all genes regulated at least 1.5-fold where differential expression was significant at level 0.05. Type I error inflation was ignored because the p-values were used to prioritize the list rather than being interpreted in a confirmatory sense.'''

In [None]:
messages =[
    {'role':'system', 'content':data_processing_constitution},
    {'role':'user', 'content':example_message}
]

In [None]:
completion = openai_generate(messages, temperature=1.0)
print(completion.choices[0].message.content)

In [None]:
print(completion.choices[0].message.content)

In [None]:
example_extraction_message = 'Cardiac ventricle total RNA from 10 young (4-6 month) and 10 old (25-28 month) mice was prepared and hybridized to Affymetrix™GeneChip® MOE430 2.0 arrays. Total RNA was prepared from frozen tissues using the Qiagen RNeasy kit following homogenization in Trizol (Invitrogen). Genomic DNA was prepared using the Qiagen Dneasy kit.'

In [None]:
from prompts.extraction_protocol import extraction_protocol_constitution

In [None]:
messages =[
    {'role':'system', 'content':extraction_protocol_constitution},
    {'role':'user', 'content':example_extraction_message}
]

In [None]:
completion = openai_generate(messages, temperature=1.0)
print(completion.choices[0].message.content)

In [None]:
def format_metadata(gse, gsm):
    """Render a fixed set of series (GSE) and sample (GSM) fields as text.

    Args:
        gse: Object whose ``metadata`` mapping holds series-level fields
            (each field maps to a list of values).
        gsm: Object whose ``metadata`` mapping holds sample-level fields.

    Returns:
        A newline-separated string with one line per field, GSE fields first.
        Each line reads ``"<GSE|GSM> <Field Name In Title Case>: <value>"``.
        Only the first stored value of a field is used; a field that is
        absent or holds no values is rendered as ``'N/A'``.
    """
    gse_fields = ['summary', 'overall_design', 'type']

    # The hybridization protocol is stored under 'hyb_protocol' (no channel
    # suffix) — the same key the per-field extraction in this notebook uses.
    gsm_fields = [
        'source_name_ch1', 'organism_ch1', 'characteristics_ch1',
        'treatment_protocol_ch1', 'molecule_ch1', 'extract_protocol_ch1',
        'label_ch1', 'label_protocol_ch1', 'hyb_protocol',
        'scan_protocol', 'data_processing'
    ]

    def render(prefix, metadata, fields):
        lines = []
        for field in fields:
            values = metadata.get(field)
            # A missing key and an empty list both become 'N/A'; indexing
            # [0] on an empty list would raise IndexError.
            value = values[0] if values else 'N/A'
            lines.append(f"{prefix} {field.replace('_', ' ').title()}: {value}")
        return lines

    metadata_str = render('GSE', gse.metadata, gse_fields)
    metadata_str += render('GSM', gsm.metadata, gsm_fields)
    return "\n".join(metadata_str)

In [None]:
# Pool of distinct formatted metadata records, one candidate per sample.
# A series that raises while being processed is reported and skipped.
# The loop deliberately leaves `gsm` (key) and `gsm_sample` (record) bound at
# module level, as other cells inspect `gsm_sample`.
field_values = set()
for gse in gse_list:
    try:
        for gsm, gsm_sample in gse.gsms.items():
            field_values.add(format_metadata(gse, gsm_sample))
    except Exception as e:
        print(e)

In [None]:
len(field_values)

In [None]:
import tiktoken
def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens of ``string`` under the tokenizer for ``model_name``.

    Args:
        string: Text to tokenize.
        model_name: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        Length of the token sequence produced by the model's encoding.
    """
    return len(tiktoken.encoding_for_model(model_name).encode(string))
from random import sample

In [None]:
# System-prompt template for the predicate drafting step (used by
# extract_predicates). It is filled with str.format and expects three
# placeholders: {aspect}, {aspect_details} and {irrelevant_aspects_combined}.
# The text is sent to the model as-is, so whitespace inside it matters.
predicate_instruction = """Introduction and Goal:

The primary goal of this task is to systematically extract and standardize predicates from the metadata of GSE (Gene Expression Omnibus Series) and GSM (Gene Expression Omnibus Samples) descriptions. These predicates will be used to automate the metadata standardization process across various omics datasets, facilitating efficient and consistent data analysis for meta-studies. By defining clear predicates based on experimental design and sample collection, we aim to create a structured queryable framework that enhances data retrieval and analysis capabilities in biological research databases.

Task Description:

Generate a comprehensive list of predicates focused on {aspect} from GSE and GSM descriptions. Ensure each predicate is standardized, uses controlled vocabulary, and adheres strictly to the concepts outlined in the provided guidelines. This structured extraction will enable automated systems to better understand and categorize data entries, significantly improving the accuracy and speed of meta-analyses.

Guidelines:
Focus on Key Concepts from the following list:
{aspect_details}

Ensure that the list of predicates is comprehensive and covers all concepts from the list above. 


Define Predicates:

For each identified relationship or attribute, define a predicate. Ensure the predicate name is descriptive of the relationship or attribute it represents.
Create separate predicates for distinctive properties not entailed with each other (for example, strain and age).
Avoid using free-text entries in the predicates.
Exclude predicates that are not related to the specified key concepts. Avoid including predicates related to the following aspects: {irrelevant_aspects_combined}.
Only include predicates relevant to {aspect}. 
Do not generate predicates for generic attributes such as age or sex unless explicitly related to the sample characteristics described in the guidelines.
Do not generate predicates for quantitative measures such as temporal data, dosage, concentration, volume, mass, temperature, speed, time duration, and frequency. These formats are already standardized and provided. Focus on other attributes and relationships related to the specified key concepts.



Standardize Quantitative Data:

Ensure that quantitative data is standardized in a consistent format, such as TimeFrame(value, unit).
Use the following formats for other quantitative measures:
List of Standardized Quantitative Measures
Temporal Data

Format: TimeFrame(value, unit)
Example: TimeFrame(32, Years)
Range Format: TimeFrame(Range(minValue, maxValue), unit)
Range Example: TimeFrame(Range(32, 39), Years)
Dosage

Format: Dosage(value, unit)
Example: Dosage(1.0, gPerKg)
Range Format: Dosage(Range(minValue, maxValue), unit)
Range Example: Dosage(Range(0.5, 1.0), gPerKg)
Concentration

Format: Concentration(value, unit)
Example: Concentration(0.1, Molar)
Range Format: Concentration(Range(minValue, maxValue), unit)
Range Example: Concentration(Range(0.05, 0.1), Molar)
Volume

Format: Volume(value, unit)
Example: Volume(10, mL)
Range Format: Volume(Range(minValue, maxValue), unit)
Range Example: Volume(Range(5, 10), mL)
Mass

Format: Mass(value, unit)
Example: Mass(2.5, mg)
Range Format: Mass(Range(minValue, maxValue), unit)
Range Example: Mass(Range(1.0, 2.5), mg)
Temperature

Format: Temperature(value, unit)
Example: Temperature(37, Celsius)
Range Format: Temperature(Range(minValue, maxValue), unit)
Range Example: Temperature(Range(35, 37), Celsius)
Speed

Format: Speed(value, unit)
Example: Speed(100, rpm)
Range Format: Speed(Range(minValue, maxValue), unit)
Range Example: Speed(Range(80, 100), rpm)
Time Duration

Format: Duration(value, unit)
Example: Duration(30, Minutes)
Range Format: Duration(Range(minValue, maxValue), unit)
Range Example: Duration(Range(20, 30), Minutes)
Frequency

Format: Frequency(value, unit)
Example: Frequency(3, perDay)
Range Format: Frequency(Range(minValue, maxValue), unit)
Range Example: Frequency(Range(2, 3), perDay)

Standardize Software versions:
Ensure that versions of the software related to {aspect} are standardized in a consistent format.
Use the following predicate:
Format: SoftwareVersion(softwareTitle, versionNumber)
Example: SoftwareVersion(AffymetrixPowerTools, 1.12.0)

Format the Output:

Generate a list of predicates.
Provide a brief description of what each predicate represents.
Include an example for each predicate.
Refrain from using sample as an argument in predicates.
Output nothing but a list of predicates according to the provided format. Start right away with the definition of the first predicate.

Output format example:

Definition: somePredicate(argument)
Description: some predicate given for example. Actual predicates will have a meaningful description here.
Example: somePredicate(ExampleArgument)
"""

In [None]:
num_tokens_from_string(predicate_instruction, 'gpt-4o')

In [None]:
# Re-declares the two run constants with the same values as the configuration
# cell near the top of the notebook.
# NOTE(review): no visible code reads these names — confirm they are still needed.
SAMPLING_SIZE=150
N_TRIALS = 10

In [None]:
# System-prompt template for the merge/refinement step (used by
# extract_predicates). It is filled with str.format and contains two
# placeholders: {aspect} and {irrelevant_aspects_combined}.
# The text is sent to the model as-is, so whitespace inside it matters.
refinement_prompt = """You are given multiple lists of predicates extracted from various sets of GSE (Gene Expression Omnibus Series) and GSM (Gene Expression Omnibus Samples) metadata descriptions. These predicates are related to {aspect}. Your task involves several key steps to refine and consolidate these predicates into a single, standardized list:

Merge and Deduplicate: Combine all the provided lists into one unified list. Identify and remove any duplicate predicates, ensuring that synonyms and nearly identical concepts are consolidated into a single entry. Aim for a clean, non-redundant list that captures the essence of each predicate without repetition.

Standardize Format and Terminology: Review the merged list for consistency in naming conventions, formatting, and descriptions. Adjust the predicates to ensure they align with the standardized formats discussed in the guidelines. This includes using controlled vocabulary and adhering to a consistent structural format. Use adjustment examples below.

Adjustment examples:
Before: somePredicate("free text argument")
After: somePredicate(VariableLikeArgument)

Before: somePredicate(Argument With Spaces)
After: somePredicate(ArgumentWithoutSpaces)

Before: somePredicate(argument_with_underscore)
After: somePredicate(CamelCaseArgument)

Validate and Refine: Ensure that each predicate in the list strictly pertains to the key concepts of {aspect}. Remove any predicates that do not conform to these guidelines or that address unrelated topics such as the following: {irrelevant_aspects_combined}. Focus on refining predicates to be as specific and relevant as possible.

Format the Final Output: Organize the final list of predicates into a structured format, with definition of each predicate followed by its brief, clear description and an example of usage. Output nothing but a list of predicates in the format specified below. Start with definition of the first predicate right away.

Output format example:

Definition: somePredicate(argument)
Description: some predicate given for example. Actual predicates will have a meaningful description here.
Example: somePredicate(ExampleArgument)

Definition: anotherPredicate(argument)
Description: another predicate given for example. Actual predicates will have a meaningful description here.
Example: anotherPredicate(AnotherArgument)

"""

In [None]:
def extract_predicates(aspect, n_trials, sampling_size):
    """Draft predicate lists for one aspect from random samples, then merge them.

    For each of ``n_trials`` rounds a random subset of the module-level
    ``field_values`` pool is sent to ``gpt-4o`` together with the
    ``predicate_instruction`` prompt. The resulting drafts are then joined
    and sent to ``gpt-4`` with ``refinement_prompt`` to produce one list.

    Args:
        aspect: Key of ``annotation_aspects_list`` to extract predicates for.
        n_trials: Number of drafting rounds (one model call each).
        sampling_size: Number of metadata records to sample per round. If it
            exceeds the size of the pool, the whole pool is used.

    Returns:
        The text of the refined predicate list returned by the model.

    Raises:
        KeyError: If ``aspect`` is not a key of ``annotation_aspects_list``.
    """
    aspect_details = annotation_aspects_list[aspect]
    irrelevant_aspects = [x for x in annotation_aspects_list.keys() if x != aspect]
    prompt_fields = {
        'aspect': aspect,
        'aspect_details': aspect_details,
        'irrelevant_aspects_combined': '\n'.join(irrelevant_aspects),
    }
    # The system prompt does not change between rounds, so build it once.
    drafting_instruction = predicate_instruction.format(**prompt_fields)

    candidate_pool = list(field_values)
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the pool size.
    draw_size = min(sampling_size, len(candidate_pool))

    predicate_list = []
    for __ in range(n_trials):
        input_samples_combined = (
            "List of examples to extract predicates from:\n"
            + "\n\n\n".join(sample(candidate_pool, draw_size))
        )
        resp = openai_generate([
            {"role": 'system', 'content': drafting_instruction},
            {'role': 'user', 'content': input_samples_combined},
        ], model='gpt-4o')
        predicate_list.append(resp.choices[0].message.content)

    # The header must be concatenated explicitly: two adjacent string literals
    # fuse into a single literal, which would turn the header into part of the
    # join separator instead of a prefix.
    combined_predicates = "Draft predicate lists:\n" + '\n\n###\n'.join(predicate_list)
    resp = openai_generate([
        # refinement_prompt has no {aspect_details} placeholder; str.format
        # ignores the unused keyword.
        {"role": 'system', 'content': refinement_prompt.format(**prompt_fields)},
        {'role': 'user', 'content': combined_predicates},
    ], model='gpt-4')

    return resp.choices[0].message.content

In [None]:
# Extract predicates for every aspect and save each refined list to
# "<aspect>.txt" in the working directory.
for aspect in annotation_aspects_list.keys():
    predicates = extract_predicates(aspect, 5, 180)
    # Write as UTF-8 explicitly: the model output may contain characters the
    # platform's default encoding cannot represent.
    with open(aspect+".txt", 'w', encoding='utf-8') as f:
        f.write(predicates)

### Predicate collection with each annotation_field

In [None]:
gsm_sample.metadata

In [None]:
def collect_unique_gsm_field_values(key='data_processing', series=None):
    """Collect the distinct values of one metadata field across all samples.

    Args:
        key: Name of the sample metadata field to collect.
        series: Iterable of records exposing a ``gsms`` mapping of sample
            records (each with a ``metadata`` mapping of field -> list of
            values). Defaults to the module-level ``gse_list``.

    Returns:
        A set with every distinct value stored under ``key``. Samples that
        lack the field contribute nothing. Records without a ``gsms``
        attribute are reported with ``print`` and skipped.
    """
    if series is None:
        series = gse_list

    unique_values = set()
    for gse in series:
        try:
            gsm_records = list(gse.gsms.values())
        except Exception as e:
            # Best-effort: report the record that has no samples and go on.
            print(e)
            continue
        for gsm_record in gsm_records:
            try:
                # .get keeps one sample without the field from discarding
                # the remaining samples of the same series.
                unique_values.update(gsm_record.metadata.get(key, ()))
            except Exception as e:
                print(e)
    return unique_values

In [None]:
gsm_metadata_fields = ['treatment_protocol_ch1', 'extract_protocol_ch1', 'label_protocol_ch1', 'hyb_protocol', 'scan_protocol', 'data_processing']

In [None]:
field = 'data_processing'
unique_values = collect_unique_gsm_field_values(field)
data_string = "\n###\n".join(list(unique_values))

In [None]:
print(data_string)

In [None]:
field_prompt="""Below are multiple occurrences of the field {field} from metadata of various studies from GEO, separated by ###. The goal is to create a set of First-Order Logic (FOL) predicates to parse these protocols, ensuring they are standardized in a machine-readable format. List all aspects from the data needed to construct these predicates. Ensure the list is complete and non-redundant. Output only the list and start with the first element.
{data_string}
"""

In [None]:
# Three-turn conversation about one metadata field. Every request replays the
# exact transcript so far (the requests that were really sent and the replies
# that really came back), so the model always sees its own earlier answers.
predicate_request = '''Now generate a list of predicates to cover this list. Use the following format:
Definition: somePredicate(argument)
Description: some predicate given for example. Actual predicates will have a meaningful description here.
Example: somePredicate(ExampleArgument)'''
parsing_request = 'Now parse each input metadata into FOL using constructed predicates. Include original text in your output. Make sure to eliminate freetext arguments and format arguments in camelcase and are not quoted.'

field_conversation = [
    {"role": 'user', 'content': field_prompt.format(field=field, data_string=data_string)},
]

# Turn 1: list the aspects present in the field values.
aspect_message = openai_generate(field_conversation, model='gpt-4')
print(aspect_message.choices[0].message.content)

# Turn 2: turn the aspect list into predicate definitions.
field_conversation += [
    {"role": "assistant", 'content': aspect_message.choices[0].message.content},
    {"role": 'user', 'content': predicate_request},
]
predicate_message = openai_generate(field_conversation, model='gpt-4', temperature=0.0)
print(predicate_message.choices[0].message.content)

# Turn 3: parse the original field values with those predicates.
field_conversation += [
    {"role": "assistant", 'content': predicate_message.choices[0].message.content},
    {"role": 'user', 'content': parsing_request},
]
parsing_message = openai_generate(field_conversation, model='gpt-4')
print(parsing_message.choices[0].message.content)

In [157]:
from typing import List, Dict

def extract_field_predicates(field: str) -> Dict[str, str]:
    """Run the three-turn aspect -> predicate -> parsing dialogue for one field.

    The distinct values of ``field`` are joined with ``###`` separator lines
    and sent to ``gpt-4``. The same conversation is then extended twice: once
    to request predicate definitions (temperature 0.0) and once to request a
    parse of the original values with those predicates.

    Args:
        field: Name of the sample metadata field to analyse.

    Returns:
        A dict with the model's text for each turn under the keys
        ``"aspects"``, ``"predicates"`` and ``"parsed"``.
    """
    field_prompt = """Below are multiple occurrences of the field {field} from metadata of various studies from GEO, separated by ###. The goal is to create a set of First-Order Logic (FOL) predicates to parse these protocols, ensuring they are standardized in a machine-readable format. List all aspects from the data needed to construct these predicates. Ensure the list is complete and non-redundant. Output only the list and start with the first element.
    {data_string}
    """
    predicate_request = '''Now generate a list of predicates to cover this list. Use the following format:
        Definition: somePredicate(argument)
        Description: some predicate given for example. Actual predicates will have a meaningful description here.
        Example: somePredicate(ExampleArgument)'''
    parsing_request = 'Now parse each input metadata into FOL using constructed predicates. Include original text in your output. Make sure to eliminate freetext arguments and format arguments in camelcase and are not quoted.'

    data_string = "\n###\n".join(collect_unique_gsm_field_values(field))
    conversation = [
        {"role": 'user', 'content': field_prompt.format(field=field, data_string=data_string)},
    ]

    def next_reply(**generation_params):
        # Send the transcript accumulated so far and return the reply text.
        completion = openai_generate(conversation, model='gpt-4', **generation_params)
        return completion.choices[0].message.content

    # Turn 1: which aspects do the field values cover?
    aspect_content = next_reply()
    conversation.append({"role": "assistant", 'content': aspect_content})

    # Turn 2: predicate definitions for those aspects.
    conversation.append({"role": 'user', 'content': predicate_request})
    predicate_content = next_reply(temperature=0.0)
    conversation.append({"role": "assistant", 'content': predicate_content})

    # Turn 3: parse the original values with the predicates.
    conversation.append({"role": 'user', 'content': parsing_request})
    parsing_content = next_reply()

    return {"aspects": aspect_content, "predicates": predicate_content, "parsed": parsing_content}

In [158]:
content = extract_field_predicates('hyb_protocol')

'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'GDS' object has no attribute 'gsms'
'

In [161]:
print(content['aspects'])

1. Protocol Name or Source
2. Hybridization procedure
3. Temperature for Hybridization
4. Duration of Hybridization
5. Type of RNA 
6. Quantity of RNA
7. Type of Chip or Array
8. Washing and Staining procedure
9. Fluidic Station used
10. Manufacturer of Materials
11. Hybridization Kit Used
12. Address/location of Manufacturer
13. Type of DNA (if any)
14. Labeling method for RNA or DNA
15. Amplification of cRNA
16. Biotinylation of cRNA
17. Fragmentation of cRNA and its quantity.


In [163]:
print(content['parsed'])

1. "According to Illumina protocols"
protocolName(IlluminaProtocols)

2. "Following fragmentation, cRNA were hybridized onto the Affymetrix GeneChip Mouse Genome 430 2.0 arrays using the standard Affymetrix protocol. GeneChips were washed and stained in the Affymetrix Fluidics Station 450."
hybridizationProcedure(HybridizedAffymetrixGeneChipMouseGenome430Arrays)
rnaType(CRNA)
chipType(AffymetrixGeneChipMouseGenome430)
washingProcedure(WashedAndStainedAffymetrixFluidicsStation450)
fluidicStation(AffymetrixFluidicsStation450)

3. "700 ng of biotinylated cRNA was hybridized to a BeadChip at 580C for 16-17 hours."
rnaQuantity(700Ng)
rnaType(BiotinylatedCRNA)
hybridizationProcedure(HybridizedToBeadChip)
hybridizationTemperature(580C)
hybridizationDuration(16To17Hours)

4. "Standard Illumina protocol"
protocolName(StandardIllumina)

5. "Affymetrix Eukaryotic Target Hybridization protocol for 100 Format midi arrays (GeneChip® Expression Analysis Technical Manual 701021 Rev. 5)"
protocolName(A