In [1]:
import rdflib
import pandas as pd
from rapidfuzz import fuzz, process
import obonet

In [2]:
preds = pd.read_parquet("../preds_strainselect.pqt")

In [52]:
vcs = preds.value_counts(["ner","word_qc"]).reset_index().sort_values(["ner","count"],ascending=False)

# MEDIUM

## GMO

In [7]:
# Load the ontology file
g = rdflib.Graph()
g.parse("ontologies/gmo/gmo.ttl", format="ttl")

<Graph identifier=N9befc49c08154039b384f01240394ec7 (<class 'rdflib.graph.Graph'>)>

In [8]:
# Extract relevant data including all predicates found
data = []

# Define initially known relevant predicates and any new ones found
predicates = {
    rdflib.RDFS.label: 'label',
    rdflib.URIRef('http://www.w3.org/2004/02/skos/core#altLabel'): 'altLabel',
    rdflib.RDFS.subClassOf: 'subClassOf',
    rdflib.OWL.disjointWith: 'disjointWith',
    rdflib.URIRef('http://purl.org/dc/terms/identifier'): 'identifier',
    rdflib.RDFS.comment: 'comment',
    rdflib.SKOS.prefLabel: 'prefLabel',
    rdflib.SKOS.definition: 'definition',
    rdflib.URIRef('http://purl.org/dc/terms/bibliographicCitation'): 'bibliographicCitation',
    rdflib.URIRef('http://purl.org/dc/terms/contributor'): 'contributor',
    rdflib.URIRef('http://purl.org/dc/terms/creator'): 'creator',
    rdflib.URIRef('http://purl.org/dc/terms/date'): 'date',
    rdflib.URIRef('http://purl.org/dc/terms/description'): 'description',
    rdflib.URIRef('http://purl.org/dc/terms/references'): 'references',
    rdflib.URIRef('http://purl.org/dc/terms/title'): 'title',
    rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 'type',
    rdflib.URIRef('http://www.w3.org/2000/01/rdf-schema#domain'): 'domain',
    rdflib.URIRef('http://www.w3.org/2000/01/rdf-schema#range'): 'range',
    rdflib.URIRef('http://www.w3.org/2000/01/rdf-schema#seeAlso'): 'seeAlso',
    rdflib.URIRef('http://www.w3.org/2002/07/owl#versionIRI'): 'versionIRI',
    rdflib.URIRef('http://www.w3.org/2002/07/owl#versionInfo'): 'versionInfo',
    rdflib.URIRef('http://www.w3.org/2004/02/skos/core#historyNote'): 'historyNote',
    rdflib.URIRef('http://www.w3.org/2004/02/skos/core#note'): 'note'
}

for s, p, o in g:
    if p in predicates:
        identifier = s.split('/')[-1]
        value = str(o)
        predicate = predicates[p]
        data.append({'identifier': identifier, 'value': value, 'type': predicate})

# Convert the extracted data into a DataFrame
gmo = pd.DataFrame(data)

In [9]:
gmo

Unnamed: 0,identifier,value,type
0,GMO_001450,http://www.ncbi.nlm.nih.gov/pccompound/61701,seeAlso
1,GMO_001666,http://en.wikipedia.org/wiki/Trypsin,seeAlso
2,GMO_001731,http://www.w3.org/2002/07/owl#Class,type
3,GMO_001774,http://purl.jp/bio/10/gmo/GMO_001773,subClassOf
4,GMO_001933,http://purl.jp/bio/10/gmo/GMO_000002,subClassOf
...,...,...,...
8611,GMO_000001,Medium,label
8612,GMO_001635,Rabbit serum,label
8613,GMO_001861,alpha-Ketoglutaric acid,definition
8614,GMO_000110,http://www.w3.org/2002/07/owl#AnnotationProperty,type


In [10]:
gmo[gmo["type"]=="label"].identifier.unique().shape

(1040,)

In [13]:
gmo_db = pd.read_csv("ontologies/GMO.csv")

In [61]:
gmo[gmo.value.str.contains("tryptic soy broth")]

Unnamed: 0,identifier,value,type


In [19]:
gmo[gmo.identifier == "GMO_000019"]

Unnamed: 0,identifier,value,type
1311,GMO_000019,GMO_000019,identifier
1701,GMO_000019,Prepared culture medium,label
3210,GMO_000019,Prepared culture medium,prefLabel
7004,GMO_000019,http://www.w3.org/2002/07/owl#Class,type
8289,GMO_000019,http://purl.jp/bio/10/gmo/GMO_000002,subClassOf


In [64]:
gmo_db[gmo_db["Preferred Label"].str.contains("")]

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,amount,concentration,...,http://www.w3.org/2004/02/skos/core#historyNote,http://www.w3.org/2004/02/skos/core#note,http://www.w3.org/2004/02/skos/core#prefLabel,is a medium type,is composed of,medium description,medium ID,medium name,medium URL,original description


In [24]:
values = gmo_db['Preferred Label'].tolist()

In [43]:
preds_medium = preds[preds.ner=="MEDIUM"]

In [54]:
preds_medium_vc = preds_medium.word_qc.value_counts().reset_index()

In [71]:
preds_medium_vc

Unnamed: 0,word_qc,count,best_match
0,lb,47756,"(lb medium, 90.0, 305)"
1,tsb,12523,"(quaker white oats, 72.0, 276)"
2,lb broth,11208,"(difco lb broth, lennox, 90.0, 36)"
3,bhi,8144,"(biphenyl, 72.0, 89)"
4,luria-bertani,8105,"(edta, 67.5, 705)"
...,...,...,...
43295,ay3706-ngm,1,"(mes, 45.0, 141)"
43296,op50-ngm,1,"(mops, 60.00000000000001, 168)"
43297,citrated sheep blood agar,1,"(agar, 90.0, 721)"
43298,flask standard medium,1,"(medium type based on consistency, 85.5, 6)"


In [56]:
preds_medium_vc['best_match'] = preds_medium_vc['word_qc'].apply(lambda x: process.extractOne(x, [i.lower() for i in values], scorer=fuzz.WRatio))

In [74]:
preds_medium_vc['best_match_db'] = preds_medium_vc['word_qc'].apply(lambda x: process.extractOne(x, [i.lower() for i in gmo_db["Preferred Label"].to_list()], scorer=fuzz.WRatio))

In [75]:
preds_medium_vc

Unnamed: 0,word_qc,count,best_match,best_match_db
0,lb,47756,"(lb medium, 90.0, 305)","(lb medium, 90.0, 305)"
1,tsb,12523,"(quaker white oats, 72.0, 276)","(quaker white oats, 72.0, 276)"
2,lb broth,11208,"(difco lb broth, lennox, 90.0, 36)","(difco lb broth, lennox, 90.0, 36)"
3,bhi,8144,"(biphenyl, 72.0, 89)","(biphenyl, 72.0, 89)"
4,luria-bertani,8105,"(edta, 67.5, 705)","(edta, 67.5, 705)"
...,...,...,...,...
43295,ay3706-ngm,1,"(mes, 45.0, 141)","(mes, 45.0, 141)"
43296,op50-ngm,1,"(mops, 60.00000000000001, 168)","(mops, 60.00000000000001, 168)"
43297,citrated sheep blood agar,1,"(agar, 90.0, 721)","(agar, 90.0, 721)"
43298,flask standard medium,1,"(medium type based on consistency, 85.5, 6)","(medium type based on consistency, 85.5, 6)"


## MCO

In [4]:
# Load the ontology file
graph = obonet.read_obo('ontologies/Microbial-Conditions-Ontology/mco.obo')


In [5]:
labels = [data['name'] for _, data in graph.nodes(data=True)]

In [12]:
names = []
synonyms = []

for _, data in graph.nodes(data=True):
	name = data.get('name')
	if name:
		names.append(name)
	synonyms.extend(data.get('synonym', []))

In [29]:
names_series = pd.Series(names).str.lower()

In [37]:
names_series[names_series.str.contains("")]

Series([], dtype: object)

In [9]:
graph.nodes(data=True)['MICRO:0000558']["name"]

'MacConkey agar'

# CHEBI

In [76]:
chebi = pd.read_csv("ontologies/CHEBI.csv")

  chebi = pd.read_csv("ontologies/CHEBI.csv")


In [78]:
chebi[chebi["Preferred Label"].str.contains("LB")]

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,BRAND NAME,database_cross_reference,...,INN,is conjugate acid of,is conjugate base of,is enantiomer of,is substituent group from,is tautomer of,IUPAC NAME,shorthand,subset_property,synonym_type_property
34715,http://purl.obolibrary.org/obo/CHEBI_185775,ALBIZZIINE,2-amino-3-(carbamoylamino)propanoic acid,,False,,,http://purl.obolibrary.org/obo/CHEBI_33704,,Chemspider:308985|KEGG:C08264,...,,,,,,,,,,
82185,http://purl.obolibrary.org/obo/CHEBI_47076,N-(3-AMINOPROPYL)-N-[(R)-(3-BENZYL-5-CHLORO-4-...,,,False,,,http://purl.obolibrary.org/obo/CHEBI_88341,,,...,,,,,,,,,,
109840,http://purl.obolibrary.org/obo/CHEBI_178642,LBPA(16:1(9Z)/18:1(9Z)),[(2S)-1-[[(2S)-2-[(Z)-hexadec-9-enoyl]oxy-3-hy...,,False,,,http://purl.obolibrary.org/obo/CHEBI_24360,,LIPID_MAPS_instance:LMGP04100004,...,,,,,,,,,,
122826,http://purl.obolibrary.org/obo/CHEBI_140875,SKLB-677,"1-(5-tert-butyl-1,2-oxazol-3-yl)-3-[3-fluoro-4...",A member of the class of phenylureas that is u...,False,,,http://purl.obolibrary.org/obo/CHEBI_134043|ht...,,PMID:26497577,...,,,,,,,,,,
137096,http://purl.obolibrary.org/obo/CHEBI_214634,MC-LBu,"(5R,8S,11R,12S,15S,18S,19S,22R)-15-ethyl-18-[(...",,False,,,http://purl.obolibrary.org/obo/CHEBI_16670,,,...,,,,,,,,,,
144810,http://purl.obolibrary.org/obo/CHEBI_167660,LB42908,"LB-42908|(1-((1-(benzo[d][1,3]dioxol-5-ylmethy...",A member of the class of pyrrolecarboxamides t...,False,,,http://purl.obolibrary.org/obo/CHEBI_46844|htt...,,PMID:11714612|PMID:24059235|PMID:15812228|Chem...,...,,,,,,,,,,
186331,http://purl.obolibrary.org/obo/CHEBI_46269,"4,4,4-TRIFLUORO-1-THIEN-2-YLBUTANE-1,3-DIONE",,,False,,,http://purl.obolibrary.org/obo/CHEBI_76224,,,...,,,,,,,,,,
199009,http://purl.obolibrary.org/obo/CHEBI_188257,SLBPA(42:0),"[(2S)-3-[[(2S)-2,3-di(tetradecanoyloxy)propoxy...",,False,,,http://purl.obolibrary.org/obo/CHEBI_24360,,Chemspider:113377844|LIPID_MAPS_instance:LMGP0...,...,,,,,,,,,,


In [None]:
k