# Introducing DOIs to the master file

### DOI acquisition from Elad

In [None]:
# import modules
from pymed import PubMed
import httpx
import pandas as pd
from io import StringIO
from more_itertools import chunked
from tqdm.notebook import tqdm
CHUNK_SIZE = 100

# load the CSV file and 
req = httpx.request("GET", "https://raw.githubusercontent.com/roberts-farm-of-ideas/8/56bab7c99ef340a6c3853b595f4d777ac7288c54/materials/reference_code_mappings/reference_code_mappings.csv")
s = StringIO(req.content.decode("UTF-8"))
tecr_refs = pd.read_csv(s)
tecr_refs.head()

# load references from PubMed
tecr_refs_with_pubmed_id = tecr_refs[~pd.isnull(tecr_refs.pmid)].copy()
tecr_refs_with_pubmed_id["pmid"] = tecr_refs_with_pubmed_id.pmid.astype(int).astype(str)
print(f"Collected {tecr_refs_with_pubmed_id.shape[0]} PubMed IDs")

# parse the references for DOIs
pubmed = PubMed(tool="MyTool", email="elad.noor@weizmann.ac.il")

data = []
with tqdm("downloading metadata from PubMed", total=tecr_refs_with_pubmed_id.shape[0]) as pbar:
    for rows in chunked(tecr_refs_with_pubmed_id.itertuples(), CHUNK_SIZE):
        pubmed_ids = " ".join([str(r.pmid) for r in rows])
        results = pubmed.query(pubmed_ids)
        for paper in results:
            try:
                doi = paper.pubmed_doi
            except AttributeError:
                doi = None
 
            pmid = paper.pubmed_id.split("\n")[0]
            pbar.set_description_str(f"pubmed ID {pmid}")
            authors = ", ".join([d["lastname"] + (" " + d["firstname"] if d["firstname"] else "") for d in paper.authors])
            data.append((str(pmid), doi, paper.publication_date.year, authors, paper.abstract))
        pbar.update(len(rows))
        
# export the parsed information into a new CSV
_df = pd.DataFrame(data=data, columns=["pmid", "doi", "year", "authors", "abstract"])
result_df = _df.join(tecr_refs_with_pubmed_id.set_index("pmid"), on="pmid", lsuffix="_from_pubmed", rsuffix="_from_robert")
result_df.to_csv("references_with_abstracts.csv")

### Incorporating the DOIs into the master file

In [6]:
# import the modules
from io import StringIO
import pandas
import httpx
import re

# import the master_file and reference_file
reference_file = pandas.read_csv('references_with_abstracts.csv')
reference_file.fillna(' ')
master_file = httpx.request("GET", "https://raw.githubusercontent.com/freiburgermsu/Biochemical-databases/main/openTECR/TECR_files/2021-08-16_master_TECR_3.csv")
master_file = StringIO(master_file.content.decode("UTF-8"))
master_file = pandas.read_csv(master_file)
master_file.fillna(' ')
for column in master_file:
    if re.search('Unnamed', column):
        del master_file[column]

# DOI and PMID columns are added to the master_file
new_column = [' ' for row in range(len(master_file))]
master_file.insert(6, 'PMID', new_column)
master_file.insert(7, 'DOI', new_column)
display(master_file.head(5))

reference_ids = reference_file['reference_code_in_online_database']
references_added = 0
for index, reference in reference_ids.iteritems():
    if (reference_file.at[index, 'pmid'] or reference_file.at[index, 'doi_from_robert']) not in [' ']:
        matching_master_subset = master_file.loc[(master_file['Reference ID:'] == reference)]

        for master_index, match in matching_master_subset.iterrows():
            master_file.at[master_index, 'PMID'] = reference_file.at[index, 'pmid']
            master_file.at[master_index, 'DOI'] = reference_file.at[index, 'doi_from_robert']

            references_added += 1
        
display(master_file.head(5))
print(f'References added to {references_added} datums')

# export the indentifier-ingrained CSV
master_file.to_csv('2021-08-18_master_with_DOIs.csv')

Unnamed: 0,standard_id,Enzyme:,KEGG Reaction:,Reaction:,Reference:,Reference ID:,PMID,DOI,T [K],pH,...,Km,Method:,Buffer:,Experimental conditions,EC Value:,solutes [mol / kg],solutes [mol / dm^3],Ionic strength [mol / dm^3],Ionic strength [mol / kg],Enthalpy [kJ / mol]
0,4336.0,aspartate ammonia-lyase,kegg:C00049 = kegg:C00122 + kegg:C00014,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Quastel J.H.; Woolf B.; Biochem. J.; 20 545 (1...,26QUA/WOO_1205,,,310.15,7.4,...,,chemical analysis,phosphate,,4.3.1.1,,,,,
1,2910.0,aspartate ammonia-lyase,kegg:C00049 = kegg:C00122 + kegg:C00014,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Woolf B.; Biochem. J.; 23 472 (1929).,29WOO_1206,,,310.15,7.4,...,,chemical analysis and polarimetry,phosphate,,4.3.1.1,,,,,
2,2129.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,,,298.15,6.81,...,,electrochemistry,,,4.2.1.2,,,,,
3,2130.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,,,298.15,7.12,...,,electrochemistry,,,4.2.1.2,,,,,
4,791.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,fumarate(aq) + H2O(l) = (S)-malate(aq),Jacobsohn K.P.; Biochem. Z.; 274 167 (1934).,34JAC_1142,,,278.15,6.8,...,,polarimetry,barbital,,4.2.1.2,,,,,


Unnamed: 0,standard_id,Enzyme:,KEGG Reaction:,Reaction:,Reference:,Reference ID:,PMID,DOI,T [K],pH,...,Km,Method:,Buffer:,Experimental conditions,EC Value:,solutes [mol / kg],solutes [mol / dm^3],Ionic strength [mol / dm^3],Ionic strength [mol / kg],Enthalpy [kJ / mol]
0,4336.0,aspartate ammonia-lyase,kegg:C00049 = kegg:C00122 + kegg:C00014,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Quastel J.H.; Woolf B.; Biochem. J.; 20 545 (1...,26QUA/WOO_1205,16743691.0,10.1042/bj0200545,310.15,7.4,...,,chemical analysis,phosphate,,4.3.1.1,,,,,
1,2910.0,aspartate ammonia-lyase,kegg:C00049 = kegg:C00122 + kegg:C00014,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Woolf B.; Biochem. J.; 23 472 (1929).,29WOO_1206,16744231.0,10.1042/bj0230472,310.15,7.4,...,,chemical analysis and polarimetry,phosphate,,4.3.1.1,,,,,
2,2129.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,,,298.15,6.81,...,,electrochemistry,,,4.2.1.2,,,,,
3,2130.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,,,298.15,7.12,...,,electrochemistry,,,4.2.1.2,,,,,
4,791.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,fumarate(aq) + H2O(l) = (S)-malate(aq),Jacobsohn K.P.; Biochem. Z.; 274 167 (1934).,34JAC_1142,,,278.15,6.8,...,,polarimetry,barbital,,4.2.1.2,,,,,


References added to 1637 datums
