# Create Domain MeSH

In [1]:
def get_keywords_mesh(mesh):
    mesh_kewords_related = []
    for m in mesh:
        if 'DescriptorName' in m:
            mesh_kewords_related.append([str(m['DescriptorName']),m['DescriptorName'].attributes['UI']])
    return mesh_kewords_related

def get_authors(authors_list):
    authors = []
    for authr in authors_list:
        authors.append({
        'lastname': authr['LastName'] if 'LastName' in authr else None,
        'firstname': authr['ForeName'] if 'ForeName' in authr else None,
        'initials': authr['Initials'] if 'Initials' in authr else None,
        'affiliation': authr['AffiliationInfo'][0]['Affiliation'] if 'AffiliationInfo' in authr and 'Affiliation' in authr['AffiliationInfo'] else None,
        })
    return authors

def get_conclusion(abstract_texts):
    res = {}
    for text in abstract_texts:
        if 'Label' in text.attributes:
            if text.attributes['Label'] == "CONCLUSIONS":
                res = str(text)
    if type(res) == dict:
        res = None
    return res

def get_results(abstract_texts):
    res = {}
    for text in abstract_texts:
        if 'Label' in text.attributes:
            if text.attributes['Label'] == "RESULTS":
                res = str(text)
    if type(res) == dict:
        res = None
    return res

def get_methods(abstract_texts):
    res = {}
    for text in abstract_texts:
        if 'Label' in text.attributes:
            if text.attributes['Label'] == "METHODS":
                res = str(text)

    if type(res) == dict:
        res = None
    return res

def pubMedSearcherBio(query,max_count):
    import datetime
    
   
    Entrez.email = 'myemail@ccc.com'

    handle = Entrez.esearch(db='pubmed',
                           retmode='xml',
                           term = query,
                           datetype='pdat',
                           retmax = max_count,
                           mindate='2010',
                           maxdate='2020')
    results = Entrez.read(handle)

    ids = ','.join(results['IdList'])

    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           sort = 'date',
                           id=ids)

    papers = Entrez.read(handle)
    
    articleInfo = []
    if 'PubmedArticle' in papers and len(papers['PubmedArticle'])>0:
        for paper in papers['PubmedArticle']:
            articleInfo.append({
                    u'pubmed_id':str(paper['MedlineCitation']['PMID']),
                    u'title':str(paper['MedlineCitation']['Article']['ArticleTitle']) if 'Article' in paper['MedlineCitation'] and 'ArticleTitle' in paper['MedlineCitation']['Article'] else None,
                    u'keywords':[str(word) for word in paper['MedlineCitation']['KeywordList'][0]] if 'KeywordList' in paper['MedlineCitation'] and len(paper['MedlineCitation']['KeywordList'])>0 else [],
                    u'journal':str(paper['MedlineCitation']['Article']['Journal']['Title']) if 'Article' in paper['MedlineCitation'] and 'Journal' in paper['MedlineCitation']['Article'] and 'Title' in paper['MedlineCitation']['Article']['Journal'] else None,
                    u'abstract':str(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0]) if 'Article' in paper['MedlineCitation'] and 'Abstract' in paper['MedlineCitation']['Article'] and 'AbstractText' in paper['MedlineCitation']['Article']['Abstract'] and len(paper['MedlineCitation']['Article']['Abstract']['AbstractText']) > 0 else None,
                    u'conclusions':get_conclusion(paper['MedlineCitation']['Article']['Abstract']['AbstractText']) if 'Article' in paper['MedlineCitation'] and 'Abstract' in paper['MedlineCitation']['Article'] and 'AbstractText' in paper['MedlineCitation']['Article']['Abstract'] else None,
                    u'methods':get_methods(paper['MedlineCitation']['Article']['Abstract']['AbstractText']) if 'Article' in paper['MedlineCitation'] and 'Abstract' in paper['MedlineCitation']['Article'] and 'AbstractText' in paper['MedlineCitation']['Article']['Abstract'] else None,
                    u'results': get_results(paper['MedlineCitation']['Article']['Abstract']['AbstractText']) if 'Article' in paper['MedlineCitation'] and 'Abstract' in paper['MedlineCitation']['Article'] and 'AbstractText' in paper['MedlineCitation']['Article']['Abstract'] else None,
                    u'copyrights':str(paper['MedlineCitation']['Article']['Abstract']['CopyrightInformation']) if 'Article' in paper['MedlineCitation'] and 'Abstract' in paper['MedlineCitation']['Article'] and 'CopyrightInformation' in paper['MedlineCitation']['Article']['Abstract'] else None,
                    u'doi':str(paper['MedlineCitation']['Article']['ELocationID'][0]) if 'Article' in paper['MedlineCitation'] and 'ELocationID' in paper['MedlineCitation']['Article'] and len(paper['MedlineCitation']['Article']['ELocationID'])>0 else None,
                    u'publication_date':datetime.datetime.strptime(f"{paper['MedlineCitation']['Article']['ArticleDate'][0]['Year']}-{paper['MedlineCitation']['Article']['ArticleDate'][0]['Month']}-{paper['MedlineCitation']['Article']['ArticleDate'][0]['Day']}","%Y-%M-%d").date() if 'Article' in paper['MedlineCitation'] and 'ArticleDate' in paper['MedlineCitation']['Article'] and len(paper['MedlineCitation']['Article']['ArticleDate'])>0 else None,
                    u'authors':get_authors(paper['MedlineCitation']['Article']['AuthorList']) if 'Article' in paper['MedlineCitation'] and 'AuthorList' in paper['MedlineCitation']['Article'] else [],
                    u'mesh_terms':get_keywords_mesh(paper['MedlineCitation']['MeshHeadingList']) if 'MeshHeadingList' in paper['MedlineCitation'] else []
            })

#     # Generate Pandas DataFrame from list of dictionaries
#     articlesPD = pd.DataFrame.from_dict(articleInfo)
#     export_csv = df.to_csv (r'C:\Users\YourUsernam\Desktop\export_dataframe.csv', index = None, header=True) 

#     #Print first 10 rows of dataframe
#     print(articlesPD.head(10))
    return articleInfo

In [2]:
import pandas as pd
File = 'master keywords list.csv'
keywords = pd.read_csv(File,names =['keyword'])['keyword'].to_list()

In [3]:
len(keywords)

49

In [4]:
def get_revelant_mesh(mesh_kewords_related):
    from collections import Counter
    all_mesh_words = []
    for mesh in mesh_kewords_related:
        all_mesh_words.append(mesh[0])
    return list(dict(sorted(Counter(all_mesh_words).items(), key=lambda item: item[1],reverse=True)).keys())[:50]

In [5]:
def related_keywords_mesh(papers):
    mesh_kewords_related = []
    for paper in papers:
        mesh_kewords_related.extend(paper['mesh_terms'])
    mesh_kewords_related = get_revelant_mesh(mesh_kewords_related)
    return mesh_kewords_related

In [None]:
import json
from Bio import Entrez
from tqdm.notebook import tqdm
import time
domain_mesh = {}
for key in tqdm(keywords):
    try:
        papers  = pubMedSearcherBio(f"{key}",10000)
        mesh_extracted = related_keywords_mesh(papers)
        domain_mesh[key] = mesh_extracted
        time.sleep(20)
    except Exception as e:
        print(str(e))
        print(key)
        domain_mesh[key] = []

  0%|          | 0/49 [00:00<?, ?it/s]

[WinError 10054] An existing connection was forcibly closed by the remote host
Protein A


In [7]:
import json
with open('domain_mesh.json','r') as f:
    domain_mesh = json.load(f)

In [58]:
domain_mesh

# Add Labelled Data to Domain Terms

In [59]:
import pandas as pd
File = 'train.csv'
df_train = pd.read_csv(File)
df_train = df_train[['Keywords (separated by ;)','doi']]
df_train

Unnamed: 0,Keywords (separated by ;),doi
0,affinity,10.1002/biot.201600357
1,affinity;intein,10.2174/1389203720666190208110416
2,affinity;Protein A,10.3390/ma9120994
3,affinity;Protein A;electrophoresis,10.1371/journal.pone.0139137
4,affinity;Protein A;Protein L,10.1080/19420862.2019.1662690
5,affinity;VHH,10.1128/aem.02595-14
6,bioconjugation;5MP,10.1021/jacs.7b00670
7,ELP,10.3389/fbioe.2019.00233
8,ELP,10.1016/j.jconrel.2014.06.028
9,ELP,10.1016/j.ab.2011.04.034


In [60]:
train_dataset = {}
for ind,row in df_train.iterrows():
    for key in row['Keywords (separated by ;)'].split(';'):
        if key in train_dataset:
            train_dataset[key].append(row['doi'])
        else:
            train_dataset[key] =[row['doi']]

In [61]:
train_dataset

{'affinity': ['10.1002/biot.201600357',
  '10.2174/1389203720666190208110416',
  '10.3390/ma9120994',
  '10.1371/journal.pone.0139137',
  '10.1080/19420862.2019.1662690',
  '10.1128/aem.02595-14'],
 'intein': ['10.2174/1389203720666190208110416'],
 'Protein A': ['10.3390/ma9120994',
  '10.1371/journal.pone.0139137',
  '10.1080/19420862.2019.1662690',
  '10.1371/journal.pone.0025282',
  '10.1080/19420862.2019.1565749'],
 'electrophoresis': ['10.1371/journal.pone.0139137'],
 'Protein L': ['10.1080/19420862.2019.1662690', '10.1038/s41467-018-05403-1'],
 'VHH': ['10.1128/aem.02595-14'],
 'bioconjugation': ['10.1021/jacs.7b00670'],
 '5MP': ['10.1021/jacs.7b00670'],
 'ELP': ['10.3389/fbioe.2019.00233',
  '10.1016/j.jconrel.2014.06.028',
  '10.1016/j.ab.2011.04.034',
  '10.1002/0471140864.ps0611s61',
  '10.1016/j.jconrel.2015.11.010',
  '10.1021/bm400167h'],
 'membrane': ['10.1073/pnas.2000223117',
  '10.1016/j.jmb.2018.06.038',
  '10.1038/s41467-018-05403-1']}

In [55]:
def keywords_mesh(papers):
    mesh_kewords_related = []
    for paper in papers:
        mesh_kewords_related.extend(paper['mesh_terms'])
    
    all_mesh_words = []
    for mesh in mesh_kewords_related:
        all_mesh_words.append(mesh[0])
        
    return all_mesh_words

In [56]:
def get_train_mesh(train_dataset):
    train_mesh = {}
    for key,values in train_dataset.items():
        train_mesh[key] = []
        for value in values:
            papers  = pubMedSearcherBio(f"{value}",1)
            mesh_extracted = keywords_mesh(papers)
            train_mesh[key].extend(mesh_extracted)
        train_mesh[key] = list(set(train_mesh[key]))
    return train_mesh

In [63]:
train_mesh = get_train_mesh(train_dataset)

In [None]:
train_mesh

In [None]:
# replace or add train mesh into domain terms 
for key,value in train_mesh.items():
    for val in value:
        if val not in domain_mesh:
            domain_mesh[key].append(val)

In [62]:
# save domain mesh
with open('domain_mesh.json','w') as f:
    json.dump(domain_mesh,f)

# Cache articles based on keywords and extracted MeSH 

In [14]:
from tqdm.notebook import tqdm

def extract_articles_cache(search_term,domain_mesh):        
    docs = []
    
    try:
        papers  = pubMedSearcherBio(f"{search_term}",1500)
        docs.extend(papers)
    except Exception as e:
        print(str(e))
        print("Error :",search_term)

    for key in tqdm(domain_mesh[search_term]):
        try:
            papers  = pubMedSearcherBio(f"{search_term} AND {key}",50)
            docs.extend(papers)
        except Exception as e:
            print(str(e))
            print("Error :",f"{search_term} AND {key}")
    
    return docs

In [20]:
cache_articles = {}

In [21]:
from Bio import Entrez
for key in domain_mesh.keys():
    cache_articles[key] = extract_articles_cache(key,domain_mesh) 

  0%|          | 0/89 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

[WinError 10054] An existing connection was forcibly closed by the remote host
Error : membrane AND Cells, Cultured
[WinError 10053] An established connection was aborted by the software in your host machine
Error : membrane AND SARS-CoV-2


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Remote end closed connection without response
Error : virus-like particles AND Animals


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Supplied id parameter is empty.
Error : 5MP AND Sulfhydryl Compounds
Supplied id parameter is empty.
Error : 5MP AND Models, Molecular
Supplied id parameter is empty.
Error : 5MP AND Pyrroles
Supplied id parameter is empty.
Error : 5MP AND Acetolactate Synthase
Supplied id parameter is empty.
Error : 5MP AND Escherichia coli
Supplied id parameter is empty.
Error : 5MP AND Isoenzymes


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

[WinError 10054] An existing connection was forcibly closed by the remote host
Error : liquid-liquid separation AND Organelles
[WinError 10054] An existing connection was forcibly closed by the remote host
Error : liquid-liquid separation AND Chemical Fractionation


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

[WinError 10053] An established connection was aborted by the software in your host machine
Error : ELP AND Phase Transition


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

[WinError 10054] An existing connection was forcibly closed by the remote host
Error : polymer precipitation AND Temperature
[WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
Error : polymer precipitation AND Adsorption
[WinError 10053] An established connection was aborted by the software in your host machine
Error : polymer precipitation AND Microscopy, Electron, Scanning


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

[WinError 10054] An existing connection was forcibly closed by the remote host
Error : IgG structure


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Supplied id parameter is empty.
Error : Gradipore


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Supplied id parameter is empty.
Error : Chreto


0it [00:00, ?it/s]

In [22]:
len(cache_articles)

49

In [34]:
import datetime
from json import JSONEncoder
# subclass JSONEncoder
class DateTimeEncoder(JSONEncoder):
    #Override the default method
    def default(self, obj):
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()

In [91]:
import pickle
with open('cache_articles.json', 'w') as f:
    json.dump(cache_articles, f,indent=4, cls=DateTimeEncoder)

In [51]:
# import json
# import dateutil.parser

# # custom Decoder
# def DecodeDateTime(Dict):
#     if 'publication_date' in Dict and Dict["publication_date"]!=None:
#         Dict["publication_date"] = dateutil.parser.parse(Dict["publication_date"])
#         return Dict
#     else:
#         return Dict

# # use of object_hook
# with open('cache_articles.json', 'r') as f:
#     decoded_cache_articles = json.load(f,object_hook=DecodeDateTime)
# print(len(decoded_cache_articles))

In [71]:
cache_articles.keys()

dict_keys(['affinity', 'membrane', 'TFF', 'continuous processing', 'virus capsid', 'virus-like particles', 'bioconjugation', '5MP', '5MPs', 'crystalization', 'aptamer', 'photoresponsive', 'HCIC', 'thermoresponsive polymer', 'covalent capture', 'ionic liquids', 'liquid-liquid separation', 'Astrea', 'Prometic', 'ELP', 'affinity precipitation', 'Protein A', 'PEG', 'polymer precipitation', 'ATPS', 'magnetic particles', 'droplet forming protein', 'intein', 'IgG structure', 'Protein L', 'charged membrane', 'peptoid', 'PVA', 'PVP', 'monolith', 'novel support', 'VHH', 'precipitation', 'Protein G', 'PAA', 'expanded bed', 'ion exchange', 'membrane chromatography', 'supercritical fluid', 'electrophoresis', 'Gradipore', 'recombinant IgG', 'cyclic peptides', 'Chreto'])