In [105]:
from math import floor
from Bio import Medline
from Bio import Entrez
import pandas as pd
import numpy as np

Entrez.email = 'chee.el@northeastern.edu'

def get_article_count(protein_name):
    '''
    Query PubMed for the number of articles associated with a given protein using both MeSH terms and text words.
    '''
    if isinstance(protein_name, str):
        term = f'{protein_name}[MeSH Terms] OR {protein_name}[tw] AND (UV[tiab] OR Ultraviolet radiation[tiab] OR G4[tiab] OR quadruplex[tiab] OR dna repair[tiab] OR melanoma[tiab])'
        handle = Entrez.esearch(db='pubmed', term=term, retmax=9999)
        record = Entrez.read(handle)
        count = record['Count']
        handle.close()
        print(f'{protein_name}: {record["Count"]} articles')
        handle.close()
        #ccdc9 
    else:
        count = 0
        record = None
    return int(count), record

def search_abstracts(record):
    term_total = 0
    terms = ['UV', 'Ultraviolet radiation', 'G4', 'quadruplex', 'dna repair', 'melanoma']
    id_list = record['IdList']
    for id in id_list:
        handle = Entrez.efetch(db='pubmed', id=id, retmode='text', rettype='medline')
        record = Medline.read(handle)
        try:
            abstract = record['AB']
            for term in terms:
                if term in abstract:
                    term_total += 1
        except Exception as e:
            term_total = term_total
            print('Error finding abstract')
    print('Total terms: ', term_total)
    return term_total

def query_pubmed(proteins):
    '''
    Query PubMed for a list of proteins and return a sorted list of proteins 
    and the number of associated articles.
    '''
    protein_counts = []
    for protein in proteins:
        count, record = get_article_count(protein)
        if count > 0:
            term_total = search_abstracts(record)
        else: 
            term_total = 0
        protein_counts.append((protein, count, term_total))
    return protein_counts

def main(list_proteins):
    # sorted list of proteins and their associated article counts
    protein_counts = query_pubmed(list_proteins)

    # convert to dataframe
    df = pd.DataFrame(protein_counts, columns=['Protein', 'Article Count', 'Abstract Term Total'])

# list of proteins to query
#proteins = ['ZRF1', 'DDB2', 'CST', 'CTC1-STN1-TEN1', 'BRCA1', 'hnRNP A1', 'Pif1']
proteins = ['XIRP2']

#main(proteins)


In [106]:
def main(input_csv, output_csv):

    # read proteins from input CSV
    df = pd.read_csv(input_csv)
    print('Read CSV')
    proteins = df['SYMBOL'].tolist()
    print('Created protein list')
    print(proteins[1])

    # sorted list of proteins and their associated article counts
    protein_counts = query_pubmed(proteins)

    # convert to dataframe
    result_df = pd.DataFrame(protein_counts, columns=['Protein', 'Article Count', 'Abstract Term Total'])
    result_df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")


# list of proteins to query
proteins = ['POT1', 'RPA', 'CST', 'CTC1-STN1-TEN1', 'BRCA1', 'hnRNP A1', 'Pif1']

if __name__ == "__main__":
    input_csv = "proteins_with_uv.csv"
    output_csv = "protein_article_counts3.csv"
    main(input_csv, output_csv)

Read CSV
Created protein list
nan
NBAS: 1 articles
Total terms:  1
XIRP2: 0 articles
PSD: 74 articles
Total terms:  65
RNF222: 0 articles
RIMBP3C: 0 articles
DBX1: 0 articles
WASHC1: 2 articles
Total terms:  1
WASHC1: 2 articles
Total terms:  1


HTTPError: HTTP Error 429: Too Many Requests