In [None]:
import glob
import os
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import pandas as pd
from tqdm.notebook import tqdm

# Directory that is searched (non-recursively) for the `*.pdf` files to analyze.
PDF_DIR = '/med/pdbrepo/pdb_pubmed_pdfs/pdfs'
# Text file that is read as a newline-separated list of phrases to look for.
TECHNIQUES_FILE = 'biochemical_techniques_list.txt'

In [None]:
def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file as one continuous string.

    Args:
        path: Path of the PDF file to read.
        pages: Optional iterable of zero-based page numbers to extract.
            ``None`` (the default) or an empty iterable extracts every page.

    Returns:
        The extracted text with every newline replaced by a space and every
        ``"- "`` sequence removed (presumably to re-join words that were
        hyphenated across line breaks; note this also removes the dash of a
        free-standing ``" - "``).
    """
    # pdfminer treats an empty set as "no page restriction", so that is the
    # fallback when the caller does not ask for specific pages.
    page_numbers = set(pages) if pages is not None else set()

    with StringIO() as output:
        manager = PDFResourceManager()

        with TextConverter(manager, output, laparams=LAParams()) as converter:
            interpreter = PDFPageInterpreter(manager, converter)

            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, page_numbers):
                    interpreter.process_page(page)

        # Read the buffer before the StringIO is closed by the `with` block.
        text = output.getvalue()

    return text.replace("\n", " ").replace("- ", "")


def create_regex_dict_from_phrases(phrase_list):
    """Compile every phrase into a case-insensitive regular expression.

    Each phrase is compiled as a regular expression, so regex syntax inside a
    phrase keeps working. A phrase that is not a valid regular expression
    (for example one with an unbalanced parenthesis) is matched literally
    instead of raising ``re.error`` and aborting the whole analysis.

    Args:
        phrase_list: Iterable of phrase strings.

    Returns:
        Dict mapping each phrase to its compiled pattern, in input order.
    """
    phrase_regex_dict = {}

    for phrase in phrase_list:
        try:
            pattern = re.compile(phrase, re.IGNORECASE)
        except re.error:
            # Not valid regex syntax: fall back to matching the raw text.
            pattern = re.compile(re.escape(phrase), re.IGNORECASE)

        phrase_regex_dict[phrase] = pattern

    return phrase_regex_dict


def check_if_text_contains_phrases(pubmed_id, text, phrase_regex_dict):
    """Report which phrases occur in a text as a one-row DataFrame.

    Args:
        pubmed_id: Identifier stored in the ``"PubMed id"`` column.
        text: Text that is searched.
        phrase_regex_dict: Dict mapping each phrase to the pattern used to
            search for it (a compiled pattern or a pattern string).

    Returns:
        A DataFrame with exactly one row: the ``"PubMed id"`` column, one
        boolean column per phrase (``True`` when the pattern was found), and
        ``"Total number of phrases found"`` holding the number of hits.
    """
    record = {"PubMed id": pubmed_id}
    hit_count = 0

    for phrase, pattern in phrase_regex_dict.items():
        is_hit = re.search(pattern, text) is not None
        record[phrase] = is_hit

        if is_hit:
            hit_count += 1

    record["Total number of phrases found"] = hit_count

    # A list holding one record always yields one row, even when there are
    # no phrases at all; a dict of bare scalars would make pandas raise
    # "If using all scalar values, you must pass an index".
    return pd.DataFrame([record])


def analyze_pdfs(pdfs, phrases_list, max_count=None):
    """Search a list of PDF files for phrases and tabulate the hits.

    Args:
        pdfs: Sequence of PDF file paths. The file name without its
            extension is used as the PubMed id of each file.
        phrases_list: Phrases to search for in each PDF.
        max_count: When not ``None``, only the first ``max_count`` entries of
            ``pdfs`` are processed.

    Returns:
        A DataFrame with one row per processed PDF, or an empty DataFrame
        when no PDF was processed.
    """
    phrase_regex_dict = create_regex_dict_from_phrases(phrases_list)

    if max_count is not None:
        pdfs = pdfs[:max_count]

    # Collect the per-PDF rows and concatenate once at the end:
    # `DataFrame.append` was removed in pandas 2.0, and growing a frame
    # inside a loop copies all earlier rows on every iteration.
    hit_frames = []

    for pdf in tqdm(pdfs):
        pubmed_id = os.path.splitext(os.path.basename(pdf))[0]
        pdf_text = convert_pdf_to_txt(pdf)
        hit_frames.append(
            check_if_text_contains_phrases(pubmed_id, pdf_text, phrase_regex_dict)
        )

    # `pd.concat` rejects an empty list, so keep returning an empty frame.
    if not hit_frames:
        return pd.DataFrame()

    return pd.concat(hit_frames, ignore_index=True)

In [None]:
# glob returns files in arbitrary order; sorting makes any "first N files"
# subset taken from this list the same on every run.
pubmed_pdfs = sorted(glob.glob(f'{PDF_DIR}/*.pdf'))

with open(TECHNIQUES_FILE) as techniques_file:
    # One lower-cased phrase per line. Surrounding whitespace (including a
    # stray "\r" from Windows line endings) is stripped and blank lines are
    # skipped, because an empty phrase would match every text.
    techniques_list = [
        line.strip()
        for line in techniques_file.read().lower().splitlines()
        if line.strip()
    ]

In [None]:
# Analyze at most the first 20 PDFs, then export the per-PDF hit table as CSV.
result_df = analyze_pdfs(
    pdfs=pubmed_pdfs,
    phrases_list=techniques_list,
    max_count=20,
)
result_df.to_csv("pdf_phrases.csv")

In [None]:
# Show the result table as this cell's output.
result_df