In [7]:
import glob
import os
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import pandas as pd
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

PDF_DIR = '/med/pdbrepo/pdb_pubmed_pdfs/pdfs'
TECHNIQUES_FILE = 'biochemical_techniques_list.txt'

In [14]:
def convert_pdf_to_txt(path, pages=None):
    with StringIO() as output:
        manager = PDFResourceManager()
        
        with TextConverter(manager, output, laparams=LAParams()) as converter:
            interpreter = PDFPageInterpreter(manager, converter)

            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, set()):
                    interpreter.process_page(page)

        text = output.getvalue()
    
    return text.replace("\n", " ").replace("- ", "")


def create_regex_dict_from_phrases(phrase_list):
    phrase_regex_dict = {}
    
    for phrase in phrase_list:
        phrase_regex_dict[phrase] = re.compile(phrase, re.IGNORECASE)
        
    return phrase_regex_dict


def check_if_text_contains_phrases(pubmed_id, text, phrase_regex_dict):
    hit_dict = {}
    hit_dict["PubMed id"] = pubmed_id
    hit_sum = 0
    
    for phrase in phrase_regex_dict.keys():
        is_hit = re.search(phrase_regex_dict[phrase], text) is not None
        hit_dict[phrase] = [is_hit]
        
        if is_hit:
            hit_sum += 1
        
    hit_dict["Total number of phrases found"] = hit_sum
    hit_df = pd.DataFrame(hit_dict)
    hit_df = hit_df.set_index("PubMed id")
        
    return hit_df


def analyze_pdf(pdf, phrase_regex_dict):
    pubmed_id = pdf.split('/')[-1][:-4]
    pdf_text = convert_pdf_to_txt(pdf)
    hit_df = check_if_text_contains_phrases(pubmed_id, pdf_text, phrase_regex_dict)
    
    return hit_df


def analyze_pdfs(pdfs, phrases_list, max_count=None, n_jobs=1):
    result_df = pd.DataFrame()
    phrase_regex_dict = create_regex_dict_from_phrases(phrases_list)

    if max_count is not None and max_count != -1:
        pdfs = pdfs[:max_count]
    
    hit_dfs = Parallel(n_jobs=n_jobs)(delayed(analyze_pdf)(pdf, phrase_regex_dict) 
                                      for pdf in tqdm(pdfs))

    return pd.concat(hit_dfs)

In [15]:
pubmed_pdfs = glob.glob(f'{PDF_DIR}/*.pdf') 
with open(TECHNIQUES_FILE) as file:
    techniques_list = file.read().rstrip().lower().split('\n')

In [None]:
result_df = analyze_pdfs(pubmed_pdfs, techniques_list, max_count=-1, n_jobs=6)
result_df.to_csv("pdf_phrases.csv")

HBox(children=(FloatProgress(value=0.0, max=6279.0), HTML(value='')))

In [11]:
result_df

Unnamed: 0_level_0,abts,acid guanidinium thiocyanate-phenol-chloroform extraction,alsever's solution,ammonium sulfate precipitation,antibodies from lymphocyte secretions,bacterial display,bicinchoninic acid assay,bioorthogonal chemical reporter,biopanning,blosum,...,surface-enhanced laser desorption/ionization,tcp-seq,temperature gradient gel electrophoresis,terminal restriction fragment length polymorphism,trizol,trolox equivalent antioxidant capacity,turbidimetric inhibition immunoassay,xdna,yeast display,Total number of phrases found
PubMed id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15809300,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
9651355,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
31345930,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
16634633,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
16531242,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
31201325,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,3
28266846,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
19934036,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
31836717,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2
25973989,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
