In [None]:
import pandas as pd
import numpy as np
import re
import webbrowser

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from nltk.corpus import stopwords
from wand.image import Image as WImage
from IPython.display import Image, display

#### From pdf to text

In [4]:
class PdfConverter:
    """Extract the plain text of a PDF file with pdfminer.

    Attributes:
        file_path: Path of the PDF file to read.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    # → Function used to convert pdf to text to allow the post-processing.
    def convert_pdf_to_txt(self, maxpages=0):
        """Return the text of the PDF as one string.

        Args:
            maxpages: Maximum number of pages to process. The default, 0, is
                handed to pdfminer unchanged, which treats it as "no limit",
                so the whole document is read. Pass 1 to read only the first
                page, which is normally where the abstract and the
                introduction are.

        Returns:
            The extracted text.

        Raises:
            OSError: If ``file_path`` cannot be opened (for example
                ``FileNotFoundError`` when it does not exist).
        """
        # The file is opened first so that a bad path fails before any
        # pdfminer object is built; the context managers guarantee that the
        # file and the text buffer are closed even if a page fails to parse.
        with open(self.file_path, 'rb') as fp, StringIO() as retstr:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # An empty page-number set places no restriction on which pages are read.
                pages = PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            finally:
                device.close()
            # Read the buffer while it is still open; leaving the ``with``
            # block closes it afterwards.
            return retstr.getvalue()
        
if __name__ == '__main__':
    # → Path of the PDF to analyse; change it to point at your own file.
    file_path='sample-papers/11.pdf'
    pdfConverter = PdfConverter(file_path)

    # → Save the cover as cover.png. The "[0]" suffix asks the image library
    #   for page index 0 only, i.e. the first page of the document.
    first_page = file_path+"[0]"
    with WImage(filename=first_page) as img:
        img.save(filename="cover.png")

    # → Plain-text version of the PDF, lower-cased once here so that later
    #   steps do not have to deal with letter case.
    text = pdfConverter.convert_pdf_to_txt().lower()

#### Post-processing phase

In [None]:
# → First post-processing step: replace every symbol with a space.
#   [^\w] matches anything that is not alphanumeric or underscore.
text = re.sub(r'[^\w]', ' ', text)

# → Second post-processing step: drop stop words using the nltk English list.
#   The list is turned into a set so each membership test is constant time
#   instead of a scan over the whole list for every word.
generalDictionary = set(stopwords.words('english'))
wordsArray = text.split()
generalResult = [word for word in wordsArray if word.lower() not in generalDictionary]
text = ' '.join(generalResult)

# → Third post-processing step: drop numbers. Only tokens of two or more
#   characters that START with an ASCII letter are kept. The leading \b pins
#   the match to the start of a token; without it a token such as "2nd" was
#   not removed but leaked the fragment "nd" into the result.
notNumbers = re.findall(r'\b[a-zA-Z]\w+', text)
text = ' '.join(notNumbers)

# → Print the number of words left after the whole post-processing, then the text itself.
print(len(notNumbers))
print(text)

#### Getting the keywords

In [128]:
# → Calculating the repetition frequency of each word.
def word_count(str):
    """Count the occurrences of every whitespace-separated word in ``str``.

    Args:
        str: Text to analyse; it is split on whitespace.

    Returns:
        A dict mapping each distinct word to the number of times it occurs.
        Keys are inserted in order of first appearance.
    """
    tally = {}
    for token in str.split():
        # A word not seen before starts from 0.
        tally[token] = tally.get(token, 0) + 1
    return tally

# → Dictionary that maps each word to its repetition frequency.
d = word_count(text)

# → Every distinct word, ordered from most to least frequent. sorted() is
#   stable, so words with the same frequency keep the dictionary's order.
array = sorted(d, key=d.get, reverse=True)

# → Number of keywords to show.
desiredKeywords = 10

print ('Keywords:\n')
# → The slice stops at the end of the list, so a text with fewer than
#   desiredKeywords distinct words prints what is available instead of
#   raising IndexError.
for i, w in enumerate(array[:desiredKeywords]):
    print (str(i+1)+' '+w)

Keywords:

1 eye
2 flight
3 approach
4 position
5 angle
6 altitude
7 point
8 trials
9 landing
10 airplane


#### Display the PDF cover to compare it with the keywords

In [None]:
# → Show the cover image stored in cover.png.
cover_image = Image(filename='cover.png')
display(cover_image)