In [51]:
import pandas as pd
import numpy as np
import re

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

from nltk.corpus import stopwords


class PdfConverter:
    """Extract plain text from a PDF file using pdfminer."""

    def __init__(self, file_path):
        """Remember the location of the pdf to read.

        Args:
            file_path: Path of the pdf file; it is opened in binary mode by
                convert_pdf_to_txt().
        """
        self.file_path = file_path

    # → Function used to convert pdf to text to allow the post-processing.
    def convert_pdf_to_txt(self, maxpages=1):
        """Return the text of the first `maxpages` pages of the pdf.

        Args:
            maxpages: Upper bound on the number of pages handed to pdfminer.
                Defaults to 1 because the first page is normally where the
                abstract and the introduction are. pdfminer's get_pages()
                treats 0 as "no limit".

        Returns:
            The extracted text as a single string.

        Raises:
            OSError: If the file at `self.file_path` cannot be opened.
        """
        rsrcmgr = PDFResourceManager()
        with StringIO() as retstr:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # → The context manager closes the file even if a page fails to parse.
                with open(self.file_path, 'rb') as fp:
                    pages = PDFPage.get_pages(
                        fp,
                        set(),  # empty set: no explicit page-number filter
                        maxpages=maxpages,
                        caching=True,
                        check_extractable=True,
                    )
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                device.close()
            # → Read the buffer before the enclosing `with` closes it.
            return retstr.getvalue()
        
if __name__ == '__main__':
    # → Put the path of your pdf here :
    pdfConverter = PdfConverter(file_path='sample-papers/4.pdf')

    # → Extract the plain text first, then lower-case it once here so the
    #   later post-processing steps do not have to deal with letter case.
    text = pdfConverter.convert_pdf_to_txt()
    text = text.lower()

In [52]:
# → First post-processing for the text : Deleting all the symbols. [^\w] will match anything that's not alphanumeric or underscore.
text = re.sub(r'[^\w]', ' ', text)

# → Second post-processing for the text. Avoiding stop words using the nltk dictionary.
generalDictionary = stopwords.words('english')
# → Membership is tested once per word; a set makes each test constant-time instead of scanning the whole list.
stopWordSet = set(generalDictionary)
wordsArray = text.split()
generalResult = [word for word in wordsArray if word.lower() not in stopWordSet]
text = ' '.join(generalResult)

# → Third post-processing for the text. Avoiding numbers.
# → The pattern keeps runs made of an ASCII letter followed by at least one more word character,
#   so purely numeric tokens and single-character tokens are dropped.
# → NOTE(review): a token such as '2nd' still contributes its tail ('nd') — confirm this is intended.
notNumbers = re.findall(r'[a-zA-Z]\w+',text)
text = ' '.join(notNumbers)

# → Printing the number of words in the text after the whole post-processing.
print(len(notNumbers))

330


### Method 1

In [53]:
# → Calculating the repetition frequency of each word.
def word_count(str):
    """Tally how often each whitespace-separated token occurs in `str`.

    Args:
        str: The text to analyse (the name shadows the builtin inside this
            function only).

    Returns:
        A dict mapping every distinct token to its number of occurrences,
        with keys in order of first appearance.
    """
    tally = {}
    for token in str.split():
        # → Unseen tokens start from 0, so the first sighting stores 1.
        tally[token] = tally.get(token, 0) + 1
    return tally

# → dictionary that maps each word to its repetition frequency.
d = word_count(text)

# → Words ordered from the most to the least frequent. sorted() is stable, so words with
#   the same frequency keep the iteration order of the dictionary.
array = sorted(d, key=d.get, reverse=True)
# → To see the complete dictionary and its values, print d[w] for every w in array.

# → Number of keywords that we want to show.
desiredKeywords = 10

print ('Keywords:\n')
# → Slicing (rather than indexing up to desiredKeywords) avoids an IndexError when the
#   text contains fewer distinct words than requested.
for i, w in enumerate(array[:desiredKeywords], start=1):
    print (str(i)+' '+w)

Keywords:

1 cores
2 mobile
3 system
4 android
5 energy
6 application
7 processors
8 power
9 use
10 conservation


### Method 2

This is the second method that I propose. It uses the Gensim library. The drawback is that we cannot see how it calculates the keywords, and if you run the code on the same paper you will see that it returns different keywords. (The keywords from the first method are the correct ones, because I checked the repetition frequencies with external software.) So I think my manual method is better than the library, because of the post-processing.

In [54]:
from gensim.summarization import keywords
import warnings
#warnings.filterwarnings("ignore")

In [55]:
# → `split` is a boolean flag in gensim's keywords(), not a separator string, so pass True
#   instead of '\n' (same truthiness). scores=True is what yields (keyword, score) pairs.
values = keywords(text=text, split=True, scores=True)

In [56]:
# → Tabulate the (keyword, score) pairs and rank them from the highest to the lowest score.
data = (
    pd.DataFrame(values, columns=['keyword', 'score'])
    .sort_values('score', ascending=False)
)
# → Last expression of the cell: shows the ten best-scored keywords.
data.head(10)

Unnamed: 0,keyword,score
0,core,0.339256
1,cores,0.339256
2,mobile,0.202357
3,mobiles,0.202357
4,processor,0.200973
5,processors,0.200973
6,use,0.180999
7,designers,0.172344
8,design,0.172344
9,power,0.151012
