In [3]:
pip install PyMuPDF==1.16.14 fitz --quiet

[K     |████████████████████████████████| 5.7 MB 8.5 MB/s 
[?25h

In [47]:
# Path of the PDF to parse, followed by the libraries used by the cells below.
filepath = '/content/test.pdf'
from operator import itemgetter
import fitz  # module provided by the PyMuPDF package
import json
import re

In [48]:
# To identify fonts
def fonts(doc, granularity=False):
    """Collect the font styles used in a document and how often each occurs.

    Every span of every text block (``type == 0``) on every page is visited
    through ``page.getText("dict")``.

    :param doc: iterable of pages, each offering ``getText("dict")``
    :param granularity: when true, spans are told apart by size, flags, font
        and color; otherwise by size alone
    :return: ``(font_counts, styles)`` where ``font_counts`` is a list of
        ``(identifier, count)`` tuples ordered from most to least used, and
        ``styles`` maps each identifier to the style of the last span seen
        with that identifier
    :raises ValueError: if no text span was found at all
    """
    style_by_id = {}
    usage = {}

    for page in doc:
        for block in page.getText("dict")["blocks"]:
            if block['type'] != 0:  # not a text block
                continue

            # flatten lines -> spans so each span is handled in reading order
            for span in (sp for line in block["lines"] for sp in line["spans"]):
                if granularity:
                    key = "{0}_{1}_{2}_{3}".format(
                        span['size'], span['flags'], span['font'], span['color'])
                    style_by_id[key] = {
                        'size': span['size'],
                        'flags': span['flags'],
                        'font': span['font'],
                        'color': span['color'],
                    }
                else:
                    key = "{0}".format(span['size'])
                    style_by_id[key] = {'size': span['size'], 'font': span['font']}

                usage[key] = usage.get(key, 0) + 1  # tally usage per identifier

    # most used identifier first; ties keep first-seen order (stable sort)
    ranked = sorted(usage.items(), key=lambda pair: pair[1], reverse=True)

    if not ranked:
        raise ValueError("Zero discriminating fonts found!")

    return ranked, style_by_id


# To identify font tags
def font_tags(font_counts, styles):
    """Map every font size in use to a coarse structural tag.

    The size of the most used style (``font_counts[0]``) is treated as the
    paragraph size and tagged ``'<p>'``; larger sizes are tagged ``'<h>'``
    and smaller sizes ``'<s>'``.

    :param font_counts: list of ``(identifier, count)`` tuples, most used
        first, as returned by ``fonts()``
    :param styles: dict mapping each identifier to a style dict that has a
        ``'size'`` entry, as returned by ``fonts()``
    :return: dict mapping font size (float) to its tag
    :raises IndexError: if ``font_counts`` is empty
    """
    p_size = styles[font_counts[0][0]]['size']  # size of the most used style

    def _size_of(identifier):
        # Read the size from the style record instead of parsing the
        # identifier: granular identifiers look like "size_flags_font_color"
        # and cannot be converted with float().
        style = styles.get(identifier)
        if style is not None:
            return float(style['size'])
        return float(identifier)  # size-only identifier without a style entry

    # distinct sizes, largest first
    font_sizes = sorted({_size_of(identifier) for identifier, _count in font_counts},
                        reverse=True)

    size_tag = {}
    for size in font_sizes:
        if size == p_size:
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h>'
        elif size < p_size:
            size_tag[size] = '<s>'

    return size_tag


def headers_para(doc, size_tag):
    """Build tagged text strings from the spans of a document.

    Spans that are empty after stripping are ignored. A run of spans sharing
    one font size is concatenated (space separated) into a single string
    prefixed with that size's tag; a ``|`` is appended after every line of a
    block. The current string is emitted whenever the font size changes and
    at the end of every text block, so the result may contain empty or
    pipe-only strings.

    :param doc: iterable of pages, each offering ``getText("dict")``
    :param size_tag: dict mapping a span's ``'size'`` to its tag string
    :return: list of tagged strings in reading order
    :raises KeyError: if a span's size has no entry in ``size_tag``
    """
    header_para = []  # list with headers and paragraphs
    first = True  # True until the first non-blank span has been seen
    previous_s = {}  # previous non-blank span

    def _tagged(span):
        # text of a span prefixed with the tag of its font size
        return size_tag[span['size']] + span['text']

    for page in doc:
        for b in page.getText("dict")["blocks"]:
            if b['type'] != 0:  # only text blocks carry lines/spans
                continue

            # NOTE: multiple fonts and sizes are possible within one block
            block_string = ""  # text collected for the current run
            for l in b["lines"]:
                for s in l["spans"]:
                    if not s['text'].strip():  # skip whitespace-only spans
                        continue

                    if first:
                        first = False
                        block_string = _tagged(s)
                    elif s['size'] != previous_s['size']:
                        # font size changed: emit the run collected so far
                        header_para.append(block_string)
                        block_string = _tagged(s)
                    elif block_string.strip("|") == "":
                        # A new block has started, or only line markers were
                        # collected so far: start a fresh tagged string. This
                        # is a single branch so the span text is added once
                        # (it used to be appended a second time when the
                        # string held only pipes).
                        block_string = _tagged(s)
                    else:
                        # same size within the same run: concatenate
                        block_string += " " + s['text']

                    previous_s = s

                # end of a line, indicated with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para


def main(document=None):
    """Extract the tagged header/paragraph strings from a PDF.

    :param document: path of the PDF to open; when omitted, the module-level
        ``filepath`` is used
    :return: list of tagged strings produced by ``headers_para()``
    """
    if document is None:
        document = filepath

    doc = fitz.open(document)
    try:
        # size-only identifiers are enough to tell headers from paragraphs
        font_counts, styles = fonts(doc, granularity=False)
        size_tag = font_tags(font_counts, styles)
        return headers_para(doc, size_tag)
    finally:
        # release the underlying file even when parsing raises
        doc.close()


if __name__ == '__main__':
    # `text` is the list of tagged strings returned by main() at this point.
    text=main()

In [49]:
# Function to convert  
def listToString(s):
    """Return the strings in ``s`` concatenated with a single space between them."""
    return " ".join(s)

# `text` starts out as the list returned by main(). Join it only in that case:
# re-running this cell on the already-joined string would otherwise insert a
# space between every character.
if not isinstance(text, str):
    text = listToString(text)

In [50]:
def pre_process1(text):
    """Normalise extracted text into one lowercase, ASCII-only line.

    Steps, in order:
      1. lowercase the text;
      2. replace URLs (``http...``), the stand-alone tokens ``rt`` and ``cc``,
         hashtags (``#...``) and mentions (``@...``) with a space -- the
         mention rule also strips the ``@domain`` part of e-mail addresses;
      3. replace the ``|`` line markers and the ``<s>``/``<p>``/``<h>`` tags
         with a space;
      4. replace every non-ASCII character with a space;
      5. collapse each whitespace run (newlines included) to one space.

    :param text: raw text, e.g. the joined output of the extraction step
    :return: cleaned text; a single leading/trailing space may remain
    """
    text = text.lower()

    text = re.sub(r'http\S+\s*', ' ', text)
    # The text is already lowercase, so the tokens are matched in lowercase,
    # and only as whole words so that words such as "account" stay intact.
    text = re.sub(r'\b(?:rt|cc)\b', ' ', text)
    text = re.sub(r'#\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)

    for marker in ('|', '<s>', '<p>', '<h>'):
        text = text.replace(marker, ' ')

    text = re.sub(r'[^\x00-\x7f]', ' ', text)

    # Newlines are handled here as ordinary whitespace, so words on adjacent
    # lines end up separated by a space instead of being glued together.
    return re.sub(r'\s+', ' ', text)

In [51]:
# Clean the joined text with pre_process1 (the result replaces `text`).
text = pre_process1(text)

In [52]:
# Display the cleaned text.
# NOTE(review): the rendered output below contains personal contact details
# (name, phone number, profile links); clear it before sharing the notebook.
text

' ayush srivastava web developer final year b.tech student with 3+ years of experience in building web applications, college projects, freelancing, and contributing to open source softwares. jss boys hostel, c block, sector - 62, noida. (+91) 9599025432 srivastavs61 linkedin.com/in/geekayush github.com/geekayush geekayush.github.io experience 1mg software engineer intern jan 2020 - present avanti learning centres data visualization analyst jul 2019 - aug 2019 responsibilities task 1 - report card generator task 2 - automate student summary generation in bulk motion invite ui/ux & front-end developer jan 2018 - mar 2018 responsibilities task 1 - redesign the existed website task 2 - develop the redesigned version edcams front-end developer nov 2017 - jan 2018 responsibilities task 1 - fix the bugs task 2 - develop a new sign up portal education jssate, noida b.tech 2016 - present average percentage - 74.20% coursework included algorithms data structures sql based dbms turing machines os