In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
import os
import json
import re
import pytextrank
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def convert_pdf_to_txt(path, pageno=None):
    """Extract the text of a PDF file with pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.
    pageno : collection of int, optional
        Zero-based indices of the pages to extract. When omitted or empty,
        the whole document is extracted.

    Returns
    -------
    dict or list
        Whole-document mode (no ``pageno``): a dict with the keys
        ``"filename"`` (the given path), ``"data"`` (text of all pages
        concatenated) and ``"title"`` (the lower-cased, stripped text found
        between "report on" and "submitted" on the first page).
        Page-selection mode: a list holding the text of each requested page,
        in document order.

    Raises
    ------
    Exception
        In whole-document mode, when the first page does not contain the
        "report on ... submitted" phrase the title is taken from.
    """
    if pageno is None:
        pageno = []
    whole_document = len(pageno) == 0

    if whole_document:
        all_pages_data = {"filename": path, "data": "", "title": ""}
    else:
        all_pages_data = []

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())

    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page set and maxpages=0 ask pdfminer for every page.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)

            for page_index, page in enumerate(pages):
                interpreter.process_page(page)
                data = retstr.getvalue()

                # The title only exists in the dict result, so it is looked up
                # in whole-document mode only; a list result has nowhere to
                # store it.
                if whole_document and page_index == 0:
                    result = re.search("report on (.*) submitted",
                                       data.replace("\n", " ").lower())
                    if result is None:
                        raise Exception("Error in finding project name")
                    all_pages_data["title"] = result.group(1).strip()

                if whole_document:
                    all_pages_data["data"] += data
                elif page_index in pageno:
                    all_pages_data.append(data)

                # Empty the shared buffer so the next getvalue() returns only
                # the text of the next page.
                retstr.truncate(0)
                retstr.seek(0)
    finally:
        # Release the converter and the buffer even when a page fails.
        device.close()
        retstr.close()

    return all_pages_data

In [3]:
def get_stopwords():
    """Build the stopword list used when cleaning report text.

    Combines the entries of ``long_stopwords.txt`` (split on newlines, read
    from the current working directory) with NLTK's English stopwords and
    removes duplicates.

    Returns
    -------
    list of str
        The de-duplicated stopwords. The order is unspecified because the
        values pass through a set.
    """
    with open("long_stopwords.txt", "r") as handle:
        file_words = handle.read().split("\n")

    nltk_words = list(stopwords.words("english"))
    combined = file_words + nltk_words
    return list(set(combined))

In [4]:
# Module-level resources shared by the cleaning and extraction functions.
stopwords_list = get_stopwords()
lemmatizer = WordNetLemmatizer()

# Rule-based lookup tables; extract_knowledge reads the keys "languages",
# "frameworks", "dbs" and "libraries". The context manager closes the file
# handle as soon as the JSON is loaded instead of leaving it open.
with open("./knowledge_base.json") as kb_file:
    knowledge_base = json.load(kb_file)

# Directory holding the PDF reports. It is concatenated directly with file
# names, so the trailing slash is required.
reports = "./reports_doc/"

In [5]:
# Load the `en_core_web_md` spaCy model and append pytextrank's TextRank
# component as the last pipeline step; parse_docs reads the resulting ranked
# phrases from `doc._.phrases`.
# NOTE(review): `pytextrank.TextRank()` + `tr.PipelineComponent` looks like the
# pytextrank 2.x / spaCy 2.x registration style - confirm the pinned versions.
nlp = spacy.load("en_core_web_md")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
# Collects one entry per parsed report; main() fills it and dumps it to JSON.
output_dict = {}

In [6]:
def clean(text):
    """Normalise raw report text into a space-separated string of lemmas.

    The text is lower-cased, commas and colons become spaces, characters
    outside ``string.printable`` are dropped, and the result is tokenised
    with NLTK. Punctuation tokens are removed, the remaining tokens are
    lemmatised, and tokens found in ``stopwords_list`` are filtered out.

    Parameters
    ----------
    text : object
        The text to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Coerce before any string method is called so that non-string input
    # does not fail on .replace().
    text = str(text).replace(",", " ").replace(":", " ").lower()

    printable = set(string.printable)
    text = "".join(ch for ch in text if ch in printable)

    tokens = word_tokenize(text)
    # NOTE: this is a substring test against the punctuation string, so a
    # multi-character token such as "()" is dropped as well.
    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Set membership keeps the stopword test cheap on long documents.
    stopword_set = set(stopwords_list)
    return " ".join(word for word in tokens if word not in stopword_set)

In [7]:
def extract_knowledge(text):
    """Find the knowledge-base entries that occur as tokens in ``text``.

    The text is tokenised with NLTK and every token is checked against the
    ``"languages"``, ``"frameworks"``, ``"dbs"`` and ``"libraries"`` entries
    of ``knowledge_base``.

    Returns
    -------
    dict
        Maps each of the four category names to a list of the distinct
        matching tokens (order unspecified).
    """
    words = word_tokenize(text)
    found = {}
    for category in ("languages", "frameworks", "dbs", "libraries"):
        matches = [word for word in words if word in knowledge_base[category]]
        found[category] = list(set(matches))
    return found

In [8]:
def filter_numbers(x):
    """Return ``str(x)`` with everything but ASCII letters and spaces removed."""
    as_text = str(x)
    letters_and_spaces = re.sub("[^A-Za-z ]", "", as_text)
    return letters_and_spaces

In [9]:
def _select_ngram_keywords(phrase_texts, min_chars=5, min_phrases=3):
    """Reduce ranked phrases to the multi-word n-grams that several of them share.

    Parameters
    ----------
    phrase_texts : iterable of str
        Phrase texts produced by the TextRank pipeline component.
    min_chars : int
        Cleaned phrases shorter than this many characters are dropped.
    min_phrases : int
        Passed to ``TfidfVectorizer`` as ``min_df``: an n-gram is kept only
        when it occurs in at least this many phrases.

    Returns
    -------
    list of str
        The 2- and 3-word n-grams of the fitted vocabulary.

    Raises
    ------
    ValueError
        Raised by scikit-learn when no term survives the pruning.
    """
    cleaned = []
    for phrase in phrase_texts:
        # Remove ID-like tokens ("b" followed by nine digits) BEFORE digits
        # are stripped; once filter_numbers has run the pattern cannot match.
        keyword = re.sub("b[0-9]{9}", "", str(phrase))
        keyword = filter_numbers(keyword).strip()
        if len(keyword) >= min_chars:
            cleaned.append(keyword)

    tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 3),
                         min_df=min_phrases, stop_words="english")
    tf.fit(cleaned)

    # get_feature_names() is gone from current scikit-learn; fall back to it
    # only when the installed release predates get_feature_names_out().
    feature_names = getattr(tf, "get_feature_names_out", None) or tf.get_feature_names
    return [str(f) for f in feature_names() if len(f.split(" ")) > 1]


def parse_docs(file):
    """Extract the title, cleaned text, keywords and known technologies of one report.

    Parameters
    ----------
    file : str
        File name of the PDF inside the ``reports`` directory.

    Returns
    -------
    dict
        Keys: ``"title"``, ``"filename"``, ``"data"`` (cleaned text),
        ``"keywords"`` (multi-word n-grams) and ``"libraries"``,
        ``"languages"``, ``"frameworks"``, ``"dbs"`` (knowledge-base matches).

    Raises
    ------
    Exception
        When any step fails; the underlying error is attached as the cause.
    """
    try:
        # Read the PDF and normalise its text
        processed_dict = convert_pdf_to_txt(reports + file)
        text = clean(processed_dict["data"])

        # Extracting knowledge by rules
        knowledge_extracted = extract_knowledge(text)

        # Text rank for keyword extraction, then keep up to 3-grams
        doc = nlp(text)
        ngrams = _select_ngram_keywords(p.text for p in doc._.phrases)

        # Collect all the information
        return {
            "title": processed_dict["title"],
            "filename": processed_dict["filename"],
            "data": text,
            "keywords": ngrams,
            "libraries": knowledge_extracted["libraries"],
            "languages": knowledge_extracted["languages"],
            "frameworks": knowledge_extracted["frameworks"],
            "dbs": knowledge_extracted["dbs"]
        }
    except Exception as exc:
        # Chain the cause so the reason a report was rejected stays visible.
        raise Exception("Something went wrong while parsing {}".format(file)) from exc
    

In [10]:
def main():
    """Parse every entry of the reports directory and write ``./dataset_1.json``.

    Each successfully parsed report is stored in ``output_dict`` under the key
    ``reports + file``. Progress and a final summary are printed. After the
    JSON file is written, every file that failed to parse is deleted from the
    reports directory.
    """
    files = os.listdir(reports)
    discarded_files = []

    for file in files:
        try:
            output_dict[reports + file] = parse_docs(file)
        except Exception as exc:
            # Only real errors land here; KeyboardInterrupt/SystemExit now
            # propagate, so interrupting the run no longer marks the current
            # file for deletion.
            discarded_files.append(file)
            print("Error in parsing {}: {!r}".format(file, exc.__cause__ or exc))
        else:
            print("Parsing file {} is done".format(file))

    print("Total files: {}".format(len(files)))
    print("Discarded files: {}".format(len(discarded_files)))

    # Context manager flushes and closes the output file deterministically.
    with open("./dataset_1.json", "w") as out_file:
        json.dump(output_dict, out_file)

    # WARNING: destructive - reports that could not be parsed are removed
    # from disk. Directories returned by os.listdir are skipped because
    # os.remove cannot delete them.
    for file in discarded_files:
        target = reports + file
        if os.path.isfile(target):
            os.remove(target)
    
    
    

In [11]:
%%time
# Run the whole pipeline; the cell magic above reports the wall time of the run.
# Note: main() deletes from the reports directory any file it failed to parse.
main()

Parsing file 1.pdf is done
Parsing file 102.pdf is done
Parsing file 103.pdf is done
Parsing file 18.pdf is done
Parsing file 22.pdf is done
Parsing file 24.pdf is done
Parsing file 25.pdf is done
Parsing file 29.pdf is done
Parsing file 2D TO 3D IMAGE CONVERSION USING_MACHINE LEARNING APPROACH.pdf is done
Parsing file 34.pdf is done
Parsing file 35.pdf is done
Parsing file 38.pdf is done
Parsing file 3D-image terrain classification using machine learning_46.pdf is done
Parsing file 50.pdf is done
Parsing file 52.pdf is done
Parsing file 54.pdf is done
Parsing file 56.pdf is done
Parsing file 76.pdf is done
Parsing file 85.pdf is done
Parsing file 90.pdf is done
Parsing file 91.pdf is done
Parsing file 93.pdf is done
Parsing file 97.pdf is done
Parsing file 99.pdf is done
Parsing file A CRYPTOSYSTEM FOR ENCRYPTION OF_DATA USING IMAGES FOR KEY_GENERATION.pdf is done
Parsing file A fast distributed key-value store based on_the RAFT protocol and Judy arrays_37.pdf is done
Parsing file A N

Parsing file ERP SYSTEM FOR A FLEXIBLE TRANSPORTATION_SERVICE_30.pdf is done
Parsing file ETHEREUM BASED BLOCKCHAIN_IMPLEMENTATION OF PEER REVIEW_SYSTEM.pdf is done
Parsing file Event_Driven_Process_Orchestration_59.pdf is done
Parsing file Exploring alternative approaches to control forwarding path of packets in virtual networks_4.pdf is done
Parsing file Expressive English Text-to-Speech Synthesis System_7.pdf is done
Parsing file Face Recognition in Videos.pdf is done
Parsing file FASHION RECOMMENDATION AND_DESIGN USING MACHINE LEARNING.pdf is done
Parsing file Fast and efficient compression method for real time systems_41.pdf is done
Parsing file final BE report.pdf is done
Parsing file final BE report1.pdf is done
Parsing file final Be Report2.pdf is done
Parsing file Final full report.pdf is done
Parsing file Final gts.pdf is done
Parsing file Final Report(Group No 95).pdf is done
Parsing file FOOD QUALITY INSPECTION USING_COMPUTER VISION.pdf is done
Parsing file Galaxy Morpholog

Parsing file MACHINE LEARNING AGENT FOR GAME_DEVELOPMENT.pdf is done
Parsing file Machine Learning Based Network Traffic Classification.pdf is done
Parsing file MANAGING LAND OWNERSHIP USING_BLOCKCHAIN.pdf is done
Parsing file Marathi translation using wsd concept_36.pdf is done
Parsing file META DATA AND CONTENT BASED YOUTUBE_VIDEO TAG GENERATION.pdf is done
Parsing file METAAPP A FRAMEWORK FOR INCOGNITO_MODE OF SMARTPHONE APPLICATIONS_25.pdf is done
Parsing file Metering and Throttling for Remedy_Applications_76.pdf is done
Parsing file MICRO VIDEO RECOMMENDATION_SYSTEM FOR CONTENT WRITER.pdf is done
Parsing file Multi-document Abstractive Summarization_1.pdf is done
Parsing file Multikeyword ranked search over encrypted data.pdf is done
Parsing file Music Discovery Using Content Based Paradigms.pdf is done
Parsing file N-SCALAR.pdf is done
Parsing file Natural Language Description of Videos_46.pdf is done
Parsing file negative news no more_42.pdf is done
Parsing file NETWORK EDGE CO

Parsing file vCom-Volunteer Computing for High Performance_26.pdf is done
Parsing file VEHICULAR SECURITY SYSTEM_3.pdf is done
Parsing file VHDX File Recovery_54.pdf is done
Parsing file VIDEO BASED TRAFFIC CONTROL SYSTEM_77.pdf is done
Parsing file Video Summarization using clustering techniques.pdf is done
Parsing file Video Summarization_31.pdf is done
Parsing file Virtual Arts Gallery_75.pdf is done
Parsing file Virtual Lab for Electronics_29.pdf is done
Parsing file VIRTUAL MACHINE COMMUNICATION_ANALYSIS.pdf is done
Parsing file Visual Question Answering Using Deep_Learning_30.pdf is done
Parsing file Visualization As A Sevice_11.pdf is done
Parsing file VISUALLY IMPAIRED ASSISTANT SYSTEM.pdf is done
Parsing file Volunteer Automation For NGO_79.pdf is done
Parsing file Web_Vulnerability Scanner.pdf is done
Parsing file ___BREAST CANCER DETECTION AND_DIAGNOSIS USING MACHINE LEARNING___.pdf is done
Total files: 419
Discarded files: 0
Wall time: 28min 19s
