# File parsing and Keyword Extraction (Phase 1)

#### 1) Input: 
   Raw documents stored in folder (reports).
#### 2) Output: 
   dataset.json file
#### 3) Algorithm
   a) Read the PDF files and extract the abstract and project-implementation sections using simple page-level keyword checks.<br>
   b) Use text preprocessing techniques to filter text. <br>
   c) Use rake-nltk to extract keywords with the RAKE (Rapid Automatic Keyword Extraction) algorithm.<br>
   d) Create dictionary and dump it in file named dataset.json

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import nltk
from nltk import word_tokenize
import string
import re
from nltk.stem import WordNetLemmatizer
import numpy as np
import math
import os
from itertools import zip_longest
import json
import pandas as pd
import time
from gensim.summarization.keywords import keywords
from rake_nltk import Rake, Metric
from nltk.corpus import stopwords
from multiprocessing import Pool

In [2]:
# Directory names used by the notebook. `reports` is the folder whose PDF files
# are listed and parsed; `abstracts` and `models` are defined here but are not
# referenced by the cells visible in this notebook.
reports, abstracts, models = "reports/comp", "abstract/", "model/"

In [3]:
# Module-level containers: `processed_dict` is filled per file name while
# documents are parsed; `clustered_keywords` starts out empty as well.
processed_dict, clustered_keywords = {}, {}

In [4]:
def get_title_project(text):
    """Build the project title from the text of a report's first page.

    The first line of ``text`` is always discarded and empty lines are
    ignored. The remaining lines are collected until a line mentioning
    "savitribai phule" (case-insensitive) is reached; that line and everything
    after it are left out.

    Args:
        text: Page text with lines separated by ``"\\n"``.

    Returns:
        The collected lines joined by single spaces, or ``""`` when nothing
        was collected.
    """
    title_parts = []
    for line in text.split("\n")[1:]:
        if not line:
            continue
        if "savitribai phule" in line.lower():
            # The university name marks the end of the title block.
            break
        title_parts.append(line)
    return " ".join(title_parts)

In [5]:
def get_abstract_project(text):
    """Return the abstract page text as a single line, minus its first line.

    Args:
        text: Page text with lines separated by ``"\\n"``.

    Returns:
        Every line after the first, joined by single spaces.
    """
    _first_line, *remaining = text.split("\n")
    return " ".join(remaining)

In [6]:
def convert_to_float(x):
    """Convert ``x`` to ``numpy.float64``, falling back to ``0``.

    Args:
        x: Value to convert (for example a number or a numeric string).

    Returns:
        ``np.float64(x)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return np.float64(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to 0; a bare ``except`` would also hide
        # KeyboardInterrupt and SystemExit.
        return 0

In [7]:
def convert_pdf_to_txt(path):
    """Extract the title, abstract and implementation text from a report PDF.

    Pages are converted to text one at a time and inspected in order:

    * page index 0 supplies ``"title"`` through ``get_title_project``;
    * the page at index ``abstract_page_no`` (initially 3) supplies
      ``"abstract"`` through ``get_abstract_project``; if that page mentions
      "acknowledgement" the expected abstract index moves one page further;
    * once a page at or after the abstract contains both "introduction" and
      "chapter", the text of every following page is appended to
      ``"implementation"``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        dict: Contains whichever of the keys ``"title"``, ``"abstract"`` and
        ``"implementation"`` could be filled in; keys are absent when the
        corresponding section was not detected.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # NOTE(review): this flag is never set to True anywhere in the function, so
    # the "references" early exit below never fires. It is left as-is because
    # enabling it would change the extracted text (e.g. "preferences" also
    # contains "references") -- confirm the intended stop condition.
    implementation_found = False
    abstract_found = False
    introduction_found = False
    abstract_page_no = 3

    new_dict = {}

    try:
        # The context manager closes the file even if a page fails to parse.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for pageNo, page in enumerate(pages):
                interpreter.process_page(page)
                data = retstr.getvalue()
                lowered = data.lower()

                if pageNo == 0:
                    new_dict["title"] = get_title_project(data)

                if pageNo == abstract_page_no:
                    if "acknowledgement" in lowered:
                        # Acknowledgement page sits where the abstract was
                        # expected; look at the next page instead.
                        abstract_page_no += 1
                    else:
                        abstract_found = True
                        new_dict["abstract"] = get_abstract_project(data)

                if "references" in lowered and implementation_found:
                    break

                # Checked before the introduction test below, so the page that
                # contains the introduction heading itself is not collected.
                if introduction_found:
                    if "implementation" in new_dict:
                        new_dict["implementation"] += data
                    else:
                        new_dict["implementation"] = data

                if "introduction" in lowered and "chapter" in lowered and abstract_found:
                    introduction_found = True

                # Reset the shared buffer so the next page starts empty.
                retstr.truncate(0)
                retstr.seek(0)
    finally:
        device.close()
        retstr.close()

    return new_dict

In [8]:
def init():
    """Load the stopword and punctuation lists used for keyword extraction.

    Reads ``long_stopwords.txt`` (one entry per line) from the working
    directory and merges it with NLTK's English stopwords, dropping
    duplicates.

    Returns:
        tuple: ``(stopwords_list, punctuations)`` where ``stopwords_list`` is
        the de-duplicated list of stopwords and ``punctuations`` is the list
        of individual characters in ``string.punctuation``.
    """
    with open("long_stopwords.txt", "r") as handle:
        custom_words = handle.read().split("\n")

    english_words = list(stopwords.words("english"))
    merged_words = set(custom_words + english_words)

    return (list(merged_words), list(str(string.punctuation)))

In [9]:
# Build the shared stopword/punctuation lists once, then configure the RAKE
# extractor that parse_doc uses for every document.
(stopwords_list, punctuations) = init()
rake = Rake(
    stopwords=stopwords_list,
    punctuations=punctuations,
    max_length=100000,
    ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
)

In [10]:
def clean(text):
    """Lower-case ``text`` and drop every character not in ``string.printable``.

    Args:
        text: Raw text to normalise.

    Returns:
        The lower-cased text containing only printable ASCII characters.
    """
    allowed = frozenset(string.printable)
    # Lower-casing happens first, then non-printable characters are removed.
    return "".join(ch for ch in text.lower() if ch in allowed)

In [11]:
def parse_doc(filename):
    """Parse one report PDF and attach RAKE keywords to its entry.

    The PDF at ``reports + "/" + filename`` is converted with
    ``convert_pdf_to_txt`` and stored in the module-level ``processed_dict``
    under ``filename``. The abstract and implementation text are then cleaned,
    passed to the shared ``rake`` extractor, and the result is stored under the
    entry's ``"keywords"`` key. Failures in either step are reported with
    ``print`` and do not raise, so one bad file does not abort a batch.

    Args:
        filename: Name of a file inside the ``reports`` directory.

    Returns:
        dict: The module-level ``processed_dict`` itself (not just the entry
        for ``filename``).
    """
    try:
        print("Parsing file: {} started".format(filename))
        processed_dict[filename] = convert_pdf_to_txt(reports + "/" + filename)
    except Exception:
        # Best-effort per file; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Error in parsing file {}".format(filename))

    try:
        entry = processed_dict[filename]
        text = clean(entry["abstract"] + entry["implementation"])
        rake.extract_keywords_from_text(text)
        # NOTE(review): rake-nltk documents this call as returning
        # (score, phrase) pairs, so this dict would be keyed by score and
        # phrases sharing a score would overwrite each other. Left unchanged
        # because the dataset.json consumers are not visible here -- confirm.
        entry["keywords"] = dict(rake.get_ranked_phrases_with_scores())
        print("Parsing file: {} completed".format(filename))
    except Exception:
        # Reached when the entry, "abstract" or "implementation" is missing,
        # or when keyword extraction itself fails.
        print("Keyerror implementation in file {}".format(filename))

    return processed_dict

In [12]:
%%time
def main():
    """Run phase 1: parse every report and write the results to dataset.json.

    Lists the files in the ``reports`` directory, maps ``parse_doc`` over them
    with a pool of 4 worker processes, and writes the JSON-encoded list of
    results to ``dataset.json`` in the working directory.
    """
    print("\n==============Welcome to phase 1============\n")

    file_array = os.listdir(reports)

    # The context manager releases the workers even if map() raises.
    with Pool(processes=4) as pool:
        # One list element per file; each element is whatever parse_doc
        # returned in its worker process.
        results = pool.map(parse_doc, file_array)
        pool.close()
        pool.join()

    # Serialise before opening so a JSON error does not truncate the file.
    json_string = json.dumps(results)

    with open("dataset.json", "w") as fp:
        fp.write(json_string)

    print("\n===========Phase 1 ended==========\n")

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 15.5 µs


In [13]:
# Run the phase-1 pipeline only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()



Parsing file: User Application for Shop Floor Automation using Data Analytics_14.pdf started
Parsing file: Text Mining Algorithms For Assessment of Answersheets.pdf started
Parsing file: Disease prediction using machine learning_55.pdf started
Parsing file: lazynotes_54.pdf started
Parsing file: Text Mining Algorithms For Assessment of Answersheets.pdf completed
Parsing file: HPC clodburst using docker_42.pdf started
Parsing file: User Application for Shop Floor Automation using Data Analytics_14.pdf completed
Parsing file: SCRUTINIZING BEHAVIOUR OF VM COMMUNICATION.pdf started
Parsing file: lazynotes_54.pdf completed
Parsing file: Machine Learning Based Network Traffic Classification.pdf started
Parsing file: HPC clodburst using docker_42.pdf completed
Parsing file: Stock Prediction Using Technical Analysis.pdf started
Parsing file: SCRUTINIZING BEHAVIOUR OF VM COMMUNICATION.pdf completed
Parsing file: Plant Disease Detection Using Deep Learning Techniques.pdf started
Parsing file: 

Parsing file: self learning cognitive system using large scale neural network_21.pdf completed
Parsing file: OpenAir Interface.pdf started
Parsing file: Differentiating Between Attacker And Legitimate VM.pdf completed
Parsing file: APPLICATION TO DETERMINE THE SAFEST ROUTE USING CRIME JANALYSIS VIA DECISION TREE JALGORITHM_78.pdf started
Parsing file: INTELLIGENT TOLL AUTOMATION SYSTEM_50.pdf completed
Parsing file: Exploring alternative approaches to control forwarding path of packets in virtual networks_4.pdf started
Parsing file: OpenAir Interface.pdf completed
Parsing file: Predicting_MyersBriggs_Personality_type.pdf started
Parsing file: Robust Speaker Recognition System for  online authentication and real-time verification.pdf completed
Parsing file: ONLINE REVIEW ANALYSIS.pdf started
Parsing file: APPLICATION TO DETERMINE THE SAFEST ROUTE USING CRIME JANALYSIS VIA DECISION TREE JALGORITHM_78.pdf completed
Parsing file: Android Malware Detection Using Machine Learning_67.pdf star

Parsing file: analyze online clickstream data to understand user intent and personalized ads_25.pdf completed
Parsing file: MANAGING LAND OWNERSHIP USING_BLOCKCHAIN.pdf started
Parsing file: Virtual Lab for Electronics_29.pdf started
Parsing file: Multi-document Abstractive Summarization_1.pdf completed
Parsing file: PHYSICAL WEB WITH VENDING MACHINE_77.pdf started
Parsing file: Web_Vulnerability Scanner.pdf completed
Parsing file: CONVERSION OF MARATHI TEXT TO_BRAILLE USING OPTICAL CHARACTER_RECOGNITION.pdf started
Parsing file: Virtual Lab for Electronics_29.pdf completed
Parsing file: AIR POLLUTION DETECTION WITH_NAVIGATION SYSTEM.pdf started
Parsing file: PHYSICAL WEB WITH VENDING MACHINE_77.pdf completed
Parsing file: Development of Intelligent automated indoor_navigator and assistance system.pdf started
Parsing file: CONVERSION OF MARATHI TEXT TO_BRAILLE USING OPTICAL CHARACTER_RECOGNITION.pdf completed
Parsing file: Creating server on android phone using NanoHTTPD.pdf started
Pa

Parsing file: CustomerDecisionSupportSystem_39.pdf completed
Parsing file: Crop_disease_detection.pdf started
Parsing file: INTELLIGENT CYBER SECURITY SYSTEM.pdf completed
Parsing file: DIRECT TRANSFER FROM DATA DEVICES_56.pdf started
Parsing file: PERSONALITY PREDICTION SYSTEM_USING TWITTER DATA.pdf completed
Parsing file: NOBODY PARKS HERE.pdf started
Parsing file: Generic_Traffic_Shaper_19.pdf completed
Parsing file: Accelerating Foreign Language Learning_62.pdf started
Parsing file: NOBODY PARKS HERE.pdf completed
Parsing file: DETECTING CERVICAL ABNORMALITIES_USING MACHINE LEARNING.pdf started
Parsing file: DIRECT TRANSFER FROM DATA DEVICES_56.pdf completed
Parsing file: ANALYSIS OF WORKLOAD AND ITS_CHARACHTERISITICS PROVIDED BY THE_HYPERVISOR AND RECOMMEND_OPTIMAL FIT IN CLOUD.pdf started
Parsing file: Accelerating Foreign Language Learning_62.pdf completed
Parsing file: SENTIMENT ANALYSIS ON TWITTER JDATA_74.pdf started
Parsing file: Crop_disease_detection.pdf completed
Parsing 

# End of phase 1