# Dataset Creation (Phase 2)

#### Input:
   dataset.json created in phase 1
#### Output:
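   dataset.csv (one `filename,cleaned keyword,weight` row per keyword), plus one `abstract/<name>.txt` file per document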
#### Algorithm:
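   a) A simple loop over the entries of dataset.json is sufficient: each entry's abstract is written to its own text file, and each of its keywords is cleaned and appended as a row to dataset.csv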

In [1]:
import json
from time import time
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import re

In [2]:
def stem_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Despite the name, this applies ``WordNetLemmatizer`` rather than a
    stemmer. ``lemmatize`` is called without a part-of-speech argument, so
    NLTK's default applies. The resulting tokens are re-joined with single
    spaces, which also collapses any run of whitespace in the input.
    """
    tokens = text.split()
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(token) for token in tokens])

In [3]:
def make_lower_case(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

In [4]:
def remove_punctuation(text):
    """Return ``text`` with everything except word characters removed.

    The text is split into maximal runs of ``\\w`` characters (letters,
    digits and underscore) and those runs are re-joined with single spaces,
    so punctuation and surrounding whitespace both collapse to one space.

    This is the same tokenisation ``RegexpTokenizer(r'\\w+')`` performs in
    its default (non-gap) mode, done directly with ``re.findall`` so no
    tokenizer object has to be built on every call.
    """
    return " ".join(re.findall(r"\w+", text))

In [5]:
def remove_removeletters(text):
    """Return ``text`` with every ASCII digit (0-9) deleted.

    Note that, despite the name, letters are left untouched; only digits
    are removed and nothing is inserted in their place.
    """
    digit_pattern = re.compile("[0-9]")
    return digit_pattern.sub("", text)

In [6]:
def read_json():
    """Load and return the parsed contents of ``dataset.json``.

    The file is looked up relative to the current working directory.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        FileNotFoundError: if ``dataset.json`` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # The ``with`` block closes the file on exit, so no explicit close()
    # (and no placeholder initialisation) is needed.
    with open("dataset.json", "r") as fp:
        return json.load(fp)

In [7]:
def filter_text(text):
    """Clean ``text`` by running the helper functions in a fixed order.

    Order of operations: lower-case, lemmatize, strip punctuation, and
    finally delete digits. The result of each step feeds the next.
    """
    lowered = make_lower_case(text)
    lemmatized = stem_words(lowered)
    without_punctuation = remove_punctuation(lemmatized)
    return remove_removeletters(without_punctuation)

In [8]:
# Directory prefix for the per-document abstract text files. It is
# concatenated directly with the file name in create_dataset, so the
# trailing slash is required.
abstract = "abstract/"

In [9]:
def create_dataset(processed_dict):
    """Write the abstract files and append keyword rows to ``dataset.csv``.

    For every not-yet-seen file name that has a ``"keywords"`` entry:

    * the entry's ``"abstract"`` text is written (UTF-8) to
      ``<abstract><name>.txt``, where ``<name>`` is the comma-free file name
      cut at its first ``.``;
    * one ``filename,cleaned_keyword,weight`` line per keyword is appended
      to ``dataset.csv``, the keyword being cleaned with ``filter_text``;
    * a progress line is printed.

    Args:
        processed_dict: an iterable of mappings from file name to a record
            with ``"abstract"`` and (optionally) ``"keywords"`` entries.
            In ``"keywords"`` the mapping key is written as the weight and
            the mapping value as the keyword text.
            NOTE(review): despite the parameter name this is iterated as a
            sequence of dicts - confirm against the phase that produces
            ``dataset.json``.

    Raises:
        KeyError: if a record with ``"keywords"`` has no ``"abstract"``.
    """
    # Make sure the target directory exists so the first abstract write does
    # not fail with FileNotFoundError on a fresh working directory.
    os.makedirs(abstract, exist_ok=True)

    # File names already handled; later duplicates are skipped entirely.
    files = set()

    for file in processed_dict:
        for key, entry in file.items():
            if "keywords" not in entry or key in files:
                continue
            files.add(key)

            # Commas are dropped from the name, presumably so it cannot
            # break the comma-separated rows written below.
            filter_filename = key.replace(",", "")

            # NOTE(review): split(".")[0] cuts at the FIRST dot, so a name
            # such as "v1.2 report.pdf" becomes "v1"; kept as-is because
            # other phases may derive the path the same way.
            abstract_path = abstract + filter_filename.split(".")[0] + ".txt"
            with open(abstract_path, "w", encoding="utf-8") as fp:
                fp.write(entry["abstract"])

            rows = [
                filter_filename + "," + filter_text(keyword) + "," + str(weight) + "\n"
                for weight, keyword in entry["keywords"].items()
            ]
            # Append all rows of this document with a single open() instead
            # of reopening the CSV for every keyword. Skipped when there are
            # no rows so an empty keyword set does not create the file.
            # NOTE(review): the CSV uses the platform default encoding,
            # unlike the UTF-8 abstract files - confirm what the consumer
            # of dataset.csv expects.
            if rows:
                with open("dataset.csv", "a") as fp:
                    fp.writelines(rows)

            print("File {} is written".format(key))

In [10]:
def main():
    """Run phase 2 end to end and print the elapsed time.

    Prints a start banner, loads the input with ``read_json``, passes it to
    ``create_dataset``, then prints the number of seconds that elapsed
    (measured with ``time()``) followed by an end banner.
    """
    print("\n==========Welcome to phase 2=============\n")
    started_at = time()
    print("Time: 0")
    create_dataset(read_json())
    elapsed = time() - started_at
    print("\nTime: {}".format(elapsed))
    # Optional removal of the input file, currently disabled:
    # os.remove("dataset.json")
    print("\n===========Phase 2 ended=============\n")

In [11]:
# Run the phase-2 pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()



Time: 0
File User Application for Shop Floor Automation using Data Analytics_14.pdf is written
File SCRUTINIZING BEHAVIOUR OF VM COMMUNICATION.pdf is written
File Plant Disease Detection Using Deep Learning Techniques.pdf is written
File CONTEXTUAL RECOMMENDATION AND_SUMMARIZATION OF ENTERPRISE_COMMUNICATION.pdf is written
File Speech and Speaker Recognition_for Home Security System.pdf is written
File Chatbot for automated customer support.pdf is written
File Disease Diagnosis Using Nadi Parikshan_44.pdf is written
File PLATFORM FOR BUILDING ENTERPRISE_SOLUTIONS (WORKFLOW MANAGER).pdf is written
File FASHION RECOMMENDATION AND_DESIGN USING MACHINE LEARNING.pdf is written
File Object Based Visual Sentiment Analysis.pdf is written
File META DATA AND CONTENT BASED YOUTUBE_VIDEO TAG GENERATION.pdf is written
File PATTERN DETECTION AND_RECOGNITION SYSTEM FOR VEHICLES.pdf is written
File ANALYSIS OF PRIVACY POLICIES VIA_MACHINE LEARNING.pdf is written
File Human detection and gender class

File AUTOMATED BRAKING TEST FOR_VEHICLE LICENSES.pdf is written
File NETWORK MONITORING TOOL_45.pdf is written
File AUTO PAINTER_ TEXT TO IMAGE_SYNTHESIS.pdf is written
File INSIDER THREAT DETECTION AND_ADAPTIVE TWO FACTOR_AUTHENTICATION SYSTEM FOR CLOUD_COMPUTING BY ANALYZING SYSTEM_LOGS.pdf is written
File Interactive User Behaviour Analysis and_Recommendation System Using Web Log_Mining_15.pdf is written
File QA GENERATION APPLICATION USING NLP.pdf is written
File PIXTALK_ PICTURE EXCHANGE COMMUNICATION_FOR AUTISM SPECTRUM DISORDER IN CHILDREN_USING MACHINE LEARNING.pdf is written
File Improving performance of Virtual Machines by Virtio Bridge Bypass for PCI devices_69.pdf is written
File Analyzing Product Image-Metadata Based On Click Through Rate For Advertisement Logistics_61.pdf is written
File NextGen_Router_17.pdf is written
File TEXT TO IMAGE SYNTHESIS IN FASHION.pdf is written
File Analysis of Indian Stock Market Analysts.pdf is written
File CONVERSION OF MARATHI TEXT TO_BRA

# End of phase 2