In [1]:

# Directory holding the HTML files (and their corresponding folders) that get converted into plain text.
HTML_INPUT_PATH = "html/"
# Top-level directory for all output files; the CSV file gathers all information in one single file
# so that statistics are easier to implement.
CSV_OUTPUT_PATH = "out/"
# Directory for the exported plain text, one file per paper.
PLAIN_OUTPUT_PATH = "out/plain/"
# Directory for the exported JSON files.
JSON_OUTPUT_PATH = "out/json/"
# Path to the CCS_Classes.json file.
CCS_INPUT_PATH = "CCS_Classes.json"


class CleanedPaper:
    """Plain record holding everything extracted from one paper.

    Attributes:
        filename: path/name of the plain-text file the data was read from.
        doi: extracted DOI string.
        authors: author names joined into one string.
        year: year of publication.
        title: paper title.
        keywords: extracted keywords; callers may pass a list of strings or a
            single placeholder string, the value is stored unchanged.
        ccs: the CCS concepts line as found in the text.
        specified_class: the matched CCS classes joined into one string.
    """

    def __init__(self, filename: str, doi: str, authors: str, year: int, title: str,
                 keywords: "list[str] | str", ccs: str, specified_class: str):
        self.filename = filename
        self.doi = doi
        self.authors = authors
        self.year = year
        self.title = title
        self.keywords = keywords  # stored as given, no conversion between list and string
        self.ccs = ccs
        self.specified_class = specified_class

    def __repr__(self) -> str:
        # Readable representation so an instance can be inspected directly in a notebook cell.
        return ("CleanedPaper(filename=" + repr(self.filename) + ", doi=" + repr(self.doi)
                + ", year=" + repr(self.year) + ", title=" + repr(self.title) + ")")


import os
import json

# Warn early when the HTML input directory does not exist or contains nothing.
try:
    html_entries = os.listdir(HTML_INPUT_PATH)
except FileNotFoundError:
    print("No Directory Found for HTML Input! Please change path in HTML_INPUT_PATH: " + HTML_INPUT_PATH)
else:
    if not html_entries:
        print("No HTML input files found!")
# Read the CCS class lists from the JSON file named by CCS_INPUT_PATH.
# A missing file is only reported; in that case none of the CCS_* names below get defined.
try:
    # The context manager closes the file even when json.load fails; the encoding is given
    # explicitly, like every other open() call in this notebook.
    with open(CCS_INPUT_PATH, 'r', encoding="utf-8") as f:
        json_ccs = json.load(f) # load input from file as json structure
    CCS_TOP_CLASSES = json_ccs['top_classes']
    CCS_SUB_GENERAL_AND_REFERENCE = json_ccs['sub_general_and_reference']
    CCS_SUB_HARDWARE = json_ccs['sub_hardware']
    CCS_SUB_COMPUTER_SYSTEMS_ORGANIZATION = json_ccs['sub_computer_systems_organization']
    CCS_SUB_NETWORKS = json_ccs['sub_networks']
    CCS_SUB_SOFTWARE_AND_ITS_ENGINEERING = json_ccs['sub_software_and_its_engineering']
    CCS_SUB_THEORY_OF_COMPUTING = json_ccs['sub_theory_of_computing']
    CCS_SUB_MATHEMATICS_OF_COMPUTING = json_ccs['sub_mathematics_of_computing']
    CCS_SUB_INFORMATION_SYSTEMS = json_ccs['sub_information_systems']
    CCS_SUB_SECURITY_AND_PRIVACY = json_ccs['sub_security_and_privacy']
    CCS_SUB_HUMAN_CENTERED_COMPUTING = json_ccs['sub_human_centered_computing']
    CCS_SUB_COMPUTING_METHODOLOGIES = json_ccs['sub_computing_methodologies']
    CCS_SUB_APPLIED_COMPUTING = json_ccs['sub_applied_computing']
    CCS_SUB_SOCIAL_AND_PROFESSIONAL_TOPICS = json_ccs['sub_social_and_professional_topics']

    # sum up how many classes are found overall (top level plus every sub level)
    sum_classes = sum(len(ccs_group) for ccs_group in (
        CCS_TOP_CLASSES,
        CCS_SUB_GENERAL_AND_REFERENCE,
        CCS_SUB_HARDWARE,
        CCS_SUB_COMPUTER_SYSTEMS_ORGANIZATION,
        CCS_SUB_NETWORKS,
        CCS_SUB_SOFTWARE_AND_ITS_ENGINEERING,
        CCS_SUB_THEORY_OF_COMPUTING,
        CCS_SUB_MATHEMATICS_OF_COMPUTING,
        CCS_SUB_INFORMATION_SYSTEMS,
        CCS_SUB_SECURITY_AND_PRIVACY,
        CCS_SUB_HUMAN_CENTERED_COMPUTING,
        CCS_SUB_COMPUTING_METHODOLOGIES,
        CCS_SUB_APPLIED_COMPUTING,
        CCS_SUB_SOCIAL_AND_PROFESSIONAL_TOPICS,
    ))
    print("classes found: ",sum_classes)

except FileNotFoundError:
    print("No file Found for CCS Input!")
# Make sure every output directory exists before later cells write into it.
# Each entry: (word used in the message, name of the constant, configured path).
for output_kind, constant_name, output_dir in (
        ("csv", "CSV_OUTPUT_PATH", CSV_OUTPUT_PATH),
        ("plaintext", "PLAIN_OUTPUT_PATH", PLAIN_OUTPUT_PATH),
        ("json", "JSON_OUTPUT_PATH", JSON_OUTPUT_PATH)):
    if not os.path.isdir(output_dir):
        # makedirs also creates missing parent directories, so a nested path works
        # even when its parent has not been created yet
        os.makedirs(output_dir, exist_ok=True)
        print("No Directory Found for " + output_kind + " output! Created this directory in "
              + constant_name + ": " + output_dir)

13
classes found:  2092


In [3]:
import pandas as pd
# Flatten the top-level list and all sub-level lists into one list, keeping their order,
# then compare the overall number of entries with the number of distinct entries.
ccs_groups = (
    CCS_TOP_CLASSES,
    CCS_SUB_GENERAL_AND_REFERENCE,
    CCS_SUB_HARDWARE,
    CCS_SUB_COMPUTER_SYSTEMS_ORGANIZATION,
    CCS_SUB_NETWORKS,
    CCS_SUB_SOFTWARE_AND_ITS_ENGINEERING,
    CCS_SUB_THEORY_OF_COMPUTING,
    CCS_SUB_MATHEMATICS_OF_COMPUTING,
    CCS_SUB_INFORMATION_SYSTEMS,
    CCS_SUB_SECURITY_AND_PRIVACY,
    CCS_SUB_HUMAN_CENTERED_COMPUTING,
    CCS_SUB_COMPUTING_METHODOLOGIES,
    CCS_SUB_APPLIED_COMPUTING,
    CCS_SUB_SOCIAL_AND_PROFESSIONAL_TOPICS,
)
all_CCS = [ccs_name for ccs_group in ccs_groups for ccs_name in ccs_group]
df = pd.DataFrame(all_CCS, columns=["Col"])
print(len(df["Col"])) # number of categories overall
print(len(df["Col"].unique())) # number of unique categories


2092
1910


In [3]:
# count number of selected ccs categories: the top level plus the three sub levels
# that clean_paper resolves in detail
selected_ccs_groups = (
    CCS_TOP_CLASSES,
    CCS_SUB_GENERAL_AND_REFERENCE,
    CCS_SUB_HUMAN_CENTERED_COMPUTING,
    CCS_SUB_APPLIED_COMPUTING,
)
print(sum(len(ccs_group) for ccs_group in selected_ccs_groups))

303


Extract plain text from all HTML files

In [None]:
import html2text
import os

# Extract the content of the HTML files into plain-text files for easier handling and content extraction.

# List the directory once: the set of .html files is both the work list and the progress total.
html_files = [name for name in os.listdir(HTML_INPUT_PATH) if name.endswith('.html')]

num = 0 # counter for readable outputs; equals the number of converted files after the loop
for num, file in enumerate(html_files, start=1):
    print("progressing: writing plaintext files from html documents: file " + file + " " + str(num)
          + " of " + str(len(html_files)) + " (" + str(round(num / len(html_files) * 100, 2)) + "%)",
          end='\r')

    # read the html file with utf-8 encoding; the context manager closes the handle again
    with open(HTML_INPUT_PATH + file, encoding="utf-8") as html_file:
        html_content = html_file.read()

    # A fresh converter per document, as before.
    # NOTE(review): whether one HTML2Text instance can safely be reused across documents was not verified.
    html_converter = html2text.HTML2Text() # initialize html2text converter with bundle of options listed below
    html_converter.ignore_links = True
    html_converter.ignore_images = True
    html_converter.images_to_alt = True
    html_converter.ignore_tables = True
    html_converter.ignore_mailto_links = True
    html_converter.skip_internal_links = True
    html_converter.use_automatic_links = False
    html_converter.body_width = 0 # no body_width so no inserted linebreaks
    html_converter.white_space_trim = True

    text = html_converter.handle(html_content) # convert html content to plain text

    # write converted plaintext into .txt file; the name is everything before the first '.' of the html name
    with open(PLAIN_OUTPUT_PATH + "plain_text__" + file.split(".")[0] + ".txt", "w", encoding="utf-8") as output_txt:
        output_txt.write(text)


Extract the following content from the plain-text files:
* DOI
* Authors
* Year
* Title
* Keywords
* CCS
* Specified_CCS

In [5]:
import re


def clean_paper(path):
    """Read one plain-text paper and extract its metadata.

    Args:
        path: path of the utf-8 encoded plain-text file.

    Returns:
        A CleanedPaper with
        - filename: ``path`` as given,
        - doi: rest of the line after the first "DOI:" (case-insensitive), or "undefined",
        - authors: author names joined by ';',
        - year: int taken from the line following the first "DOI", or -1 if none can be extracted,
        - title: first paragraph of the text,
        - keywords: list of keywords from the "Keywords:" line, or the string "undefined",
        - ccs: rest of the line after "CCS Concepts:" (case-insensitive), or "undefined",
        - specified_class: matched CCS classes joined by ';' (empty string if none matched).
    """
    with open(path, 'r', encoding="utf-8") as f:
        text = f.read()

    # extract title: everything up to the first empty line
    title = text.split("\n\n")[0]
    # drop the first two characters (leading '# ') and the last character
    # NOTE(review): splitting on "\n\n" leaves no trailing newline in this part, so the slice
    # removes the last character of whatever is there - confirm titles are not truncated.
    title = title[2:len(title) - 1].replace("\n", " ")

    # extract doi from plaintext
    try:
        doi = re.split("DOI:", text, flags=re.IGNORECASE)[1]  # split after "DOI:" until next '\n'
        doi = doi.split("\n")[0].strip()
    except IndexError:
        doi = "undefined"

    # extract authors from plaintext: every paragraph between the title and the first "DOI"
    authors = list()
    tmp_authors = re.split("DOI", text, flags=re.IGNORECASE)[0] # split before "DOI"
    tmp_authors = tmp_authors.split("\n\n")[1:] # split after first complete empty line (\n\n)
    for author in tmp_authors:
        author = author.split(",")[0].strip() # keep only the part before the first comma
        if len(author) > 2:
            authors.append(author)
    authors = ";".join(authors)

    # extract year of publication: last space-separated token after the last comma of the line
    # following the first "DOI". A text without "DOI" (or without a following line) used to raise
    # an uncaught IndexError here; it now falls back to -1 like an unparsable year does.
    try:
        year = text.split("DOI")[1].split("\n")[1].strip().split(",")[-1].split(" ")[-1].strip()
        year = int(year)
    except (IndexError, ValueError):
        year = -1

    # extract given full CCS classification tree from plaintext
    try:
        ccs = re.split("CCS CONCEPTS:", text, flags=re.IGNORECASE)[1]  # split after "CCS concepts:" until next '\n'
        ccs = ccs.split("\n")[0]
    except IndexError:
        ccs = "undefined"

    # Top-level classes that are resolved into their more precise sub-classes:
    # (text that must occur in the top-level class name, list of its sub-classes)
    detailed_classes = (
        ('general and reference', CCS_SUB_GENERAL_AND_REFERENCE),
        ('human-centered computing', CCS_SUB_HUMAN_CENTERED_COMPUTING),
        ('applied computing', CCS_SUB_APPLIED_COMPUTING),
    )

    # collect all matching classes; matching is done on the lower-cased ccs line by substring test
    specified_class = []
    for ccs_split in ccs.lower().split(";"): # split given ccs classification from plaintext
        for given in CCS_TOP_CLASSES:
            if given not in ccs_split:
                continue
            # first the precise sub-classes (if this top-level class has any configured) ...
            for marker, sub_classes in detailed_classes:
                if marker in given:
                    for detailed_class in sub_classes:
                        if detailed_class in ccs_split and detailed_class not in specified_class:
                            specified_class.append(detailed_class)
            # ... then the general top-level class itself
            if given not in specified_class:
                specified_class.append(given)

    specified_class = ";".join(specified_class) # join list of given classes as string separated by semicolon

    # extract given keywords from plaintext
    try:
        keywords = re.split("KEYWORDS:", text, flags=re.IGNORECASE)[1]  # split after "Keywords:" until next '\n'
        keywords = keywords.split("\n")[0]
        keywords = re.sub(r'\s*[;,]\s*', ',', keywords) # accept ';' and ',' as separators
        keywords = keywords.split(",")
    except IndexError: # if no keywords are found, define them as "undefined"
        keywords = "undefined"

    return CleanedPaper(filename=path, doi=doi, authors=authors, year=year, title=title, keywords=keywords, ccs=ccs,
                        specified_class=specified_class)

# print(clean_paper(PLAIN_OUTPUT_PATH+"plain_text__3300237.txt").authors)

Create a CleanedPaper instance for every plain-text file and write one JSON file per paper containing the extracted information.
Also export all papers into a single CSV file for easier analysis.

In [17]:

import json
import pandas as pd

# Run clean_paper on every plain-text file, write one JSON file per usable paper and collect
# all papers in one CSV file; unusable files are listed in errors.csv.

columns = ['filename', 'doi', 'authors', 'title','year', 'keywords', 'ccs', 'specified_class'] # columns to be written into the main data.csv file
records = [] # one tuple per usable paper; turned into a DataFrame once after the loop
cnt = 1
errors = []

# Read the example json-file once; it is parsed anew for every paper so each paper starts
# from an untouched copy of the template.
with open('format_output.json', 'r', encoding="utf-8") as example:
    template_text = example.read()

# List the directory once instead of twice per iteration.
plain_files = [name for name in os.listdir(PLAIN_OUTPUT_PATH) if name.endswith('.txt')]
total = len(plain_files)

for file in plain_files:
    print("Writing json_files -> Progress: " + str(round((cnt / total) * 100, 1)) + "%  " + str(cnt)
          + " of " + str(total) + " is processing...  [" + file + "]", end='\r')

    result = clean_paper(os.path.join(PLAIN_OUTPUT_PATH, file))
    if result.ccs == "undefined":
        # a paper without ccs is most likely not in the correct format and therefore not usable;
        # it is logged as error / corrupted file
        errors.append(file)
    else:
        json_data = json.loads(template_text)

        # set all fields with information from the CleanedPaper named result
        json_data['title'] = result.title
        json_data['topics'] = result.specified_class.split(";")
        json_data['authors'] = result.authors.split(";")
        json_data['year'] = result.year
        if "doi.org/" in result.doi: # extract and build doi-links if necessary
            json_data['doi'] = result.doi.split("doi.org/")[1]
            json_data['link'] = result.doi
        else:
            json_data['doi'] = result.doi
            json_data['link'] = "https://doi.org/" + result.doi

        # write json-file for each CleanedPaper instance; the context manager closes the handle
        with open(JSON_OUTPUT_PATH + file.split(".")[0].replace("plain_text__", '') + "_quant_data.json", "w",
                  encoding="utf-8") as out:
            json.dump(json_data, out, indent=4)

        records.append((file.split(".")[0], result.doi, result.authors, result.title, result.year,
                        result.keywords, result.ccs, result.specified_class))
    cnt += 1

data = pd.DataFrame(records, columns=columns) # built in one go instead of appending row by row
data.to_csv(CSV_OUTPUT_PATH + "cleaned_paper.csv", index=False) # save all CleanedPaper instances as one csv-file
errors = pd.DataFrame(errors) # names of all corrupted papers
errors.to_csv(CSV_OUTPUT_PATH + "errors.csv", index=False)
print("\nCSV file exported to [" + CSV_OUTPUT_PATH + "cleaned_paper.csv]" + " found " + str(
    len(errors)) + " errors. See list of all errors in [" + CSV_OUTPUT_PATH + "errors.csv]") # output

Writing json_files -> Progress: 100.0%  3724 of 3724 is processing...  [plain_text__3582074.txt]
CSV file exported to [out/cleaned_paper.csv] found 143 errors. See list of all errors in out/errors.csv]


**ONLY UNCOMMENT IF YOU WANT TO REMOVE THE PLAINTEXT DIR!!!!**

In [66]:


#import shutil  # needed for rmtree; shutil is not imported by the cells above
#shutil.rmtree(PLAIN_OUTPUT_PATH)
#print("plaintext directory has been removed to free memory!")