<h1> Creating the DataFrame </h1>


This notebook reads in the raw txt files from the [Knowledge Project ](https://tu-plogan.github.io/source/r_releases.html) and converts it to a pandas data frame.



In [1]:
import glob
import regex
import pandas as pd

<h5>  Dataframe Columns </h5>

MMSID<br>
editionTitle<br>
editor <br>
editor_date <br>
genre<br>
language<br>
termsOfAddress<br>
physicalDescription<br>
place<br>
publisher<br>
referencedBy<br>
shelfLocator<br>
editionSubTitle<br>
volumeTitle<br>
year<br>
volumeId<br>
permanentURL<br>
publisherPersons<br>
volumeNum<br>
letters<br>
part0<br>
editionNum<br>
numberOfVolumes<br>
term<br>
definition<br>
header<br>
startsAt<br>
endsAt<br>
numberOfTerms<br>
numberOfWords<br>
position<br>
termType<br>
filePath

In [2]:
#the path to the data
data_path = "./eb07_TXT_v2/"

#---------------------hardcoded columns to match dataframe structure---------------#
editor = "Stewart, Dugald"
editionTitle = "Edition 7, 1771"
editor_date = "1753-1828"
genre = "encyclopedia"
editionNum = 7
language = "eng"
termsOfAddress = "Sir"
physicalDescription = "21 v. in 22 ; 4to."
editionSubTitle = "0.0"
place = "Edinburgh"
publisher = None
referencedBy = "0.0"
shelfLocator = "EB.15Z"
publisherPersons = []
header = None
letters = None
volumeTitle = "Encyclopaedia Britannica"
year = 1842
part = None
numberOfPages = None
numberOfVolumes = 20
numberOfTerms = None

# This volumeIds list matches each volume with its volume number from 2 to 21
volumeIds = [192984259, 193057500, 193108322, 193696083, 193322690, 193819043,
            193322688, 193696084, 193469090, 193638940, 192693199, 193108323,
            193322689, 193819044, 194474782, 193469091, 193469092, 193057501, 193913444, 193819045]
permanentURL = ["https://digital.nls.uk/"+str(volumeIds[i]) for i in range(len(volumeIds))]
MMSID = "9910796273804340"
#----------------------------------------------------------------------------#

In [3]:
def get_term_description_without_term_name(text, name_ends_at):
    description = text[name_ends_at:]
    letter_match = regex.search("\p{L}", description)
    if letter_match:
        description = description[letter_match.start():]
    else:
        description = ""
    return description

In [4]:
def parse_term_name_without_parenthesis(term_name):
    last_or_index = term_name.rfind(" or")
    if last_or_index == -1:
        last_or_index = term_name.rfind(" Or")
    name = term_name.upper()
    alter_names = []
    if last_or_index > -1:
            text_before_or = term_name[:last_or_index]
            name_after_or = term_name[last_or_index + 4:]
            if text_before_or[-1] == ",":
                # all names after comma in text_before_or are alternative names
                # include the one before " or"
                indexes_of_comma = [index for index, char in enumerate(text_before_or) if char == ',']
                name = text_before_or[:indexes_of_comma[0]].upper()
                if len(indexes_of_comma) > 1:
                    for i in range(len(indexes_of_comma) - 1):
                        alter_name_after_comma = text_before_or[indexes_of_comma[i]+2:indexes_of_comma[i+1]]
                        alter_names.append(alter_name_after_comma.upper())
                alter_names.append(name_after_or.upper())
            else:
                # name after "or" is only alternative name for last word in text_before_or
                name = text_before_or.upper()
                index_of_last_white_space = text_before_or.rfind(" ")
                if index_of_last_white_space > -1:
                    text_before_last_word_in_text_before_or = text_before_or[:index_of_last_white_space + 1]
                    alter_names.append((text_before_last_word_in_text_before_or + name_after_or).upper())
                else:
                    alter_names.append(name_after_or.upper())
    else:
        # name = term_name
        pass
    return name, alter_names

In [5]:
def parse_term_name(term_name):
    """
    This function extract note about this term name, and term name, and alternative names.
    :param term_name: string with names of a term. e.g. ABA (or rather ABAU) HANIFA or HANFA
    :return: note: string (None if no note), name: name string (primary name), alter_names: list of alternative names.
    """
    parenthesis_pattern = regex.compile(r'\s\([\p{L}\p{N}\s\,\.]+\)')
    parenthesis_match = parenthesis_pattern.search(term_name)
    name = term_name.upper()
    note = None
    alter_names = []
    if parenthesis_match:
        text_without_parenthesis = term_name[:parenthesis_match.start()] + term_name[parenthesis_match.end():]
        name, alter_names = parse_term_name_without_parenthesis(text_without_parenthesis)

        parenthesis_match_text = parenthesis_match.group()

        # extract note, or alternative name in parentheses
        or_in_parenthesis_index = parenthesis_match_text.find("(or")
        if or_in_parenthesis_index > -1:
            first_upper_char_index = regex.search("\p{Lu}", parenthesis_match_text).start()
            alter_name_in_parenthesis = parenthesis_match_text[first_upper_char_index:-1]
            alter_names.append((alter_name_in_parenthesis + name[parenthesis_match.start():]).upper())
        else:
            note = parenthesis_match_text[2:-1]

    else:
        name, alter_names = parse_term_name_without_parenthesis(term_name)

    return note, name, alter_names

In [6]:
# test parse_term_name function
parse_term_name__tests = ["ANAXIMENES", "ABACK (a sea term)", "AGARIC Mineral", "ARMSTRONG, John, M.D.", "ARCHEUS (from the principal, chief, or first mover)", "ARUNDELIAN Marbles, Oxford Marbles, or Parian", "ARAUSIO, Civitas Arausiensis or Arausicorum", "ANTIOCHIAN Sect or Academy", "ABA (or rather ABAU) HANIFA or HANFA", "ASPHALTITES, or Lake of Bitumen", "ANTIPAS Herod, or Herod"]
for test_text in parse_term_name__tests:
    print(parse_term_name(test_text))

(None, 'ANAXIMENES', [])
('a sea term', 'ABACK', [])
(None, 'AGARIC MINERAL', [])
(None, 'ARMSTRONG, JOHN, M.D.', [])
('from the principal, chief, or first mover', 'ARCHEUS', [])
(None, 'ARUNDELIAN MARBLES', ['OXFORD MARBLES', 'PARIAN'])
(None, 'ARAUSIO, CIVITAS ARAUSIENSIS', ['ARAUSIO, CIVITAS ARAUSICORUM'])
(None, 'ANTIOCHIAN SECT', ['ANTIOCHIAN ACADEMY'])
(None, 'ABA HANIFA', ['ABA HANFA', 'ABAU HANIFA'])
(None, 'ASPHALTITES', ['LAKE OF BITUMEN'])
(None, 'ANTIPAS HEROD', ['HEROD'])


In [7]:
def extract_page_num(description):
    """
    This function extract page number from term description.
    :param description: description of a term, it might contain strings with "[edition_num:volume_num:page_num]"
    :return: the last page number found, -1 if none found.
    """
    metadata_pattern = regex.compile(r'\[\d+:\d+:\d+\]')
    all_match = metadata_pattern.findall(description)
    if len(all_match) > 0:
        last_match = all_match[-1]
        last_colon_index = last_match.rfind(":")
        return int(last_match[last_colon_index+1:-1])
    return -1

In [8]:
# test extract_page_num function
extract_page_num_test_texts = ["no match", "A [7:2:12]", "A B C \n [7:2:260] \n another [7:2:261]"]
for test_text in extract_page_num_test_texts:
    print(extract_page_num(test_text))

-1
12
261


In [29]:
def extract_reference_term_name(description):
    """
    This function extracts reference terms after the word "See". If there are same reference name occurs multiple times, this function will only return one.
    :param description: description of a term
    :return: a list of reference term names
    """
    # Find all matches and their end indices using re.finditer
    matches = [(match.end()) for match in regex.finditer(" (?i:see)[\,\. ]", description)]
    reference_term_names = set()
    for i in range(len(matches)):
        part_after_match = ""
        if i == len(matches) - 1:
            part_after_match = description[matches[i]:]
        else:
            part_after_match = description[matches[i]:matches[i+1]]
        part_after_match = part_after_match.strip()
        words_pattern_str = "\p{Lu}[\p{L}\-\']*"
        reference_term_pattern_str = words_pattern_str +  "(,?\s"+ words_pattern_str +")*(\s(and)\s" + words_pattern_str + ")?(?:[\.\,]|$)"
        reference_term_name_pattern = regex.compile(r"^" + reference_term_pattern_str)
        reference_term_name_match = reference_term_name_pattern.match(part_after_match)
        if reference_term_name_match:
            reference_term_name = reference_term_name_match.group()
            if not reference_term_name[-1].isupper():
                reference_term_name = reference_term_name[:-1]
            and_index = reference_term_name.find(" and ")
            if and_index > -1:
                reference_term_names.add(reference_term_name[:and_index])
                reference_term_names.add(reference_term_name[and_index+5:])
            else:
                # Ignore reference Part, Index, Table, Figure, Plate
                if reference_term_name.find("Part") > -1:
                    continue
                if reference_term_name.find("Index") > -1:
                    continue
                if reference_term_name.find("Table") > -1:
                    continue
                if reference_term_name.find("Plate") > -1:
                    continue
                if reference_term_name.find("Figure") > -1:
                    continue
                reference_term_names.add(reference_term_name)
    return list(reference_term_names)


In [30]:
# test function extract_reference_term_name
extract_reference_term_name_test_texts = [
    "by the same name. See plate I. fig. I. ",
    "Fructification. See Plate LIV. FIG. 1. Spatha, a species of calix openin",
    "which is nou termed diarthrosis. See ANATOMY,",
    "which is nou termed diarthrosis. See ANATOMY. Something else, SEE ANATOMY ",
    "in surgery. See ABDUCTION",
    " or lambs. See MILK, RUNNET.",
    " in gardening, signifies grafting by approach. See GRAPTING and GARDENINCIS, of",
    "otherwise they despise it. See CHEMISTRY, Of fermentation.",
    "being put, will be set on fire. See OPTICS. BURNING-mountains. See VOLCANO."
]

for test_text in extract_reference_term_name_test_texts:
    print(extract_reference_term_name(test_text))

[]
[]
['ANATOMY']
['ANATOMY']
['ABDUCTION']
['MILK, RUNNET']
['GRAPTING', 'GARDENINCIS']
['CHEMISTRY']
['OPTICS', 'VOLCANO']


In [10]:
def create_term_info(start_page_num, term_name, description, position):
    note, name, alter_names = parse_term_name(term_name)
    description = description.strip()
    reference_terms = extract_reference_term_name(description)
    end_page_num = extract_page_num(description)
    if end_page_num == -1:
        end_page_num = start_page_num
    # 0
    term_type = "Article"
    number_of_words = len(description.split())
    if end_page_num > start_page_num + 1:
        term_type = "Topic"
    term_info = {
        "note": note,
        "name": name,
        "alter_names": alter_names,
        "description": description,
        "starts_at": start_page_num,
        "ends_at": end_page_num,
        "term_type": term_type,
        "position": position,
        "number_of_words": number_of_words,
        "reference_terms": reference_terms
    }
    return term_info

In [11]:
# parse term information from text
def parse_term_from_text(text, page_num=-1, position = -1):
    """
    This function will extract terms information (term name, alternative name, description, page number where a term description ends) from given text.
    :param page_num: page number where the first term information starts.
    :param text: text content includes metadata in the first few lines, followed by descriptions of terms with same name. The metadata was inside two lines of "+==============+".
    :return: a list of object term with format of {name: str, alter_name: str, endsAtPageNum: int, description: str}
    """
    # get entry text
    entry_text = text.split("==+")[2].strip()
    lines = entry_text.split("\n")
    # get name of this entry
    # capital_pattern = regex.compile(r"^[\p{Lu}\s\’]+(?![a-z])")
    # case 1: ANAXIMENES, an eminent Greek philosopher ...
    # case 2: AAM, or Haλm, a liquid measure in common ...
    # case 3: AHOLIB ΛH and Aπolah are two feigned names made use
    # case 4: ABA (or rather ABAU) HANIFA or HANFA, surnamed Al-Nooma, was the son of .....
    # case 5: ABACK (a sea term), the situation of the sails when the surfaces .....
    # case 6: ABA, Abas, Abos, or Abus, in Ancient Geography, the name .....
    # case 7: A. The first letter of the alphabet in every kn
    # case 8: ABBOTS-BROMLEY, a town in Staffordshire. ....
    # case 9: AGARIC Mineral, a marly earth, resembling the vegetable of that name in
    # case 10: ACCISMUS denotes a feigned refusal of something which
    # case 11: ARC, Joan of, generally called the Maid of Orleans
    # case 12: ARMSTRONG, John, M.D. an eminent physician, poet, and miscellaneous writer
    # case 13: ACCOUNTANT-general, a new officer in the court of chancery,
    # case 14: See Bangog, a small island in the Eastern Seas, near ...
    # case 15: ADANSONIA, Ethiopian Sour-gourd, Monkeys-bread, or African Calabash-tree.
    # case 16: AiD-de-Camp, in military affairs, an officer employed
    # case 17: TELEGRAPH. \nTelegraph, so named from two Greek words,
    entry_name_pattern = regex.compile(r"^\p{Lu}+")
    entry_name = entry_name_pattern.match(lines[0]).group()
    capitalised_entry_name = entry_name[0] + entry_name[1:].lower()
    #print(f"entry name: {entry_name}")

    words_pattern_str = "\p{Lu}[\p{L}\-\’]*"
    #first_term_pattern_str = words_pattern_str +  "(\s\([\p{L}\p{N}\s\,\.]+\))?(,?\s"+ words_pattern_str +")*(\s(and)\s" + words_pattern_str + ")?(,?\s(or)\s" + words_pattern_str + ")?(\sof(\s"+  words_pattern_str +")?)?"

    first_term_pattern_str = words_pattern_str +  "(\s\([\p{L}\p{N}\s\,\.]+\))?(,?\s"+ words_pattern_str +")*(((\sand)|(,?\sor))(\s" + words_pattern_str + ")+)?(\sof(\s"+  words_pattern_str +")*)?"

    term_name_pattern = regex.compile(r"^" + first_term_pattern_str)
    term_name = term_name_pattern.match(lines[0])
    parsed_info = None
    if term_name:
        parsed_info = []
        #print(term_name)
        # Get description
        #description = lines[0]
        description = get_term_description_without_term_name(lines[0], term_name.end())
        #print(description)
        term_name = term_name.group()
        parse_term_name(term_name)

        # For each of rest lines, check if it belongs to previous term, or new term starting with the same entry name.
        start_page_num = page_num
        for line in lines[1:]:
            if len(line.strip()) == 0:
                continue
            # Make all text before first comma, period, or uppercase
            first_hyphen_index = line.find("-")
            first_comma_index = line.find(",")
            first_period_index = line.find(".")
            first_hyphen_comma_period_index = min(first_hyphen_index, first_comma_index, first_period_index)

            if first_hyphen_comma_period_index > -1:
                text_before_first_comma_period = line[:first_hyphen_comma_period_index].upper()
                if text_before_first_comma_period == entry_name:
                    line = text_before_first_comma_period + line[first_hyphen_comma_period_index:]
            entry_name_pattern = "(("+ entry_name + ")|" + "(" +  capitalised_entry_name + "))"
            first_term_pattern_str_remove_first_upper = first_term_pattern_str[6:]
            term_name_pattern = regex.compile(r"^" + entry_name_pattern +  first_term_pattern_str_remove_first_upper + "[\,\.]")
            tmp_term_name = term_name_pattern.match(line)
            if tmp_term_name:
                tmp_term_name_ends_at = tmp_term_name.end()
                tmp_term_name = tmp_term_name.group()[:-1]
                if description == "" and (tmp_term_name == entry_name):
                    # previous description was empty, create new description based on this line
                    pass
                else:
                    # create and add term_info to parsed_info
                    term_info = create_term_info(start_page_num, term_name, description, position)
                    parsed_info.append(term_info)
                    end_page_num = term_info["ends_at"]
                    position += 1
                    if end_page_num > start_page_num:
                        position = 1
                    start_page_num = end_page_num

                # create new term name
                term_name = tmp_term_name
                description = get_term_description_without_term_name(line, tmp_term_name_ends_at)

            else:
                description += "\n" + line

        term_info = create_term_info(start_page_num, term_name, description, position)
        parsed_info.append(term_info)
    return parsed_info

In [12]:
# test regex pattern
pattern = regex.compile(r"^\p{Lu}+(\s\([\p{L}\s]+\))?(\s\p{Lu}\p{L}*)*")
test_text = "AGARIC Mineral,"
print(pattern.match(test_text))

<regex.Match object; span=(0, 14), match='AGARIC Mineral'>


In [13]:
def assert_terms(input_terms, expected_terms):
    if len(input_terms) != len(expected_terms):
        return False
    for index in range(len(expected_terms)):
        expected_term = expected_terms[index]
        for key in expected_term.keys():
            if input_terms[index][key] != expected_term[key]:
                return False
    return True

In [14]:
# test function parse_term_from_text
test_text_paths = [
    "./eb07_TXT_v2/a3/kp-eb0703-010204-6887-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-000304-9848-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-036301-4528-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-000501-9874-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-000504-9874-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-000101-9822-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-001601-0017-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-024103-2942-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-008704-0940-v2.txt",
    "./eb07_TXT_v2/a3/kp-eb0703-037701-0462-v2.txt",
    "./eb07_TXT_v2/a3/kp-eb0703-057804-3075-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-009002-0979-v2.txt",
    "./eb07_TXT_v2/s20/kp-eb0720-007502-0089-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-014301-1668-v2.txt",
    "./eb07_TXT_v2/a2/kp-eb0702-036401-4541-v2.txt",
    "./eb07_TXT_v2/t21/kp-eb0721-013705-0155-v2.txt"
]

expected_results = [
    [{"note": None, 'name': 'ANAXIMENES', 'alter_names': [], 'number_of_words': 126, 'reference_terms': []},
    {'note': None, 'name': 'ANAXIMENES', 'alter_names': [], 'number_of_words': 90, 'reference_terms': []},],
    [{'note': None, 'name': 'AAM', 'alter_names': ['HAΛM'], 'number_of_words': 31, 'reference_terms': []}],
    [{'note': None, 'name': 'AHOLIB ΛH AND AΠOLAH', 'alter_names': [], 'number_of_words': 101, 'reference_terms': []}],
    [{'note': None, 'name': 'ABA', 'alter_names': ['ABAU'], 'number_of_words': 171, 'reference_terms': []},
    {'note': None, 'name': 'ABA', 'alter_names': ['ABAS', 'ABOS', 'ABUS'], 'number_of_words': 190, 'reference_terms': ['Abae']}],
    [{'note': "a sea term", 'name': 'ABACK', 'alter_names': [], 'number_of_words': 153, 'reference_terms': []}],
    [{'note': None, 'name': 'A', 'alter_names': [], 'number_of_words': 1446, 'reference_terms': []},
    {'note': None, 'name': 'A', 'alter_names': [], 'number_of_words': 135, 'reference_terms': []}],
    [{'note': None, 'name': 'ABBOTS-BROMLEY', 'alter_names': [], 'number_of_words': 50, 'reference_terms': []}],
    [{'note': None, 'name': 'AGARIC MINERAL', 'alter_names': [], 'number_of_words': 13, 'reference_terms': []}],
    [{'note': None, 'name': 'ACCISMUS', 'alter_names': [], 'number_of_words': 42, 'reference_terms': []}],
    [{'note': None, 'name': 'ARC, JOAN OF', 'alter_names': [], 'number_of_words': 1323, 'reference_terms': []}],
    [{'note': None, 'name': 'ARMSTRONG, JOHN, M', 'alter_names': [], 'number_of_words': 590, 'reference_terms': []}],
    [{'note': None, 'name': 'ACCOUNTANT', 'alter_names': ['ACCOMPTANT'], 'number_of_words': 47, 'reference_terms': []},
    {'note': None, 'name': 'ACCOUNTANT-GENERAL', 'alter_names': [], 'number_of_words': 35, 'reference_terms': []}],
    [{'note': None, 'name': 'SEE BANGOG', 'alter_names': [], 'number_of_words': 21, 'reference_terms': []}],
    [{'note': None, 'name': 'ADANSONIA', 'alter_names': ['ETHIOPIAN SOUR-GOURD', 'MONKEYS-BREAD', 'AFRICAN CALABASH-TREE'], 'number_of_words': 0, 'reference_terms': []}],
    [{'note': None, 'name': 'AID, AUXILIUM', 'alter_names': [], 'number_of_words': 14, 'reference_terms': []},
    {'note': None, 'name': 'AID-DE-CAMP', 'alter_names': [], 'number_of_words': 15, 'reference_terms': []}],
    [{'note': None, 'name': 'TELEGRAPH', 'alter_names': [], 'number_of_words': 5520, 'reference_terms': []}]
]

test_count = 0
pass_count = 0
fail_count = 0
for path in test_text_paths:
    test_text = open(path, "r").read()
    terms = parse_term_from_text(test_text)
    if not assert_terms(terms, expected_results[test_count]):
        fail_count += 1
        print(f"This test case failed, it gets -> {terms} \n ------- It should be {expected_results[test_count]}")
    else:
        pass_count += 1
    test_count += 1

print(f"Total test cases: {test_count}, passed: {pass_count}, failed: {fail_count}")
#    (r\"^\p{Lu}[\p{L}\-\’]*(\s\([\p{L}\p{N}\s\,\.]+\))?(,?\s\p{Lu}[\p{L}\-\’]*)*(\\s(and)\\sp{Lu}[\\p{L}\\-\\’]*)?(,?\\sor\\sp{Lu}[\\p{L}\\-\\’]*)?(\\sof(\\s\\p{Lu}[\\p{L}\\-\\’]*)?)?\")\n",


Total test cases: 16, passed: 16, failed: 0


In [15]:
# Get each text file
# parse text data in each file

import os
# List all directories in data folder

all_info_list = []

for folder_file in sorted(os.scandir(data_path), key=lambda e: e.name):
    position = 1
    if folder_file.is_dir():
        # folder_file is sub folder of the data folder: e.g. a2, a3 ....
        #print(folder_file.path)
        # List all text files in each directory
        for txt_file in sorted(os.scandir(folder_file.path), key=lambda e: e.name):
            if txt_file.name[-3:] == "txt":

                # parse volume and page number from filename
                txt_file_name_parts = txt_file.name.split("-")
                volume_num = int(txt_file_name_parts[1][-2:])
                page_num = int(txt_file_name_parts[2][0:-2])
                print(f"text file path: {txt_file.path}, volume number: {volume_num}, page number: {page_num}")
                # get text from the file
                text = open(txt_file.path, 'r').read()
                #print(len(text))
                # parse term information from the text
                parsed_terms = parse_term_from_text(text, page_num, position)

                for term_info in parsed_terms:
                    all_info_list.append([term_info["name"], term_info["note"], term_info["alter_names"], term_info["reference_terms"], term_info["description"], term_info["starts_at"], term_info["ends_at"],term_info["position"],term_info["term_type"], txt_file.path, term_info["number_of_words"], header, letters, part, MMSID, editionTitle, editor, editor_date, genre, language, termsOfAddress, numberOfPages, physicalDescription, place, publisher, referencedBy, shelfLocator, editionSubTitle, volumeTitle, year, volumeIds[volume_num - 2], permanentURL[volume_num - 2], publisherPersons, volume_num, editionNum, numberOfVolumes, numberOfTerms])
                # update the position
                last_term_info = parsed_terms[-1]
                position = last_term_info["position"] + 1
                if last_term_info["ends_at"] > last_term_info["starts_at"]:
                    position = 1


# create pandas dataframe for information list
df = pd.DataFrame(all_info_list, columns=["term", "note", "alter_names", "reference_terms", "definition",  "startsAt", "endsAt", "position", "termType", "filePath", "numberOfWords", "header","letters", "part", "MMSID", "editionTitle", "editor", "editor_date", "genre", "language", "termsOfAddress", "numberOfPages", "physicalDescription", "place", "publisher", "referencedBy", "shelfLocator", "editionSubTitle", "volumeTitle", "year", "volumeId", "permanentURL", "publisherPersons", "volumeNum", "editionNum", "numberOfVolumes",  "numberOfTerms"])

text file path: ./eb07_TXT_v2/a2/kp-eb0702-000101-9822-v2.txt, volume number: 2, page number: 1
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000201-9835-v2.txt, volume number: 2, page number: 2
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000202-9835-v2.txt, volume number: 2, page number: 2
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000203-9835-v2.txt, volume number: 2, page number: 2
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000301-9848-v2.txt, volume number: 2, page number: 3
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000302-9848-v2.txt, volume number: 2, page number: 3
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000303-9848-v2.txt, volume number: 2, page number: 3
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000304-9848-v2.txt, volume number: 2, page number: 3
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000305-9848-v2.txt, volume number: 2, page number: 3
text file path: ./eb07_TXT_v2/a2/kp-eb0702-000306-9848-v2.txt, volume number: 2, page number: 3
text file path: ./eb07_TXT_v2/a2/kp-eb07

In [16]:
print(df.head(5))

  term  note alter_names reference_terms  \
0    A  None          []              []   
1    A  None          []              []   
2   AA  None          []              []   
3   AA  None          []              []   
4   AA  None          []              []   

                                          definition  startsAt  endsAt  \
0  The first letter of the alphabet in every know...         1       2   
1  as an abbreviation, is likewise of frequent oc...         2       2   
2  a river of the province of Groningen, in the k...         2       2   
3  a river in the province of Overyssel. in the N...         2       2   
4  a river of the province of Antwerp, in the Net...         2       2   

   position termType                                       filePath  ...  \
0         1  Article  ./eb07_TXT_v2/a2/kp-eb0702-000101-9822-v2.txt  ...   
1         1  Article  ./eb07_TXT_v2/a2/kp-eb0702-000101-9822-v2.txt  ...   
2         2  Article  ./eb07_TXT_v2/a2/kp-eb0702-000201-9835-v

In [17]:
df.loc[0]

term                                                                   A
note                                                                None
alter_names                                                           []
reference_terms                                                       []
definition             The first letter of the alphabet in every know...
startsAt                                                               1
endsAt                                                                 2
position                                                               1
termType                                                         Article
filePath                   ./eb07_TXT_v2/a2/kp-eb0702-000101-9822-v2.txt
numberOfWords                                                       1446
header                                                              None
letters                                                             None
part                                               

In [18]:
# Statistic analysis of dataframe from this work, Note that, lots of terms are not been processed in v21 folder since they are in xml files.
print("Number of files: ", len(sorted(glob.glob(data_path + "*/*.txt"))))
print("Number of terms: ", df.shape[0])
print("Number of topics: ", df[df["termType"] == "Topic"].shape[0])
print("Number of articles: ", df[df["termType"] == "Article"].shape[0])

Number of files:  21024
Number of terms:  23987
Number of topics:  1296
Number of articles:  22691


In [19]:
# List all reference terms extracted
reference_terms_df = df[df['reference_terms'].apply(lambda x: len(x) > 0)]
print(reference_terms_df["reference_terms"].values.tolist())

[['Abae'], ['Saladin'], ['Parallax', 'Astronomy', 'Astronomy'], ['Optics'], ['Nile'], ['Conic Sections'], ['Abstraction'], ['Institute'], ['Atwooďs Machine, Dynamics, Mechanics'], ['Reading'], ['Accommodation'], ['Tabourot, Stephen'], ['Achaeans'], ['Statius'], ['Chemistry'], ['Wind-Instruments, Organ, Trumpet', 'Harmonics, Music, Temperament'], ['Agrigentum'], ['Drama', 'Bibliography'], ['Hell'], ['Navy', 'Navy'], ['Navy'], ['Adonia'], ['Acclamation'], ['Divorce'], ['Pius II'], ['AEronautics'], ['AEtius'], ['Biographie Universelle'], ['Dairy', 'Dairy', 'Sect', 'Chap'], ['Girgenti'], ['Cologne'], ['Atmosphere, Meteorology'], ['Pneumatics'], ['Pneumatics'], ['Albigenses'], ['Prussia'], ['Araeometer'], ['Alcor'], ['Brewing'], ['Borgia', 'Chigī'], ['Mem'], ['Reuss, Repertorium Commentationum'], ['Good Hope'], ['Anatomy'], ['Chemistry'], ['Geometry'], ['Relievo'], ['Vespucci'], ['Haller, Bibl'], ['Mortmain'], ['Logic'], ['Antiquities'], ['Hermaphrodite'], ['Automaton'], ['Anthropophagi'], 

In [20]:
df.to_json(r'./final_eb_7_dataframe_clean_Damon', orient="index")

In [21]:
# load dataframe from Alex's work
old_df= pd.read_json('./final_eb_7_dataframe_clean_Alex', orient="index")

In [22]:
# Statistic analysis of dataframe from Alex's work
print("Number of terms: ", old_df.shape[0])
print("Number of topics: ", old_df[old_df["typeTerm"] == "Topic"].shape[0])
print("Number of articles: ", old_df[old_df["typeTerm"] == "Article"].shape[0])

Number of terms:  23121
Number of topics:  2021
Number of articles:  21100


In [23]:
old_df[old_df["term"]=="TELEGRAPH"]

Unnamed: 0,term,definition,MMSID,edTitle,editor,editor_date,genre,language,termsOfAddress,numberOfPages,...,numberOfVolumes,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,positionPage,typeTerm,altoXML
21608,TELEGRAPH,.,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,184,...,20,Not specified,Not specified,137,137,Not specified,1,1,Article,eb07-v1.2-TXT/t21/kp-eb0721-013705-0155-v1.txt
21609,TELEGRAPH,so named from two Greek words τέλος end or dis...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,184,...,20,Not specified,Not specified,137,137,Not specified,5473,2,Topic,eb07-v1.2-TXT/t21/kp-eb0721-013705-0155-v1.txt


In [24]:
df[df["term"]=="TELEGRAPH"]

Unnamed: 0,term,note,alter_names,reference_terms,definition,startsAt,endsAt,position,termType,filePath,...,editionSubTitle,volumeTitle,year,volumeId,permanentURL,publisherPersons,volumeNum,editionNum,numberOfVolumes,numberOfTerms
22440,TELEGRAPH,,[],[],"so named from two Greek words, τέλος, end or d...",137,142,5,Topic,./eb07_TXT_v2/t21/kp-eb0721-013705-0155-v2.txt,...,0.0,Encyclopaedia Britannica,1842,193819045,https://digital.nls.uk/193819045,[],21,7,20,


In [25]:
# Find different between the topics from two dataframes
new_topics = df[df["termType"] == "Topic"]
old_topics = old_df[old_df["typeTerm"] == "Topic"]
intersection = new_topics[new_topics["term"].isin(old_topics["term"])]

print(f"{len(intersection)} topics have been recognised by both dataframe.")

# Remove the intersection from old_topics
topic_old_dif_result = old_topics[~old_topics["term"].isin(intersection["term"])]
# List the file path of topic in old_topics only
# List the file path of term in old_df only
print(f"{len(topic_old_dif_result)} topics have only been recognised by Alex's work.")
print(topic_old_dif_result["altoXML"].values.tolist())

866 topics have been recognised by both dataframe.
1117 topics have only been recognised by Alex's work.
['eb07-v1.2-TXT/a2/kp-eb0702-001306-9978-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-002007-0069-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-002305-0108-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-002702-0160-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-002901-0186-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-004002-0329-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-004308-0368-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-004506-0394-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-008706-0940-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-009314-1018-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-010711-1200-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-012510-1434-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-012510-1434-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-012909-1486-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-013407-1551-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-013407-1551-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-013407-1551-v1.txt', 'eb07-v1.2-TXT/a2/kp-eb0702-013901-1616-v1.txt', 'eb07-v1.2-T

In [26]:
# Find different between the terms from two dataframes
intersection = df[df["term"].isin(old_df["term"])]

print(f"{len(intersection)} terms have been recognised by both dataframe.")

# Remove the intersection from old_df
term_old_dif_result = old_df[~old_df["term"].isin(intersection["term"])]

# List the file path of term in old_df only
print(f"{len(term_old_dif_result)} terms have only been recognised by Alex's work.")

# Remove the intersection from df
term_dif_result = df[~df["term"].isin(intersection["term"])]

# List the file path of term in df only, Note that, lots of terms are not been processed in v21 folder since they are in xml files.
print(f"{len(term_dif_result)} terms have only been recognised by this work.")

18787 terms have been recognised by both dataframe.
3833 terms have only been recognised by Alex's work.
5200 terms have only been recognised by this work.


In [27]:
# Examine the terms only Alex's work
print(term_old_dif_result[100:150])

            term                                         definition  \
926       AGATHA  St, a market town on the Austrian principality...   
927       AGATHA  Santa a town on the banks of a small river in ...   
953       AGGERS  -HERRED, a district of Christiansand and a dio...   
971     AGLIONBY  John, an English divine, chaplain in ordinary ...   
975        AGNES  St, a large mining village in the county of Co...   
976       AGNESI  Maria Gλetana, an Italian lady, who may be jus...   
978       AGNOET  Ae (from αy≡w, to be ignorant of), in church h...   
981        AGNUS   Dei, in the church of Rome, a cake of wax sta...   
987     AGONALIS   Circus, now La Piazza Noυona, a long, large, ...   
993    AGONYCLIT  Ae, or Agonyclites, in Church History, a sect ...   
994         AGOR  AeUS, in Heathen Antiquity, an appellation giv...   
1002    AGRARIAN   Laws, among the Romans, those relating to the...   
1005    AGRICOLA  Cnaeus Julius, born at Frejus, in Provence, wa...   
1006  

In [28]:
# Examine the terms only this work
print(term_dif_result["term"].values.tolist())
print(term_dif_result[term_dif_result["term"] == "BARRINGTON, JOHN SHUTE, LORD VISCOUNT BAR-"])

"""
Wrong terms (assume no ocr errors):
------ from a2/kp-eb0702-068407-8701-v2.txt ----------
ANATOMY OF THE ORGANS OF THE ANIMAL, VOLUNTARY, OR RELATIVE FUNCTIONS
ANATOMY OF THE ORGANS PERTAINING TO THE ENTRO-PHIC OR NUTRITIVE FUNCTIONS
ANATOMY OF THE ORGANS PERTAINING TO THE REPRODUCTIVE FUNCTIONS
ANATOMY OF THE ORGANS OF RELATION
"""

['AARON AND JULIUS', 'AARSENS, FRANCIS, LORD OF SOMELDYCK', 'ABACAENUM', 'ABACUS MAJOR', 'ABAE', 'ABAUZIT, FIRMIN', 'ABBADIE, JAMES', 'ABBAS, MAHOMET’S UNCLE', 'ABBAS, SCHAH', 'ABBAS, SCHAH', 'ABBE', 'ABBOT, GEORGE', 'ABBOT, ROBERT', 'ABBOTS-BROMLEY', 'ABBOTS-LANGLEY', 'ABBS, ST', 'ABDIAS OF BABYLON', 'ABEL-MEHOLΑH', 'ABEL-MIZRAIM', 'ABEL-SATTIM', 'ABELARD, PETER', 'ABEN-EZRA, ABRAHAM', 'ABERCROMBY, THE HONOURABLE ALEXANDER', 'ABERCROMBY, SIR RALPH', 'ABERDEEN, OLD', 'ABERDEEN, NEW', 'ABERNETHY, JOHN', 'ABGILLUS, JOHN', 'ABIAD, BAHR', 'ABJURATION OF HERESY', 'ABLATIVE ABSOLUTE', 'ABRAHAM, DEN CHAILA', 'ABRAHAM USQUE', 'ABRUD-BANYA', 'ABRUZZO ULTERIORE FIRST', 'ABRUZZO ULTERIORE SECOND', 'ABRUZZO CITERIORE', 'ABSOLUTE GOVERNMENT', 'ABSOLUTE EQUATIONS', 'ABSOLUTE NUMBER', 'ABSORBENT MEDICINES', 'ABSORBENTS', 'ABSORUS, APSORUS, ABSYRTIS, ABSYRTIDES', 'ABSTERGENT MEDICINES', 'ABSTRACT IDEAS', 'ABSTRACT NUMBERS', 'ABSTRACT TERMS', 'ABU AND CANDU', 'ABU-ARISCH', 'ABULFARAGIUS, GREGORY', 'ABU

'\nWrong terms (assume no ocr errors):\n------ from a2/kp-eb0702-068407-8701-v2.txt ----------\nANATOMY OF THE ORGANS OF THE ANIMAL, VOLUNTARY, OR RELATIVE FUNCTIONS\nANATOMY OF THE ORGANS PERTAINING TO THE ENTRO-PHIC OR NUTRITIVE FUNCTIONS\nANATOMY OF THE ORGANS PERTAINING TO THE REPRODUCTIVE FUNCTIONS\nANATOMY OF THE ORGANS OF RELATION\n'