This notebook explores the essential sparql query used in defoe

In [8]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper

# Setup SparqlWrapper
hto_sparql_endpoint = "http://127.0.0.1:3030/hto"
hto_sparql_wrapper = SPARQLWrapper(hto_sparql_endpoint)

In [9]:
from SPARQLWrapper import JSON


# List all collections
def get_all_collections():
    collections=[]
    query="""
      PREFIX hto: <https://w3id.org/hto#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      SELECT ?collection ?name WHERE {
        ?collection a hto:WorkCollection;
            hto:name ?name.
        FILTER (regex(?name, "Collection$", "i"))
    }
    """
    hto_sparql_wrapper.setQuery(query)
    hto_sparql_wrapper.setReturnFormat(JSON)
    results = hto_sparql_wrapper.query().convert()
    for r in results["results"]["bindings"]:
        collections.append({"uri": r["collection"]["value"], "name": r["name"]["value"]})
    return collections

In [10]:
all_collections = get_all_collections()
all_collections

[{'uri': 'https://w3id.org/hto/WorkCollection/EncyclopaediaBritannica',
  'name': 'Encyclopaedia Britannica Collection'},
 {'uri': 'https://w3id.org/hto/WorkCollection/ChapbooksOfScotland',
  'name': 'Chapbooks printed in Scotland Collection'}]

In [11]:
def get_nls_page_from_nls(collection_name):
    """
    This function will query hto kg, and get all eb terms with their descriptions and metadata extracted from NLS dataset.
    :return: eb_terms_nls, a list of terms with their descriptions and metadata extracted from NLS dataset.
    """
    source_provider_name = "National Library of Scotland"
    pages=[]
    query="""
      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?page hto:number ?page_number;
            hto:hasOriginalDescription ?description.
        ?description hto:text ?text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent.
        ?agent foaf:name "%s".
        ?volume a hto:Volume;
            hto:hadMember ?page.
        ?series a hto:Series;
            hto:hadMember ?volume.
        ?eb_collection a hto:WorkCollection;
            hto:name "%s";
            hto:hadMember ?series.
    }
    """ % (source_provider_name, collection_name)
    print(query)
    hto_sparql_wrapper.setQuery(query)
    hto_sparql_wrapper.setReturnFormat(JSON)
    results = hto_sparql_wrapper.query().convert()
    for r in results["results"]["bindings"]:
        pages.append(
            {"page_uri": r["page"]["value"],
             "page_number": r["page_number"]["value"],
             "text": r["text"]["value"],
             "volume": r["volume"]["value"]
             }
        )
    return pages

In [25]:
def get_eb_terms_from_nls():
    """
    This function will query hto kg, and get all eb terms with their descriptions and metadata extracted from NLS dataset. Note that, since the hto kg contains the supplement (in 1842) which only has dissertations, this function will also get these dissertations.
    :return: eb_terms_nls, a list of terms with their descriptions and metadata extracted from NLS dataset.
    """
    collection_name = "Encyclopaedia Britannica Collection"
    nls_agent_name = "National Library of Scotland"
    terms=[]
    query="""
      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent.
        ?agent foaf:name "%s".
        ?volume a hto:Volume;
            hto:hadMember ?page.
        ?edition a hto:Edition;
            hto:hadMember ?volume.
        ?eb_collection a hto:WorkCollection;
            hto:name "%s";
            hto:hadMember ?edition.
    }
    """ % (nls_agent_name, collection_name)
    print(query)
    hto_sparql_wrapper.setQuery(query)
    hto_sparql_wrapper.setReturnFormat(JSON)
    results = hto_sparql_wrapper.query().convert()
    for r in results["results"]["bindings"]:
        terms.append(
            {"term_uri": r["term"]["value"],
             "term_name": r["term_name"]["value"],
             "description": r["description_text"]["value"],
             "edition": r["edition"]["value"]
             }
        )
    return terms

In [16]:
eb_edition_mmsids = ["992277653804341" # 1st 1771
                     , "9929192893804340" # 1st 1773
                     , "997902523804341" # 2nd 1778
                     , "997902543804341" # 3rd 1797
                     , "9910796343804340" # 3rd 1801
                     , "9910796233804340" # 4th 1810
                     , "9922270543804340" # 5th 1815
                     , "9910796253804340" # 6th 1823
                     , "9910796273804340" # 7th 1842
                    , "9929777383804340" # 8th 1853
]
print(eb_edition_mmsids)
filter_str = "FILTER (?mmsid = \"" + "\" || ?mmsid = \"".join(eb_edition_mmsids) + "\")"
print(filter_str)

['992277653804341', '9929192893804340', '997902523804341', '997902543804341', '9910796343804340', '9910796233804340', '9922270543804340', '9910796253804340', '9910796273804340', '9929777383804340']
FILTER (?mmsid = "992277653804341" || ?mmsid = "9929192893804340" || ?mmsid = "997902523804341" || ?mmsid = "997902543804341" || ?mmsid = "9910796343804340" || ?mmsid = "9910796233804340" || ?mmsid = "9922270543804340" || ?mmsid = "9910796253804340" || ?mmsid = "9910796273804340" || ?mmsid = "9929777383804340")


In [28]:
def create_alter_names_dicts():
    hto_sparql_wrapper.setQuery("""
    PREFIX hto: <https://w3id.org/hto#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT * WHERE {
        ?term_uri a ?term_type;
            rdfs:label ?alter_name.
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
    }
    """
                    )
    ret = hto_sparql_wrapper.queryAndConvert()
    term_alter_names = {}
    for r in ret["results"]["bindings"]:
        term_uri = r["term_uri"]["value"]
        alter_name = r["alter_name"]["value"]
        if term_uri in term_alter_names:
            term_alter_names[term_uri].append(alter_name)
        else:
            term_alter_names[term_uri] = [alter_name]

    return term_alter_names

In [39]:
def get_eb_terms_simple(edition_mmsids, source_provider):
    """
    This function queries eb terms from a list of editions with one specific source provider.
    :param edition_mmsids: A list of edition mmsids.
    :param source_provider: People or Organisations who transcribed the EB collection into digital text.
    :return: a list of terms, each term has term_uri, year, edition_uri, edition_title, edition number, vol_uri, vol_num, num_pages, letters, part, file_path, start_page, header, term_primary_name, term_alter_names, note, description.
    """
    collection_name = "Encyclopaedia Britannica Collection"
    mmsid_filter_string = "FILTER (?mmsid = \"" + "\" || ?mmsid = \"".join(edition_mmsids) + "\")"
    terms=[]
    query="""
      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        OPTIONAL {?term hto:note ?note}
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent;
            prov:value ?source_filepath.
        ?agent foaf:name "%s".
        ?page hto:number ?page_number.
        OPTIONAL {?page hto:header ?header}
        ?volume a hto:Volume;
            hto:number ?volume_number;
            hto:numberOfPages ?number_pages;
            hto:hadMember ?page.
        OPTIONAL {?volume hto:letters ?letters;}
        OPTIONAL {?volume hto:part ?part}
        ?edition a hto:Edition;
            hto:title ?edition_title;
            hto:yearPublished ?year;
            hto:mmsid ?mmsid;
            hto:hadMember ?volume.
        OPTIONAL {?edition hto:number ?edition_number}
        %s
        ?eb_collection a hto:WorkCollection;
            hto:name "%s";
            hto:hadMember ?edition.
    }
    """ % (source_provider, mmsid_filter_string, collection_name)
    print(query)
    hto_sparql_wrapper.setQuery(query)
    hto_sparql_wrapper.setReturnFormat(JSON)
    results = hto_sparql_wrapper.query().convert()
    for r in results["results"]["bindings"]:
        terms.append(
            {"term_uri": r["term"]["value"],
             "term_name": r["term_name"]["value"],
             "description": r["description_text"]["value"]
             }
        )
    return terms

In [53]:
def get_eb_terms_from_editions_with_source_provider(edition_mmsids, source_provider):
    """
    This function queries eb terms from a list of editions with one specific source provider.
    :param edition_mmsids: A list of edition mmsids.
    :param source_provider: People or Organisations who transcribed the EB collection into digital text.
    :return: a list of terms, each term has term_uri, year, edition_uri, edition_title, edition number, vol_uri, vol_num, num_pages, letters, part, file_path, start_page, header, term_primary_name, term_alter_names, note, description.
    """
    terms_alter_names = create_alter_names_dicts()
    collection_name = "Encyclopaedia Britannica Collection"
    mmsid_filter_string = "FILTER (?mmsid = \"" + "\" || ?mmsid = \"".join(edition_mmsids) + "\")"
    terms=[]
    query="""
      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        OPTIONAL {?term hto:note ?note}
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent;
            prov:value ?source_filepath.
        ?agent foaf:name "%s".
        ?page hto:number ?page_number.
        OPTIONAL {?page hto:header ?header}
        ?volume a hto:Volume;
            hto:number ?volume_number;
            hto:numberOfPages ?number_pages;
            hto:hadMember ?page.
        OPTIONAL {?volume hto:letters ?letters;}
        OPTIONAL {?volume hto:part ?part}
        ?edition a hto:Edition;
            hto:title ?edition_title;
            hto:yearPublished ?year;
            hto:mmsid ?mmsid;
            hto:hadMember ?volume.
        OPTIONAL {?edition hto:number ?edition_number}
        %s
        ?eb_collection a hto:WorkCollection;
            hto:name "%s";
            hto:hadMember ?edition.
    }
    """ % (source_provider, mmsid_filter_string, collection_name)
    print(query)
    hto_sparql_wrapper.setQuery(query)
    hto_sparql_wrapper.setReturnFormat(JSON)
    results = hto_sparql_wrapper.query().convert()
    for r in results["results"]["bindings"]:
        note = None
        if "note" in r:
            note = r["note"]["value"]
        edition_number = None
        if "edition_number" in r:
            edition_number = r["edition_number"]["value"]
        part = None
        if "part" in r:
            part = r["part"]["value"]
        letters = None
        alter_names = []
        term_uri = r["term"]["value"]
        if term_uri in terms_alter_names:
            alter_names = terms_alter_names[term_uri]
        if "letters" in r:
            letters = r["letters"]["value"]
        terms.append(
            {"term_uri": term_uri,
             "term_name": r["term_name"]["value"],
             "alter_names": alter_names,
             "note": note,
             "file_path": r["source_filepath"]["value"],
             "description": r["description_text"]["value"],
             "edition_uri": r["edition"]["value"],
             "edition_title": r["edition_title"]["value"],
             "edition_number": edition_number,
             "volume_uri": r["volume"]["value"],
             "volume_number": r["volume_number"]["value"],
             "num_pages": r["number_pages"]["value"],
             "letters": letters,
             "part": part,
             "start_page_uri": r["page"]["value"],
             "start_page_number": r["page_number"]["value"]
             }
        )
    return terms

In [13]:
def get_hto_object(collection_name, source):
    """
    This function will query hto kg, and get a list of hto object based on the collection name and source. For EB collection, it will return terms from given source dataset, For other NLS collection, it will return pages from given source datase.
    :param collection_name: Name of digital collections in hto kg.
    :param source: Three types of sources: NLS, Neuspell and HQ.
    :return: a list of hto object based on the collection name and source.
    """
    results = []
    if collection_name == "Encyclopaedia Britannica Collection":
        if source == "NLS":
            edition_mmsids = ["992277653804341" # 1st 1771
                     , "9929192893804340" # 1st 1773
                     , "997902523804341" # 2nd 1778
                     , "997902543804341" # 3rd 1797
                     , "9910796343804340" # 3rd 1801
                     , "9910796233804340" # 4th 1810
                     , "9922270543804340" # 5th 1815
                     , "9910796253804340" # 6th 1823
                     , "9910796273804340" # 7th 1842
                    , "9929777383804340" # 8th 1853
                                ]
            results = get_eb_terms_from_editions_with_source_provider(edition_mmsids, "National Library of Scotland")
        elif source == "HQ":
            ash_edition_mmsid = ["992277653804341"]
            nckp_edition_mmsid = ["9910796273804340"]
            nls_edition_mmsids = ["9929192893804340" # 1st 1773
                     , "997902523804341" # 2nd 1778
                     , "997902543804341" # 3rd 1797
                     , "9910796343804340" # 3rd 1801
                     , "9910796233804340" # 4th 1810
                     , "9922270543804340" # 5th 1815
                     , "9910796253804340" # 6th 1823
                    , "9929777383804340" # 8th 1853
                                ]
            ash_terms = get_eb_terms_from_editions_with_source_provider(ash_edition_mmsid, "Ash Charlton")
            nls_terms = get_eb_terms_from_editions_with_source_provider(nls_edition_mmsids, "National Library of Scotland")
            nckp_terms = get_eb_terms_from_editions_with_source_provider(nckp_edition_mmsid, "Nineteenth-Century Knowledge Project")
            results.extend(ash_terms)
            results.extend(nls_terms)
            results.extend(nckp_terms)
    else:
        if source == "NLS":
            results = get_nls_page_from_nls(collection_name)
    return results

In [26]:
all_terms = get_eb_terms_from_nls()
len(all_terms)


      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent.
        ?agent foaf:name "National Library of Scotland".
        ?volume a hto:Volume;
            hto:hadMember ?page.
        ?edition a hto:Edition;
            hto:hadMember ?volume.
        ?eb_collection a hto:WorkCollection;
            hto:name "Encyclopaedia Britannica Collection";
            hto:hadMember ?edition.
    }
    


151065

In [54]:
terms_from_editions_nls = get_eb_terms_from_editions_with_source_provider(eb_edition_mmsids, "National Library of Scotland")
# It should be 150569
len(terms_from_editions_nls)


      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        OPTIONAL {?term hto:note ?note}
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent;
            prov:value ?source_filepath.
        ?agent foaf:name "National Library of Scotland".
        ?page hto:number ?page_number.
        OPTIONAL {?page hto:header ?header}
        ?volume a hto:Volume;
            hto:number ?volume_number;
            hto:numberOfPages ?number_pages;
            hto:hadMember ?page.
        OPTIONAL {?volume hto:letters ?letters;}
        OPTIONAL {?volume hto:p

150569

In [40]:
simple_terms_from_editions_nls = get_eb_terms_simple(eb_edition_mmsids, "National Library of Scotland")
# 150569
len(simple_terms_from_editions_nls)


      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        OPTIONAL {?term hto:note ?note}
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent;
            prov:value ?source_filepath.
        ?agent foaf:name "National Library of Scotland".
        ?page hto:number ?page_number.
        OPTIONAL {?page hto:header ?header}
        ?volume a hto:Volume;
            hto:number ?volume_number;
            hto:numberOfPages ?number_pages;
            hto:hadMember ?page.
        OPTIONAL {?volume hto:letters ?letters;}
        OPTIONAL {?volume hto:p

150569

In [46]:
ash_edition_mmsid = ["992277653804341"]
ash_terms = get_eb_terms_from_editions_with_source_provider(ash_edition_mmsid, "Ash Charlton")



      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent.
        ?agent foaf:name "Ash Charlton".
        ?volume a hto:Volume;
            hto:hadMember ?page.
        ?edition a hto:Edition;
            hto:mmsid ?mmsid;
            hto:hadMember ?volume.
        FILTER (?mmsid = "992277653804341")
        ?eb_collection a hto:WorkCollection;
            hto:name "Encyclopaedia Britannica Collection";
            hto:hadMember ?edition.
    }
    


1

In [48]:
len(ash_terms)

15955

In [55]:
terms_from_editions_nls_df = pd.DataFrame(terms_from_editions_nls)
terms_from_editions_nls_df

Unnamed: 0,term_uri,term_name,alter_names,note,file_path,description,edition_uri,edition_title,edition_number,volume_uri,volume_number,num_pages,letters,part,start_page_uri,start_page_number
0,https://w3id.org/hto/ArticleTermRecord/9910796...,ALMAGEST,[],,191253819/alto/192151933.34.xml,"in Matters of Literature, larly used for a col...",https://w3id.org/hto/Edition/9910796233804340,"Encyclopaedia Britannica; or, A dictionary of ...",4,https://w3id.org/hto/Volume/9910796233804340_1...,1,424,Agriculture-AME,2,https://w3id.org/hto/Page/9910796233804340_191...,331
1,https://w3id.org/hto/ArticleTermRecord/9910796...,ALKENNA,[],,191253819/alto/192151738.34.xml,"See Lawsonia,",https://w3id.org/hto/Edition/9910796233804340,"Encyclopaedia Britannica; or, A dictionary of ...",4,https://w3id.org/hto/Volume/9910796233804340_1...,1,424,Agriculture-AME,2,https://w3id.org/hto/Page/9910796233804340_191...,316
2,https://w3id.org/hto/ArticleTermRecord/9910796...,ALOGOTROPHIA,[],,191253819/alto/192152063.34.xml,"among Phy/icians, a term signifying the unequa...",https://w3id.org/hto/Edition/9910796233804340,"Encyclopaedia Britannica; or, A dictionary of ...",4,https://w3id.org/hto/Volume/9910796233804340_1...,1,424,Agriculture-AME,2,https://w3id.org/hto/Page/9910796233804340_191...,341
3,https://w3id.org/hto/ArticleTermRecord/9910796...,AMBROSIN,[],,191253819/alto/192153038.34.xml,"in middle-age writers, denotes a coin struck b...",https://w3id.org/hto/Edition/9910796233804340,"Encyclopaedia Britannica; or, A dictionary of ...",4,https://w3id.org/hto/Volume/9910796233804340_1...,1,424,Agriculture-AME,2,https://w3id.org/hto/Page/9910796233804340_191...,416
4,https://w3id.org/hto/ArticleTermRecord/9910796...,ALEUROM,[],,191253819/alto/192150347.34.xml,"ANC T, the same with what was ctherwife called...",https://w3id.org/hto/Edition/9910796233804340,"Encyclopaedia Britannica; or, A dictionary of ...",4,https://w3id.org/hto/Volume/9910796233804340_1...,1,424,Agriculture-AME,2,https://w3id.org/hto/Page/9910796233804340_191...,209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150564,https://w3id.org/hto/TopicTermRecord/992977738...,MOROCCO,[],,193592632/alto/193600554.34.xml,"MOROCCO. \r c |)CCO. plexion, and live by hunt...",https://w3id.org/hto/Edition/9929777383804340,Encyclopaedia Britannica,8,https://w3id.org/hto/Volume/9929777383804340_1...,15,868,Milan-NAV,,https://w3id.org/hto/Page/9929777383804340_193...,609
150565,https://w3id.org/hto/TopicTermRecord/992977738...,TEES,[],,193819047/alto/193843911.34.xml,T H E A Theatres. There was something very app...,https://w3id.org/hto/Edition/9929777383804340,Encyclopaedia Britannica,8,https://w3id.org/hto/Volume/9929777383804340_1...,21,1062,T-ZWO,,https://w3id.org/hto/Page/9929777383804340_193...,180
150566,https://w3id.org/hto/TopicTermRecord/992977738...,BRIDGES,[],,193696087/alto/193792243.34.xml,IRON BRIDGES. Iron Bridges. The exclusive use ...,https://w3id.org/hto/Edition/9929777383804340,Encyclopaedia Britannica,8,https://w3id.org/hto/Volume/9929777383804340_1...,12,858,Hume-JOM,,https://w3id.org/hto/Page/9929777383804340_193...,587
150567,https://w3id.org/hto/TopicTermRecord/997902543...,STENOGRAPHY,[],,191253800/alto/191923684.34.xml,pofed the omission of vowels in_ the middle of...,https://w3id.org/hto/Edition/997902543804341,Encyclopaedia Britannica,3,https://w3id.org/hto/Volume/997902543804341_19...,17,918,SCO-STR,,https://w3id.org/hto/Page/997902543804341_1912...,868


In [57]:
alter_names_terms_df = terms_from_editions_nls_df[terms_from_editions_nls_df.apply(lambda row: len(row["alter_names"]) > 0, axis=1)]
alter_names_terms_df

Unnamed: 0,term_uri,term_name,alter_names,note,file_path,description,edition_uri,edition_title,edition_number,volume_uri,volume_number,num_pages,letters,part,start_page_uri,start_page_number
36358,https://w3id.org/hto/ArticleTermRecord/9910796...,CANDAHAR,[KANDAHAR],,193322690/alto/193347980.34.xml,"or Kandahar, an extensive Afghanistan, situate...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,6,930,CAL-Clock,,https://w3id.org/hto/Page/9910796273804340_193...,85
36372,https://w3id.org/hto/ArticleTermRecord/9910796...,HYPOGLOTTIS,[HYPOGLOSSIS],,192693199/alto/192939598.34.xml,"or Hypoglossis, under, and y Xuirra, tongue, i...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,12,902,Hydrodynamics-KYR,,https://w3id.org/hto/Page/9910796273804340_192...,148
36403,https://w3id.org/hto/ArticleTermRecord/9910796...,EDOM,[IDUMAEA],,193322688/alto/193328740.34.xml,"or Idumea, in Ancient Geography, a district of...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,8,974,DIA-England,,https://w3id.org/hto/Page/9910796273804340_193...,441
36407,https://w3id.org/hto/ArticleTermRecord/9910796...,HYPANTE,[HYPERPANTE],,192693199/alto/192939572.34.xml,"or Hyperpante, a name given by the Greeks to t...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,12,902,Hydrodynamics-KYR,,https://w3id.org/hto/Page/9910796273804340_192...,146
36422,https://w3id.org/hto/ArticleTermRecord/9910796...,HYPOBOLE,[SUBJECTION],,192693199/alto/192939598.34.xml,"or Subjection (from wro, and fict Kku, I cast)...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,12,902,Hydrodynamics-KYR,,https://w3id.org/hto/Page/9910796273804340_192...,148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148973,https://w3id.org/hto/TopicTermRecord/991079627...,DONGOLA,[DANKALA],,193322688/alto/193324593.34.xml,"Donegal, celebrated places of resort for pilgr...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,8,974,DIA-England,,https://w3id.org/hto/Page/9910796273804340_193...,122
149005,https://w3id.org/hto/TopicTermRecord/991079627...,MAHOMMED,"[MAHOMET, MOHAMMED]",,193322689/alto/194200162.34.xml,"Mahomheard to her cousin Warakah Ebn Nawfal, w...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,14,938,Magnetism-Mexico,,https://w3id.org/hto/Page/9910796273804340_193...,36
149076,https://w3id.org/hto/TopicTermRecord/991079627...,HALL,[SUABIAN HALL],,193638940/alto/193640605.34.xml,"Hall, tacked him had little effect, and the pe...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,11,872,Grotius-HYD,,https://w3id.org/hto/Page/9910796273804340_193...,126
149196,https://w3id.org/hto/TopicTermRecord/991079627...,SERPENTS,[OPHIDIAN REPTILES],,193913444/alto/193941865.34.xml,"Ophidian We now arrive, “ by lingering steps a...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,20,1056,Sculpture-SUR,,https://w3id.org/hto/Page/9910796273804340_193...,134


In [60]:
alter_names_terms_df[:5].to_dict(orient="records")

[{'term_uri': 'https://w3id.org/hto/ArticleTermRecord/9910796273804340_193322690_2559301054_0',
  'term_name': 'CANDAHAR',
  'alter_names': ['KANDAHAR'],
  'note': None,
  'file_path': '193322690/alto/193347980.34.xml',
  'description': 'or Kandahar, an extensive Afghanistan, situated between the 31st and of north latitude, and between the 64th and of east longitude. To the north it is bounded try of Balk, to the south by Beloochistan, east it has Sinde and Beloochistan ; and sandy desert of various breadth divides it vince of Seistan, in Persia. Part of this province of mountains, and part of arid and uncultivated crossed by ranges of hills running westward ropamisan Mountains. But though the general of the country be waste and barren, most parts ply water and forage to the pastoral hordes frequented; and it is not destitute of many ed and pleasant valleys, and some fertile plains, ed by mountains. The western part of this means so mountainous as the northern ; times it was a fertile 

In [52]:
non_letters_terms = terms_from_editions_nls_df[terms_from_editions_nls_df["letters"].isnull()]
non_letters_terms

Unnamed: 0,term_uri,term_name,note,file_path,description,edition_uri,edition_title,edition_number,volume_uri,volume_number,num_pages,letters,part,start_page_uri,start_page_number
34102,https://w3id.org/hto/ArticleTermRecord/9910796...,ALMAGEST,,192547789/alto/192890315.34.xml,"II. 529; III. 733, 734.",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,0,184,,,https://w3id.org/hto/Page/9910796273804340_192...,19
34103,https://w3id.org/hto/ArticleTermRecord/9910796...,AFFINITY,,192547789/alto/192890276.34.xml,"a relation contracted by marriage, II. 210. , ...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,0,184,,,https://w3id.org/hto/Page/9910796273804340_192...,16
34104,https://w3id.org/hto/ArticleTermRecord/9910796...,BARBARISM,,192547789/alto/192890458.34.xml,IV. 360. Barbaro on architecture. III. 425.,https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,0,184,,,https://w3id.org/hto/Page/9910796273804340_192...,30
34105,https://w3id.org/hto/ArticleTermRecord/9910796...,ABSTINENTS,,192547789/alto/192890250.34.xml,"a sect of heretics, II. 46.",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,0,184,,,https://w3id.org/hto/Page/9910796273804340_192...,14
34106,https://w3id.org/hto/ArticleTermRecord/9910796...,AMMA,,192547789/alto/192890328.34.xml,"II. 657. Amman, in Palestine, XVI. 746. (John ...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,0,184,,,https://w3id.org/hto/Page/9910796273804340_192...,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148328,https://w3id.org/hto/TopicTermRecord/992977738...,GENERALINDEX,,193322702/alto/193441918.34.xml,THE following Index contains a full and comple...,https://w3id.org/hto/Edition/9929777383804340,Encyclopaedia Britannica,8,https://w3id.org/hto/Volume/9929777383804340_1...,0,290,,,https://w3id.org/hto/Page/9929777383804340_193...,11
148884,https://w3id.org/hto/TopicTermRecord/991079627...,INDEX,,192547789/alto/192890510.34.xml,"INDEX BER—BET Berog, zoophyte, XXL 1018. Berce...",https://w3id.org/hto/Edition/9910796273804340,Encyclopaedia Britannica,7,https://w3id.org/hto/Volume/9910796273804340_1...,0,184,,,https://w3id.org/hto/Page/9910796273804340_192...,34
149765,https://w3id.org/hto/TopicTermRecord/992977738...,INDEX,,193322702/alto/193442191.34.xml,"INDEX. ARI—ARM his method of discussion, ib.—h...",https://w3id.org/hto/Edition/9929777383804340,Encyclopaedia Britannica,8,https://w3id.org/hto/Volume/9929777383804340_1...,0,290,,,https://w3id.org/hto/Page/9929777383804340_193...,32
149766,https://w3id.org/hto/TopicTermRecord/992977738...,PEEFACE,,193322702/alto/193445168.34.xml,"xv together, “ they exhibited,” according to t...",https://w3id.org/hto/Edition/9929777383804340,Encyclopaedia Britannica,8,https://w3id.org/hto/Volume/9929777383804340_1...,0,290,,,https://w3id.org/hto/Page/9929777383804340_193...,261


In [38]:
chapbooks_pages = get_nls_page_from_nls("National Library of Scotland", "Chapbooks printed in Scotland Collection")
len(chapbooks_pages)


      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?page hto:number ?page_number;
            hto:hasOriginalDescription ?description.
        ?description hto:text ?text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent.
        ?agent foaf:name "National Library of Scotland".
        ?volume a hto:Volume;
            hto:hadMember ?page.
        ?series a hto:Series;
            hto:hadMember ?volume.
        ?eb_collection a hto:WorkCollection;
            hto:name "Chapbooks printed in Scotland Collection";
            hto:hadMember ?series.
    }
    


47329

In [51]:
hq_eb_terms = get_hto_object("Encyclopaedia Britannica Collection", "HQ")
len(hq_eb_terms)


      PREFIX hto: <https://w3id.org/hto#>
      PREFIX prov: <http://www.w3.org/ns/prov#>
      PREFIX foaf: <http://xmlns.com/foaf/0.1/>
      SELECT * WHERE {
        ?term a ?term_type;
            hto:name ?term_name;
            hto:startsAtPage ?page;
            hto:hasOriginalDescription ?description.
        FILTER (?term_type = hto:ArticleTermRecord || ?term_type = hto:TopicTermRecord)
        ?description hto:text ?description_text;
            hto:wasExtractedFrom ?source_dataset.
        ?source_dataset prov:wasAttributedTo ?agent.
        ?agent foaf:name "Ash Charlton".
        ?volume a hto:Volume;
            hto:hadMember ?page.
        ?edition a hto:Edition;
            hto:mmsid ?mmsid;
            hto:hadMember ?volume.
        FILTER (?mmsid = "992277653804341")
        ?eb_collection a hto:WorkCollection;
            hto:name "Encyclopaedia Britannica Collection";
            hto:hadMember ?edition.
    }
    

      PREFIX hto: <https://w3id.org/hto#>
      PR

168112

In [66]:
test_result = {'1823': [20, 17546, 22018, 12054402], '1771': [3, 2722, 8923, 1949168], '1773': [3, 2740, 9194, 1931151]}

In [65]:
import yaml

yml = yaml.safe_dump(dict(test_result))
yml

TypeError: 'int' object is not iterable