In [1]:
# SPARQL endpoint handed to endpoint2df below; that function only runs its
# query when the URL contains the substring "ebo".
sparql_endpoint = "http://query.frances-ai.com/ebo_1st_hq_old"

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd


# Column order of the frame returned by endpoint2df. Passing it explicitly to
# the DataFrame constructor keeps the schema stable even when there are no rows.
_EBO_COLUMNS = [
    "uri", "year", "title", "edition", "vuri", "volume", "numPages",
    "letters", "part", "archive_filename", "page", "header", "term",
    "definition", "numWords",
]


def _binding_to_record(binding):
    """Flatten one SPARQL JSON result binding into a flat dict of raw values."""

    def value(name):
        return binding[name]["value"]

    return {
        "uri": value("uri"),
        "year": value("year"),
        "title": value("title"),
        "edition": value("enum"),
        "vuri": value("v"),
        "volume": value("vnum"),
        "numPages": value("numberOfPages"),
        "letters": value("letters"),
        # ?part is OPTIONAL in the query; unbound values are stored as the
        # literal string "None" (not the None object).
        "part": value("part") if "part" in binding else "None",
        "archive_filename": value("metsXML"),
        "page": value("page"),
        "header": value("header"),
        "term": value("term"),
        "definition": value("definition"),
        "numWords": value("numberOfWords"),
    }


def endpoint2df(endpoint):
    """Fetch every eb:Article and eb:Topic from a SPARQL endpoint as a DataFrame.

    Parameters
    ----------
    endpoint : str
        URL of the SPARQL endpoint. The query is only sent when the URL
        contains the substring "ebo".

    Returns
    -------
    pandas.DataFrame
        One row per result binding, with the columns listed in
        ``_EBO_COLUMNS``. Cell values are the raw ``"value"`` entries of the
        JSON bindings. When the endpoint is not an "ebo" endpoint, or the
        query yields no bindings, the frame is empty but still carries those
        columns, so column lookups on the result do not raise ``KeyError``.
    """
    sparql_data = []
    if "ebo" in endpoint:
        sparql = SPARQLWrapper(endpoint)
        # The doubled brace after WHERE opens the outer group and the first
        # UNION branch; this is a plain string, not a format template.
        query="""
          PREFIX eb: <https://w3id.org/eb#>
          PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
          SELECT ?uri ?year ?title ?enum ?vnum ?v ?letters ?part ?metsXML ?page ?header ?term ?definition ?numberOfWords ?numberOfPages
              WHERE {{
            ?uri a eb:Article .
            ?uri eb:name ?term .
              ?uri eb:definition ?definition .
              ?uri eb:numberOfWords ?numberOfWords .
              ?v eb:hasPart ?uri.
              ?v eb:number ?vnum.
              ?v eb:numberOfPages ?numberOfPages .
              ?v eb:metsXML ?metsXML.
              ?v eb:letters ?letters .
              ?e eb:hasPart ?v.
              ?e eb:publicationYear ?year.
              ?e eb:number ?enum.
              ?e eb:title ?title.
              ?uri eb:startsAtPage ?sp.
              ?sp eb:header ?header .
              ?sp eb:number ?page .
              OPTIONAL {?v eb:part ?part; }

              }

              UNION {
            ?uri a eb:Topic .
            ?uri eb:name ?term .
              ?uri eb:definition ?definition .
              ?uri eb:numberOfWords ?numberOfWords .
              ?v eb:hasPart ?uri.
              ?v eb:number ?vnum.
              ?v eb:numberOfPages ?numberOfPages .
              ?v eb:metsXML ?metsXML.
              ?v eb:letters ?letters .
              ?e eb:hasPart ?v.
              ?e eb:publicationYear ?year.
              ?e eb:number ?enum.
              ?e eb:title ?title.
              ?uri eb:startsAtPage ?sp.
              ?sp eb:header ?header .
              ?sp eb:number ?page .
              OPTIONAL {?v eb:part ?part; }

              }
        }
        """
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        sparql_data = [
            _binding_to_record(binding)
            for binding in results["results"]["bindings"]
        ]
    return pd.DataFrame(sparql_data, columns=_EBO_COLUMNS)

In [3]:
# Run the query once and keep the full result; the cells below only filter
# and aggregate this frame.
eb_hq_df = endpoint2df(sparql_endpoint)

In [4]:
# Number of rows (one per result binding) returned by the query.
# NOTE(review): later recorded outputs show row labels far above this count
# (e.g. 150568) and years other than 1771, so they appear to come from a
# different load of eb_hq_df — restart the kernel and re-run all cells to confirm.
len(eb_hq_df)

26429

In [5]:
# Keep only the rows published in 1771; "year" is compared against a string.
eb_hq_df_1771 = eb_hq_df.loc[eb_hq_df["year"].eq("1771")]

In [8]:
# All 1771 rows whose term is exactly "DERVIS".
is_dervis = eb_hq_df_1771["term"].eq("DERVIS")
dervis_1771 = eb_hq_df_1771.loc[is_dervis]
print(dervis_1771)

                                                     uri  year   
14756  https://w3id.org/eb/i/Article/992277653804341_...  1771  \
14757  https://w3id.org/eb/i/Article/992277653804341_...  1771   

                title edition   
14756  Edition 1,1771       1  \
14757  Edition 1,1771       1   

                                                    vuri volume numPages   
14756  https://w3id.org/eb/i/Volume/992277653804341_1...      2     1018  \
14757  https://w3id.org/eb/i/Volume/992277653804341_1...      2     1018   

      letters  part archive_filename page header    term   
14756     C-L  None                   434    DES  DERVIS  \
14757     C-L  None                   434    DER  DERVIS   

                                              definition numWords  
14756  a name given to all Mahommedan monks, though o...      471  
14757  a name given to all Mahommedan monks, though o...      471  


In [85]:
# Narrow the frame to the four columns needed to compare page headers.
eb_hq_year_vol_page_header = eb_hq_df.loc[:, ["year", "volume", "page", "header"]]

In [111]:
# Rows for page "103" of the 1810 publication year, across all volumes.
in_1810 = eb_hq_year_vol_page_header["year"] == "1810"
on_page_103 = eb_hq_year_vol_page_header["page"] == "103"
eb_hq_1810_vol_page_header = eb_hq_year_vol_page_header[in_1810 & on_page_103]
print(eb_hq_1810_vol_page_header)

       year volume page         header
56566  1810     15  103         NUMNUM
58033  1810      5  103  GCAMCAMERARIA
58043  1810      5  103  GCAMCAMERARIA
58303  1810      5  103  GCAMCAMERARIA
58428  1810      5  103  GCAMCAMERARIA
58488  1810      5  103  GCAMCAMERARIA
58519  1810      5  103  GCAMCAMERARIA
58761  1810      5  103  GCAMCAMERARIA
58867  1810      5  103  GCAMCAMERARIA
63516  1810     13  103         MELMEL
63540  1810     13  103         MELMEL
63642  1810     13  103         MELMEL
64508  1810     11  103            LAD
64602  1810     11  103            LAD
65091  1810     11  103            LAD
65227  1810     11  103            LAD
66977  1810      3  103           BEAJ
71711  1810      7  103         DAYDAY
71935  1810      7  103         DAYDAY
72773  1810     10  103         GREGRE
73691  1810     18  103          PBROB
73791  1810     18  103          PBROB
73986  1810     18  103        SAPJSAP
74138  1810     18  103        SAPJSAP
74147  1810     18  103  

In [86]:
# One row per distinct (year, volume, page, header) combination; the first
# occurrence of each combination is the one retained.
unique_eb_hq_year_vol_page_header = eb_hq_year_vol_page_header.drop_duplicates(
    subset=["year", "volume", "page", "header"],
    keep="first",
)

In [87]:
# Show the de-duplicated (year, volume, page, header) rows.
print(unique_eb_hq_year_vol_page_header)

        year volume page          header
0       1771      1  681          BLIBLO
1       1771      1  819          BUFBUI
2       1771      1  786          EOTBOU
3       1771      1  630          BARBAR
4       1771      1   56          AGHAGM
...      ...    ...  ...             ...
150564  1853     13  320         LEATHER
150565  1853     13  452  LIFEPRESERVERS
150566  1853     13  582           LOGIC
150567  1853     13   99        KILKENNY
150568  1853     13   23        JOSEPHUS

[31395 rows x 4 columns]


In [88]:
# Count how many distinct headers were recorded for each (page, volume, year);
# the input is already de-duplicated, so the group size is that count.
year_vol_page_header_counts = (
    unique_eb_hq_year_vol_page_header
    .groupby(["page", "volume", "year"])
    .size()
    .reset_index(name="header_count")
)

In [110]:
# 1810 pages that carry more than one distinct header.
is_1810 = year_vol_page_header_counts["year"] == "1810"
has_several_headers = year_vol_page_header_counts["header_count"] > 1
print(year_vol_page_header_counts[is_1810 & has_several_headers])

      page volume  year  header_count
18     100     18  1810             2
178    103     18  1810             2
231    104     18  1810             2
347    106     18  1810             2
404    107     18  1810             2
...    ...    ...   ...           ...
29966   89     10  1810             2
30231   92     17  1810             2
30562   97     18  1810             2
30630   98     18  1810             2
30700   99     18  1810             2

[649 rows x 4 columns]
