In [33]:
import requests
import json
import math
from itertools import chain
import pandas as pd
import os

In [2]:
# Manifest URL for the search restricted to ispartof_facet "Animal Turn" with fulltext_bs=true.
# get_item_count() is called on this URL to read the manifest's "total" field.
collection_url = "https://d.lib.ncsu.edu/collections/catalog/manifest?f[fulltext_bs][]=true&f[ispartof_facet][]=Animal+Turn&page=1"

In [3]:
# Manifest URL of one individual item, kept as an example of the per-item URL shape
# (".../catalog/<item id>/manifest"). Not referenced by the other visible cells.
single_url = "https://d.lib.ncsu.edu/collections/catalog/mc00456-001-bx0004-004-001/manifest"

In [4]:
# Paged manifest endpoint for the same search (brackets percent-encoded).
# The trailing page=N query parameter selects the page; page=1 is the first page.
collection_base_url = "https://d.lib.ncsu.edu/collections/catalog/manifest/page?f%5Bfulltext_bs%5D%5B%5D=true&f%5Bispartof_facet%5D%5B%5D=Animal+Turn&page=1"

In [5]:
# Fetch the first manifest page; `data` holds the raw response body that the
# following cell parses as JSON.
res = requests.get(collection_base_url)
data = res.content

In [6]:
# Parse the response body and show the "members" list, i.e. the manifest
# entries ('@id', '@type', 'label', ...) listed on this page.
manifest = json.loads(data)
manifest["members"]

[{'@id': 'https://d.lib.ncsu.edu/collections/catalog/mc00456-001-bx0004-004-001/manifest',
  '@type': 'sc:Manifest',
  'label': '"In at the death": Freeman on cruelty in sport',
  'dcterms:modified': '2020-01-25T05:47:13Z'},
 {'@id': 'https://d.lib.ncsu.edu/collections/catalog/mc00344_118824_20200131_1256/manifest',
  '@type': 'sc:Manifest',
  'label': '"Some Uses of Laboratory Animals"',
  'dcterms:modified': '2020-02-17T13:59:09Z'},
 {'@id': 'https://d.lib.ncsu.edu/collections/catalog/mc00456-001-bx0006-034-001/manifest',
  '@type': 'sc:Manifest',
  'label': '"The Times" article on the results of experiments on living animals',
  'dcterms:modified': '2020-01-25T05:47:57Z'},
 {'@id': 'https://d.lib.ncsu.edu/collections/catalog/mc00456-001-bx0007-013-001/manifest',
  '@type': 'sc:Manifest',
  'label': 'A bill for the abolition of vivisection',
  'dcterms:modified': '2020-01-25T05:47:35Z'},
 {'@id': 'https://d.lib.ncsu.edu/collections/catalog/mc00456-001-bx0002-009-001/manifest',
  '@ty

In [7]:
def get_id_from_url(url):
    """Return the second-to-last "/"-separated segment of ``url``.

    For an item manifest URL such as ".../catalog/<item id>/manifest" this is
    the item id. Raises IndexError when ``url`` contains no "/".
    """
    # Only the last two separators matter, so split from the right.
    tail_segments = url.rsplit("/", 2)
    return tail_segments[-2]

In [25]:
def get_ids_titles_from_manifest(manifest_url, timeout=30):
    """Fetch one manifest page and return its items as (id, title) tuples.

    Args:
        manifest_url: URL of a JSON manifest with a top-level "members" list
            whose entries carry "@id" (an item manifest URL) and "label".
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of (item id, label) tuples in the order the members are listed.
        The item id is extracted from "@id" with get_id_from_url().

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the JSON lacks "members", or a member lacks "@id"/"label".
    """
    res = requests.get(manifest_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of trying to parse an error
    # page as JSON further down.
    res.raise_for_status()
    manifest = json.loads(res.content)
    return [
        (get_id_from_url(member["@id"]), member["label"])
        for member in manifest["members"]
    ]

In [26]:
# Try the helper on the first page of the collection.
get_ids_titles_from_manifest(collection_base_url)

[('mc00456-001-bx0004-004-001',
  '"In at the death": Freeman on cruelty in sport'),
 ('mc00344_118824_20200131_1256', '"Some Uses of Laboratory Animals"'),
 ('mc00456-001-bx0006-034-001',
  '"The Times" article on the results of experiments on living animals'),
 ('mc00456-001-bx0007-013-001', 'A bill for the abolition of vivisection'),
 ('mc00456-001-bx0002-009-001', 'A call for more helpers'),
 ('mc00456-001-bx0002-023-001', 'A concise handbook on cruel sports'),
 ('mc00456-001-bx0002-024-001',
  'A correspondence in "The Daily Mail" between Sir Victor Horsely and Dr. Walter R. Hadwen on vivisection, September 1908'),
 ('mc00620-001-bx0001-034-001', 'A dangerous ideal'),
 ('mc00456-001-bx0002-042-001',
  'A debate on is vivisection immoral, cruel, useless and unscientific?'),
 ('mc00456-001-bx0002-043-001',
  'A debate on should vivisection be abolished?'),
 ('mc00620-001-bx0001-063-001', 'A letter to children'),
 ('mc00620-001-bx0001-064-001', 'A little mourner never forgotten'),
 (

In [12]:
def get_item_count(manifest_url, timeout=30):
    """Return the "total" field of the JSON manifest at ``manifest_url``.

    Args:
        manifest_url: URL of a JSON manifest that has a top-level "total" key.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The value stored under "total" in the manifest.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the manifest has no "total" key.
    """
    res = requests.get(manifest_url, timeout=timeout)
    # Surface HTTP failures directly rather than as a JSON decoding error.
    res.raise_for_status()
    manifest = json.loads(res.content)
    return manifest["total"]

In [13]:
# Total number of items reported by the collection manifest.
get_item_count(collection_url)

1006

In [14]:
# Page count as a float; the paging loop rounds it up with math.ceil.
# NOTE(review): assumes the paged endpoint returns 20 items per page -- confirm.
items_per_page = 20
total_items = get_item_count(collection_url)
num_pages = total_items / items_per_page
num_pages

50.3

In [15]:
# Fetch every manifest page (pages are numbered from 1) and keep one list of
# (id, title) tuples per page; the pages are flattened in a later cell.
last_page = math.ceil(num_pages)
all_items = [
    get_ids_titles_from_manifest(
        f"https://d.lib.ncsu.edu/collections/catalog/manifest/page?f%5Bfulltext_bs%5D%5B%5D=true&f%5Bispartof_facet%5D%5B%5D=Animal+Turn&page={page_number}"
    )
    for page_number in range(1, last_page + 1)
]

In [16]:
# Concatenate the per-page lists into a single list of (id, title) tuples.
all_items_flat = list(chain.from_iterable(all_items))

In [18]:
# Number of (id, title) pairs gathered across all pages.
len(all_items_flat)

1006

In [20]:
# Number of distinct (id, title) pairs; if it equals len(all_items_flat),
# no pair was returned more than once across the pages.
len(set(all_items_flat))

1006

In [22]:
# Peek at the first five (id, title) pairs.
all_items_flat[:5]

[('mc00456-001-bx0004-004-001',
  '"In at the death": Freeman on cruelty in sport'),
 ('mc00344_118824_20200131_1256', '"Some Uses of Laboratory Animals"'),
 ('mc00456-001-bx0006-034-001',
  '"The Times" article on the results of experiments on living animals'),
 ('mc00456-001-bx0007-013-001', 'A bill for the abolition of vivisection'),
 ('mc00456-001-bx0002-009-001', 'A call for more helpers')]

In [21]:
# Example OCR text URL. The ocr_links cell builds URLs of this same shape:
# /ocr/<first two characters of the id>/<id>/<id>.txt
ocr_link_example = "https://ocr.lib.ncsu.edu/ocr/mc/mc00456-001-bx0004-004-001/mc00456-001-bx0004-004-001.txt"

In [27]:
# Extend each (id, title) pair with the id's first two characters, which the
# OCR URLs use as a path prefix.
ids_titles_pre = [
    (item_id, item_title, item_id[:2])
    for item_id, item_title in all_items_flat
]
ids_titles_pre[:10]

[('mc00456-001-bx0004-004-001',
  '"In at the death": Freeman on cruelty in sport',
  'mc'),
 ('mc00344_118824_20200131_1256', '"Some Uses of Laboratory Animals"', 'mc'),
 ('mc00456-001-bx0006-034-001',
  '"The Times" article on the results of experiments on living animals',
  'mc'),
 ('mc00456-001-bx0007-013-001',
  'A bill for the abolition of vivisection',
  'mc'),
 ('mc00456-001-bx0002-009-001', 'A call for more helpers', 'mc'),
 ('mc00456-001-bx0002-023-001', 'A concise handbook on cruel sports', 'mc'),
 ('mc00456-001-bx0002-024-001',
  'A correspondence in "The Daily Mail" between Sir Victor Horsely and Dr. Walter R. Hadwen on vivisection, September 1908',
  'mc'),
 ('mc00620-001-bx0001-034-001', 'A dangerous ideal', 'mc'),
 ('mc00456-001-bx0002-042-001',
  'A debate on is vivisection immoral, cruel, useless and unscientific?',
  'mc'),
 ('mc00456-001-bx0002-043-001',
  'A debate on should vivisection be abolished?',
  'mc')]

In [28]:
# One OCR text URL per item, in the same order as ids_titles_pre.
ocr_links = [
    f"https://ocr.lib.ncsu.edu/ocr/{prefix}/{item_id}/{item_id}.txt"
    for item_id, _title, prefix in ids_titles_pre
]
ocr_links[40:80]

['https://ocr.lib.ncsu.edu/ocr/mc/mc00344-001-bx0001_1-008-000/mc00344-001-bx0001_1-008-000.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00344-001-bx0001_1-006-000/mc00344-001-bx0001_1-006-000.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00344-001-bx0001_1-007-000/mc00344-001-bx0001_1-007-000.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00344-001-bx0001_2-001-000/mc00344-001-bx0001_2-001-000.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00344-001-bx0001_2-002-000/mc00344-001-bx0001_2-002-000.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00620-001-bx0001-008-001/mc00620-001-bx0001-008-001.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00620-001-bx0001-011-002/mc00620-001-bx0001-011-002.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00620-001-bx0001-011-001/mc00620-001-bx0001-011-001.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00456-001-bx0005-007-004/mc00456-001-bx0005-007-004.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc00620-001-hb0002-018-004/mc00620-001-hb0002-018-004.txt',
 'https://ocr.lib.ncsu.edu/ocr/mc/mc0062

In [30]:
# Split the (id, title, prefix) triples into three parallel lists, one per column.
ids = [item_id for item_id, _title, _prefix in ids_titles_pre]
titles = [item_title for _item_id, item_title, _prefix in ids_titles_pre]
prefixes = [prefix for _item_id, _title, prefix in ids_titles_pre]

In [31]:
# Assemble the item table: one row per item, one column per parallel list.
item_columns = {
    "id": ids,
    "title": titles,
    "prefix": prefixes,
    "ocr_url": ocr_links,
}
df = pd.DataFrame(item_columns)
df.head()

Unnamed: 0,id,title,prefix,ocr_url
0,mc00456-001-bx0004-004-001,"""In at the death"": Freeman on cruelty in sport",mc,https://ocr.lib.ncsu.edu/ocr/mc/mc00456-001-bx...
1,mc00344_118824_20200131_1256,"""Some Uses of Laboratory Animals""",mc,https://ocr.lib.ncsu.edu/ocr/mc/mc00344_118824...
2,mc00456-001-bx0006-034-001,"""The Times"" article on the results of experime...",mc,https://ocr.lib.ncsu.edu/ocr/mc/mc00456-001-bx...
3,mc00456-001-bx0007-013-001,A bill for the abolition of vivisection,mc,https://ocr.lib.ncsu.edu/ocr/mc/mc00456-001-bx...
4,mc00456-001-bx0002-009-001,A call for more helpers,mc,https://ocr.lib.ncsu.edu/ocr/mc/mc00456-001-bx...


In [32]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# data/ exists before writing (no-op when it is already there).
os.makedirs("data", exist_ok=True)
df.to_csv("data/item-info.csv")

In [35]:
# Create the download directory; exist_ok makes the cell safe to re-run and
# avoids the check-then-create race of testing os.path.exists first.
os.makedirs("texts", exist_ok=True)

In [46]:
# Download the OCR text of every item into texts/<id>.txt.
# Items whose request fails are recorded instead of being written to disk, so
# an HTTP error page is never saved as if it were OCR text.
failed_downloads = []
for (item_id, url) in zip(ids, ocr_links):
    # The timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(url, timeout=30)
    if not res.ok:
        failed_downloads.append((item_id, res.status_code))
        continue
    with open(f"texts/{item_id}.txt", 'wb') as f:
        f.write(res.content)

# Stay silent when everything succeeded; otherwise report what is missing.
if failed_downloads:
    print(f"{len(failed_downloads)} download(s) failed: {failed_downloads}")
     

In [47]:
import glob

In [48]:
# Count the .txt files now present in texts/ to compare against the number of items.
len(glob.glob("texts/*.txt"))

1006