### FTP API data grab

This notebook contains the code for downloading a FromThePage (FTP) transcription export through the FromThePage API.

Still a work in progress

I am using Python's requests library because I have more facility in Python, and Windows PowerShell is a curse that I prefer not to suffer under when I can avoid it. If you are on a macOS or Linux/Unix system, the documentation on the FTP website explains how to accomplish this from the command line: https://content.fromthepage.com/project-owner-documentation/api-keys/ 

In [1]:
# hiding the API key
import os
import dotenv

# The .env file lives in the project folder, so make that folder the working
# directory first (machine-specific path: edit it for your own setup)
os.chdir("/Users/charl/JBPP")

# Read the .env file into the process environment, then pull the API key out of it
dotenv.load_dotenv()
JBPP_key = os.environ.get('JBPP_key')

In [3]:
# import required packages
import requests
import pandas as pd
import json
import re

# pieces of the API request: base URL, collection endpoint, and auth header
# NOTE(review): this base URL is plain http while the per-work export URLs used
# further down are https -- confirm whether this one should be https as well
root = "http://fromthepage.com/iiif"

# the endpoint is the only thing that needs editing:
# use IIIF slug found at bottom of "export" tab in FTP document set you want to export from
endpoint = "/collection/charlie-transfer-to-drupal"

# token-style Authorization header built from the key loaded from the .env file
apikey = JBPP_key
headers = {"Authorization": "Token token={}".format(apikey)}

In [5]:
# submit post request using requests library (operates same as cURL, just in Python)
# timeout: without one, requests waits indefinitely if the server stops answering
response = requests.post(root+endpoint, headers=headers, timeout=120)
# stop here with a clear HTTP error (bad key, wrong slug, server problem) rather
# than failing later with a confusing JSON / KeyError when the body is parsed
response.raise_for_status()


In [4]:
# to run if you wanna look at the raw text or check status

# print(response.status_code)
# should be 200
# print(response.text)

In [7]:
# Parse the JSON body of the response, then flatten it into a dataframe.
# record_path=['manifests'] makes json_normalize build one row per entry of the
# 'manifests' list, ignoring the metadata about the API call itself that is
# returned alongside it but is not connected to the actual doc set content
manifest_payload = json.loads(response.text)
response_df = pd.json_normalize(manifest_payload, record_path=['manifests'])

In [11]:
# if interested in taking a look at the dataframe:

# response_df.head()
# I'm curious if anything else will become metadata because there's more metadata in the bulk uploaded documents
# If it breaks later, refer to that column to see how to fix it

In [7]:
# don't run this cell (you can, you just don't need to)

#bunch_of_tuples = [] # this is where we'll store all the key (PJB ID) - value (Document Body) pairs to convert to a dataframe

#for i in range(len(response_df)): # iterates over each row of the dataframe - 
#    # there are other ways to do this but it's not prohibitively inefficient
#    url = response_df['@id'][i] # indexes into the value in the first column of the dataframe (the IIIF url)
#    cut = url.split('/')[4] # slices out the unique work_id - to be used to locate plaintext export
#    try:
#        pjb_id = response_df['metadata'][i][0]['value'] # tries to make key based on identifier aka PJB ID
#    except TypeError:
#       pjb_id = cut # if it fails, it instead makes key on the basis of the work_id (guaranteed to be unique)
#    new_url = f'https://fromthepage.com/iiif/{cut}/export/plaintext/verbatim' # url that hosts the plaintext export
#    final = requests.get(new_url) # get request on plaintext export url
#    bunch_of_tuples.append((pjb_id, final.text)) # appends key-value pair to dictionary
    

In [13]:
# testing for XHTML compatibility (I think it might be better than plaintext)
# For every work listed in response_df, download its HTML export and keep
# everything from the <title> tag onward, paired with the work's PJB ID.
bunch_of_tuples = [] # this is where we'll store all the variable pairs (PJB ID, Document Body) to convert to a dataframe

# seconds to wait on a silent server before giving up on one export
REQUEST_TIMEOUT = 60

# walk the two columns in step; this avoids label lookups such as
# response_df['@id'][i], which only work while the index is the default 0..n-1
for url, metadata in zip(response_df['@id'], response_df['metadata']):
    cut = url.split('/')[4] # slices out the unique work_id - to be used to locate html export
    try:
        pjb_id = metadata[0]['value'] # tries to make key based on identifier aka PJB ID
    except (TypeError, IndexError, KeyError):
        # metadata missing (NaN), empty, or without a 'value' entry:
        # fall back to the work_id (guaranteed to be unique)
        pjb_id = cut
    new_url = f'https://fromthepage.com/iiif/{cut}/export/html' # url that hosts the html export
    final = requests.get(new_url, timeout=REQUEST_TIMEOUT) # get request on html export url
    # an HTTP error page must not be stored as if it were a transcription
    final.raise_for_status()
    html = final.text
    title_position = html.find('<title>')
    # str.find returns -1 when there is no <title>; slicing from -1 would keep
    # only the last character, so keep the whole document in that case
    desired_content = html[title_position:] if title_position != -1 else html
    bunch_of_tuples.append((pjb_id, desired_content)) # appends to list of tuples

In [125]:
# if you wanna check out the list of tuples (should be same length as response_df)

# bunch_of_tuples

In [15]:
# build the dataframe from the (PJB ID, Document Body) pairs and label its two
# columns; the index is left as the default row numbers
column_labels = ['PJB ID', 'Document Body']
df_final = pd.DataFrame(data=bunch_of_tuples, columns=column_labels)

In [17]:
# show df_final as the cell output (columns: 'PJB ID' and 'Document Body')
df_final

Unnamed: 0,PJB ID,Document Body
0,PJB 2353,"<title> From Julian Bond to Vivian Henderson,..."
1,PJB 2654,<title> (2nd Copy) From Julian Bond to Camille...
2,PJB 2747,<title> (2nd copy) From Julian Bond to Gary Co...
3,PJB 2057,<title> (Carbon Copy) To Julian Bond from Dr. ...
4,PJB 2213,<title> (College Student Letter) To Julian Bon...
...,...,...
221,PJB 1874,"<title> To Julian Bond from Walter Kyle, 24 Ju..."
222,PJB 1609,"<title> To Julian Bond from Walter Pinkston, T..."
223,PJB 1493,"<title> To Julian Bond from Wilbur Hicks, ca. ..."
224,PJB 1517,"<title> To Julian Bond from William DuFresne, ..."


In [19]:
# to check how it looks with a two page doc
# newlist = [x for x in list if "To Julian Bond from Margaret Linton" in x]
# newlist

In [21]:
# pull the 'Document Body' column out as a plain Python list and look at the
# first entry to see what one raw HTML export looks like
body_column = df_final['Document Body']
mylist = body_column.tolist()
mylist[0]
# I wonder if the title tags would break this

'<title>  From Julian Bond to Vivian Henderson, 20 Feb 1969</title>\n    </head>\n\n    <body>\n    <h1 class="work-title"> From Julian Bond to Vivian Henderson, 20 Feb 1969</h1>\n    <div class="export-metadata"><span class="translation_missing" title="translation missing: en.export.show.html.erb.export_metadata, work:  From Julian Bond to Vivian Henderson, 20 Feb 1969, collection: The Papers of Julian Bond, time: 2025-02-24 00:28:34 +0000">Export Metadata</span>\n        <p><span class="translation_missing" title="translation missing: en.export.show.html.erb.identifier, work: PJB 2353">Identifier</span></p>\n      <p>\n        <span class="translation_missing" title="translation missing: en.export.show.html.erb.fromthepage_version, version: 22.10">Fromthepage Version</span>\n      </p>\n    </div>\n\n    <hr />\n    <h2 class="divider"><span class="translation_missing" title="translation missing: en.export.show.html.erb.page_transcripts">Page Transcripts</span></h2>\n\n    <div class

In [23]:
# Pull the transcription text and the contributor usernames out of each HTML
# export in mylist and combine them into one cleaned string per document.
editors_tag = 'small'
editors_tag_class = ' class="page-version-username"'
# title_tag = 'title'
# title_tag_class = ''
# I think I would prefer to match on title even if PJB ID is more robust because title is easier to extract
content_tag = 'div'
content_tag_class = ' class="page-content"'
tags = {editors_tag: editors_tag_class,
       # title_tag: title_tag_class,
       content_tag: content_tag_class}
final_list = []

for dirty in mylist:
    # every match of <tag class="...">...</tag>, keyed by tag name;
    # re.DOTALL lets a match run across line breaks
    # NOTE(review): the non-greedy pattern stops at the first closing tag, so
    # a <div> nested inside page-content would cut the match short -- confirm
    # the exports never nest divs there
    dictionary = {}
    for tag, tag_class in tags.items():
        reg_str = "<" + tag + tag_class + ">(.*?)</" + tag + ">"
        dictionary[tag] = re.findall(reg_str, dirty, re.DOTALL)

    # drop repeated usernames but keep first-seen order: a set() iterates in an
    # order that can change between kernel runs, which made the exported
    # "Thanks to ..." line differ from run to run
    dictionary[editors_tag] = list(dict.fromkeys(dictionary[editors_tag]))

    # join the pieces for each tag with spaces, remove newlines, trim outer spaces
    cleaned = {
        tag: ' '.join(pieces).replace('\n', '').strip(' ')
        for tag, pieces in dictionary.items()
    }

    check = (cleaned[content_tag]
             + "<p>Thanks to FromThePage transcription contributors: "
             + cleaned[editors_tag]
             + "</p>")
    final_list.append(check)
    

In [25]:
# overwrite the 'Document Body' column with the cleaned strings built in final_list
# (note: this modifies df_final in place, so the earlier display of it is now stale)
df_final['Document Body'] = final_list

In [27]:
# Normalise every identifier to the form 'PJB <number>'.
id_list = df_final['PJB ID'].to_list()

# Remove any spaces or PJBs for standardization, then re-attach one canonical
# 'PJB ' prefix. The loop variable is deliberately not called `id`: at notebook
# top level that name would replace the builtin id() for the rest of the session.
new_ids = [
    'PJB ' + raw_id.replace(" ", "").replace("PJB", "")
    for raw_id in id_list
]

print(new_ids)
df_final['PJB ID'] = new_ids
df_final

['PJB 2353', 'PJB 2654', 'PJB 2747', 'PJB 2057', 'PJB 2213', 'PJB 3252', 'PJB 5174', 'PJB 5014', 'PJB 4772', 'PJB 4773', 'PJB 4775', 'PJB 2147', 'PJB 2246', 'PJB 2217', 'PJB 2183', 'PJB 2151', 'PJB 2185', 'PJB 5395', 'PJB 4396', 'PJB 3743', 'PJB 3535', 'PJB 3652', 'PJB 3728', 'PJB 3840', 'PJB 5397', 'PJB 4440', 'PJB 3726', 'PJB 3540', 'PJB 3660', 'PJB 3735', 'PJB 3854', 'PJB 2866', 'PJB 2238', 'PJB 2805', 'PJB 2324', 'PJB 3290', 'PJB 2154', 'PJB 5673', 'PJB 5718', 'PJB 2710', 'PJB 4553', 'PJB 3261', 'PJB 4152', 'PJB 2432', 'PJB 2454', 'PJB 2479', 'PJB 2424', 'PJB 1938', 'PJB 2421', 'PJB 2441', 'PJB 2462', 'PJB 2443', 'PJB 2445', 'PJB 2368', 'PJB 2397', 'PJB 2367', 'PJB 2433', 'PJB 2444', 'PJB 2422', 'PJB 2437', 'PJB 2403', 'PJB 2468', 'PJB 2442', 'PJB 2456', 'PJB 1903', 'PJB 2035', 'PJB 1881', 'PJB 2003', 'PJB 1911', 'PJB 2023', 'PJB 1901', 'PJB 1850', 'PJB 2016', 'PJB 1842', 'PJB 1959', 'PJB 1869', 'PJB 1862', 'PJB 1997', 'PJB 1965', 'PJB 2027', 'PJB 1971', 'PJB 1985', 'PJB 1993', 'PJ

Unnamed: 0,PJB ID,Document Body
0,PJB 2353,"<p>February 20, 1969</p><p>Dear Dr. Henderson,..."
1,PJB 2654,"<p>October 23, 1969</p><p>Dear Mrs. Levy,</p><..."
2,PJB 2747,"<p>November 5, 1969</p><p>Dear Mr. Conner,</p>..."
3,PJB 2057,<p>memorandum to Julian Bond<br/>from Dr. H. M...
4,PJB 2213,<p>[Letterhead logo: Inner three-ringed circl...
...,...,...
221,PJB 1874,<p>Wedgewood High School<br/>3420 PINESTEAD RO...
222,PJB 1609,<p>[WESTERN UNION TELEGRAM]<br/>[left hand sid...
223,PJB 1493,"<p>FISK UNIVERSITY<br/>NASHVILLE, TENNESSEE 37..."
224,PJB 1517,<p><i><b>Economic Opportunity</b> </i><br/><i>...


In [43]:
# export to csv for transfer to Drupal!
import datetime
# just gonna do UTC minus 5 because timezones are a pain, and this doesn't need to be perfect
# so sometimes it'll be CDT and sometimes EST but so be it
# datetime.timezone.utc is used instead of datetime.UTC because the latter only
# exists on Python 3.11+; both names refer to the same UTC timezone object
date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours = 5)
# note: this prints the date as YYYYMMDD, while the file name below uses MMDDYYYY
print(f'{date:%Y%m%d}')
# written relative to the current working directory; index=False leaves the row numbers out
df_final.to_csv(f'export_to_drupal_{date:%m%d%Y}.csv', index=False)

20250223
