In [1]:
from bs4 import BeautifulSoup
import csv
import numpy as np
import pandas as pd
import requests
import sqlite3
import time

In [2]:
# Entry points used by this notebook:
#   total_url    -- the full LC newspaper directory listing (50 results per page)
#   loc_url      -- the same listing filtered to the Chronicling America collection
#   chron_am_url -- the title list published on the Chronicling America site itself
total_url = (
    "https://www.loc.gov/collections/"
    "directory-of-us-newspapers-in-american-libraries/"
    "?all=True&c=50"
)
loc_url = (
    "https://www.loc.gov/collections/"
    "directory-of-us-newspapers-in-american-libraries/"
    "?all=true&c=50&fa=partof_collection:chronicling+america&sp=1"
)
chron_am_url = "https://chroniclingamerica.loc.gov/" "newspapers/"

In [3]:
# Step 1: collect the LCCN titles listed on the LC site, filtered by the
# Chronicling America collection facet (3683 titles as of 7/20/22).
# The results are requested 1000 per page (c=1000), so result pages 1-4 are
# generated here. Turn c down to 50 while testing.
pages = [
    'https://www.loc.gov/collections/directory-of-us-newspapers-in-american-libraries/?all=true&c=1000&fa=partof_collection:chronicling+america&sp={}'.format(page_number)
    for page_number in range(1, 5)
]

In [4]:
# Use Beautiful Soup to collect the item URL of every title on each LC
# search-results page. The URLs are accumulated in `links`.
links = []
for page in pages:
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops the run on an HTTP error instead of parsing an
    # error page and silently collecting no links from it.
    response = requests.get(page, timeout=120)
    response.raise_for_status()
    time.sleep(2)  # pause between requests so the site is not hammered
    soup = BeautifulSoup(response.text, "html.parser")
    for title in soup.find_all("span", "item-description-title"):
        anchor = title.find("a")
        # Skip title spans that carry no hyperlink rather than crashing on them.
        if anchor is None or not anchor.get("href"):
            continue
        link = anchor["href"]
        links.append(link)

In [5]:
# Write the scraped LC item URLs to a single-column CSV ("lccn" header).
header = ["lccn"]
# newline="" is required by the csv module: without it the writer's own "\r\n"
# line endings are translated again on Windows, producing a blank line after
# every row. The encoding is pinned so the file does not depend on the
# platform's default code page.
with open("lc_output.csv", "w", newline="", encoding="utf-8") as f:
    write = csv.writer(f)
    write.writerow(header)
    # One row per link; each row is a one-element list.
    write.writerows([link] for link in links)

In [6]:
# The Chronicling America site, however, lists more titles (3,758 as of 7/20/22).
# The same exercise as above is repeated against that site so the two LCCN
# title lists can be compared. A bit more context is captured per title
# (state, earliest/latest issue, number of issues, and the related links).

pages = 'https://chroniclingamerica.loc.gov/newspapers/'

# Fetch the title list. A timeout avoids hanging on a stalled connection and
# raise_for_status() stops the run on an HTTP error instead of parsing an
# error page.
response = requests.get(pages, timeout=120)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# The titles are read from the first <table> on the page.
find_table = soup.find('table')
if find_table is None:
    raise ValueError("No <table> found at {}; the page layout may have changed".format(pages))
rows = find_table.find_all('tr')

chronam_data = []

for i in rows:
    table_data = i.find_all('td')
    link = i.find_all("a")
    datum = [j.text for j in table_data]      # text of every cell in the row
    dats = [j.get("href") for j in link]      # href of every link in the row
    # Interleave cell text and hrefs: text0, href0, text1, href1, ...
    # zip() stops at the shorter list, so a row with fewer links than cells
    # (or the reverse) loses its trailing values.
    zipped = [item for sublist in zip(datum, dats) for item in sublist]
    # Rows without any <td>/<a> pairs (e.g. a header row of <th> cells) yield
    # nothing; skip them so no empty records are collected.
    if zipped:
        chronam_data.append(zipped)


In [7]:
# Write the scraped Chronicling America rows to chronam.csv.
# The column names follow the interleaved order (cell text, link href, ...)
# in which each row of chronam_data was assembled.
header = ['state', 'state_link', 'title', 'lccn', 'browse', 'browse_link', 'issue_num', 'first_issue_link', 
          'first_issue', 'last_issue_link', 'last_issue', 'lccn_link' ]
# newline="" is required by the csv module (otherwise Windows gets a blank line
# after every row). The encoding is pinned so the scraped title text is written
# the same way on every platform.
with open("chronam.csv", "w", newline="", encoding="utf-8") as f:
    write = csv.writer(f)
    write.writerow(header)
    write.writerows(chronam_data)

In [8]:
#####

In [9]:
# Next, load both CSVs into pandas so the data can be cleaned up.
# Because both lists identify titles by LCCN, they can be joined on that
# value to find titles that are on one site but not yet on the other.
lcdf, cadf = (
    pd.read_csv(csv_path) for csv_path in ("lc_output.csv", "chronam.csv")
)

# Quick look at the LC frame.
lcdf.head()

Unnamed: 0,lccn
0,https://www.loc.gov/item/sn85026945/
1,https://www.loc.gov/item/sn93067670/
2,https://www.loc.gov/item/sn93067668/
3,https://www.loc.gov/item/sn84026853/
4,https://www.loc.gov/item/sn85042527/


In [10]:
# Reduce both lccn columns to the bare LCCN identifier.
#   chronam.csv holds the title path, e.g. "/lccn/sn86072192/"
#   lc_output.csv holds the item URL, e.g. "https://www.loc.gov/item/sn85026945/"
# Anchored patterns are used on purpose:
#   * str.strip('/lccn/') treats its argument as a SET of characters, so it
#     would also eat any leading/trailing 'l', 'c' or 'n' of the identifier.
#   * the greedy r'.*m' deletes everything up to the LAST 'm' in the URL, which
#     would cut into an identifier that itself contains an 'm'.
cadf["lccn"] = cadf["lccn"].str.replace(r"^/lccn/", "", regex=True).str.strip("/")
lcdf["lccn"] = lcdf["lccn"].str.replace(r"^.*/item/", "", regex=True).str.strip("/")

In [11]:
# Sanity check of the cleaned LCCNs (temporary -- remove once verified).
lcdf.head(n=5)

Unnamed: 0,lccn
0,sn85026945
1,sn93067670
2,sn93067668
3,sn84026853
4,sn85042527


In [12]:
# Display lcdf keyed by LCCN. set_index() returns a new frame that is only
# shown here; it is not assigned, so lcdf keeps 'lccn' as a regular column.
lcdf.set_index(keys="lccn")

sn85026945
sn93067670
sn93067668
sn84026853
sn85042527
...
sn84024547
2017218620
2017218621
2017218622
sn91037345


In [13]:
# Display cadf keyed by LCCN. set_index() returns a new frame that is only
# shown here; it is not assigned, so cadf keeps 'lccn' as a regular column.
cadf.set_index(keys="lccn")

Unnamed: 0_level_0,state,state_link,title,browse,browse_link,issue_num,first_issue_link,first_issue,last_issue_link,last_issue,lccn_link
lccn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sn86072192,Alabama,/newspapers/alabama/,"The age-herald. [volume]Birmingham, Ala., 1897...",,/lccn/sn86072192/issues/,1630,/lccn/sn86072192/1897-08-01/ed-1/,1897-08-01,/lccn/sn86072192/1902-05-20/ed-1/,1902-05-20,/lccn/sn86072192/
sn84021903,Alabama,/newspapers/alabama/,Alabama state intelligencer. [volume]Tuscaloos...,,/lccn/sn84021903/issues/,50,/lccn/sn84021903/1831-01-01/ed-1/,1831-01-01,/lccn/sn84021903/1831-12-24/ed-1/,1831-12-24,/lccn/sn84021903/
sn84020639,Alabama,/newspapers/alabama/,"Birmingham age-herald. [volume]Birmingham, Ala...",,/lccn/sn84020639/issues/,423,/lccn/sn84020639/1894-07-01/ed-1/,1894-07-01,/lccn/sn84020639/1895-10-03/ed-1/,1895-10-03,/lccn/sn84020639/
sn85038485,Alabama,/newspapers/alabama/,"The Birmingham age-herald. [volume]Birmingham,...",,/lccn/sn85038485/issues/,8237,/lccn/sn85038485/1902-05-21/ed-1/,1902-05-21,/lccn/sn85038485/1924-12-31/ed-1/,1924-12-31,/lccn/sn85038485/
sn85044812,Alabama,/newspapers/alabama/,"Birmingham state herald.Birmingham, Ala., 1895...",,/lccn/sn85044812/issues/,570,/lccn/sn85044812/1895-10-04/ed-1/,1895-10-04,/lccn/sn85044812/1897-07-31/ed-1/,1897-07-31,/lccn/sn85044812/
...,...,...,...,...,...,...,...,...,...,...,...
sn92067235,Wyoming,/newspapers/wyoming/,"Rawlins semi-weekly Republican.Rawlins, Carbon...",,/lccn/sn92067235/issues/,344,/lccn/sn92067235/1898-01-05/ed-1/,1898-01-05,/lccn/sn92067235/1901-08-03/ed-1/,,
sn83002748,Wyoming,/newspapers/wyoming/,"The Saratoga sun.Saratoga, Carbon County, Wyo....",,/lccn/sn83002748/issues/,1748,/lccn/sn83002748/1891-07-14/ed-1/,1891-07-14,/lccn/sn83002748/1926-02-25/ed-1/,1926-02-25,/lccn/sn83002748/
sn83025232,Wyoming,/newspapers/wyoming/,"Wyoming state tribune. [volume]Cheyenne, Wyo.,...",,/lccn/sn83025232/issues/,2,/lccn/sn83025232/1918-11-11/ed-1/,1918-11-11,/lccn/sn83025232/1919-10-30/ed-1/,1919-10-30,/lccn/sn83025232/
sn83025231,Wyoming,/newspapers/wyoming/,"The Wyoming tribune. [volume]Cheyenne, Wyo., 1...",,/lccn/sn83025231/issues/,199,/lccn/sn83025231/1888-05-10/ed-1/,1888-05-10,/lccn/sn83025231/1910-11-02/ed-1/,1910-11-02,/lccn/sn83025231/


In [14]:
# Outer-join the two title lists on LCCN. The 'Exist' indicator column records
# where each LCCN was found: 'left_only' (LC list only), 'right_only'
# (Chron Am list only) or 'both'.
# The join key is named explicitly: without `on`, merge() joins on every column
# name the two frames happen to share, so an extra shared column would silently
# change the match.
diff_df = pd.merge(lcdf, cadf, how='outer', on='lccn', indicator='Exist')

In [15]:
# Drop the titles found in both lists, leaving only the rows whose 'Exist'
# value is something other than 'both'.
diff_df = diff_df.loc[lambda frame: frame["Exist"] != "both"]
diff_df

Unnamed: 0,lccn,state,state_link,title,browse,browse_link,issue_num,first_issue_link,first_issue,last_issue_link,last_issue,lccn_link,Exist
364,sn83025521,,,,,,,,,,,,left_only
374,2010218508,,,,,,,,,,,,left_only
653,sn82016478,,,,,,,,,,,,left_only
666,02027093,,,,,,,,,,,,left_only
889,sn84025954,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,sn92067235,Wyoming,/newspapers/wyoming/,"Rawlins semi-weekly Republican.Rawlins, Carbon...",,/lccn/sn92067235/issues/,344.0,/lccn/sn92067235/1898-01-05/ed-1/,1898-01-05,/lccn/sn92067235/1901-08-03/ed-1/,,,right_only
3879,sn83002748,Wyoming,/newspapers/wyoming/,"The Saratoga sun.Saratoga, Carbon County, Wyo....",,/lccn/sn83002748/issues/,1748.0,/lccn/sn83002748/1891-07-14/ed-1/,1891-07-14,/lccn/sn83002748/1926-02-25/ed-1/,1926-02-25,/lccn/sn83002748/,right_only
3880,sn83025232,Wyoming,/newspapers/wyoming/,"Wyoming state tribune. [volume]Cheyenne, Wyo.,...",,/lccn/sn83025232/issues/,2.0,/lccn/sn83025232/1918-11-11/ed-1/,1918-11-11,/lccn/sn83025232/1919-10-30/ed-1/,1919-10-30,/lccn/sn83025232/,right_only
3881,sn83025231,Wyoming,/newspapers/wyoming/,"The Wyoming tribune. [volume]Cheyenne, Wyo., 1...",,/lccn/sn83025231/issues/,199.0,/lccn/sn83025231/1888-05-10/ed-1/,1888-05-10,/lccn/sn83025231/1910-11-02/ed-1/,1910-11-02,/lccn/sn83025231/,right_only


In [16]:
# Save the non-matching titles (row index included) to diff.csv.
diff_df.to_csv(path_or_buf="diff.csv")

In [None]:
## Notes:
## left_only: the title exists on the LC site but not on Chron Am -- possibly a later addition?
## right_only: the title appears only on Chron Am -- perhaps a metadata tagging issue?
## The title counts shown on the Chron Am site do not match -- they appear to be hardcoded and not updated.
## Still need to figure out how filtering works on the LC site.