In [1]:
import pandas as pd
# Show complete cell text and every row when a frame is displayed.
# Option names are written out in full: pandas resolves abbreviated names by
# pattern matching, which can stop working if a similarly named option is added.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
import numpy as np
import re
import nltk
import glob

In [2]:
# Function that calculates "points" for a card to be a "main entry" vs. "narrative," returns True if it's a "main entry"
def is_main_entry(index, row, df):
    """Classify one catalog card as a "main entry" (True) or a "narrative" card (False).

    A card whose author differs from the previous card is always a main entry.
    Otherwise the card collects "main entry" and "narrative" points from a set
    of text heuristics and is a main entry when main_entry_pts >= narrative_pts.

    Parameters
    ----------
    index : int
        Position of the card in ``df``; it is used with ``df.iloc`` to look at
        the neighbouring cards, so it must be a 0-based offset.
    row : mapping with 'Name' and 'Text'
        The card itself. Both values must be non-empty strings (the caller
        filters out blank cards). The row is not modified.
    df : pandas.DataFrame
        The whole drawer, with 'Name' and 'Text' columns.

    Returns
    -------
    bool
        True when the card is a main entry.
    """
    # The first card of a drawer has no predecessor -> automatic "main entry"
    if index == 0:
        return True

    text = row['Text']
    name = row['Name']
    prev_card = df.iloc[index - 1]

    # Previous and next card share an author but this card does not -> treat the
    # name as a misspelling and use the neighbours' name for the checks below.
    # (Only the local value changes; the frame itself is left untouched.)
    if index < len(df) - 1 and prev_card['Name'] == df.iloc[index + 1]['Name'] and name != prev_card['Name']:
        name = prev_card['Name']
    # New author -> automatic "main entry"
    elif name != prev_card['Name']:
        return True

    # Calculate "points" for classification
    main_entry_pts, narrative_pts = 0, 0

    # Check length, longer -> narrative, shorter -> main entry
    if len(text) > 300:
        narrative_pts += 1
    elif len(text) < 200:
        main_entry_pts += 1

    # Check for misc. main entry identifiers ("GUIDE", "Section X", "X items") -> main entry
    if "Section" in text:
        main_entry_pts += 1
    if "items" in text:
        main_entry_pts += 1
    # "GUID" also covers "GUIDE"; "GUDE" catches a common OCR misreading
    upper_text = text.upper()
    if "GUID" in upper_text or "GUDE" in upper_text:
        main_entry_pts += 3

    # Check for misc. narrative identifiers
    if "FOR INFORMATION ABOUT THIS COLLECTION" in text:
        narrative_pts += 5
    if "Addition" in text or "added" in text:
        narrative_pts += 1
    if not re.search(r"\d+", text):
        narrative_pts += 1
    prev_text = prev_card['Text']
    # The previous card may be blank (NaN), so only compare real strings
    if isinstance(text, str) and isinstance(prev_text, str):
        # Matches beginning of previous card
        if text[:50] == prev_text[:50]:
            narrative_pts += 1
        # Previous card indicates continuation of narrative onto current card
        if "(Continued on next card)" in prev_text or "SEE NEXT CRD" in prev_text or "Go to following card" in prev_text:
            narrative_pts += 5

    # Check for capitalized last name & card number at beginning of text
    # (Card 2, JONES, Smith, John 3, etc.) -> narrative
    if re.search(r"Card [0-9]+", text):
        narrative_pts += 2
    elif name.split(",")[0].upper() in text:
        narrative_pts += 1
    # The author name is interpolated (and escaped, since names contain "." and
    # other regex metacharacters) so that "<Name> <number>" is what gets matched.
    elif re.search(re.escape(name) + r" \d+", text):
        narrative_pts += 2

    # Check for "sentence format" (pos only present in narrative [verb, adj.]) -> narrative
    narr_pos = ["VB", "JJ", "PRP", "RB", "IN", "DT"]
    for _token, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        if tag in narr_pos:
            narrative_pts += .3

    # Check for accessioned date (e.g., 13 MAR 97, 10-10-69) -> main entry
    if re.search(r"\d{1,2}-\d{1,2}-\d{2}", text):
        main_entry_pts += 1
    elif re.search(r"\d{1,2}\s[A-Z]{3}\s\d{2}", text):
        main_entry_pts += 2
    elif re.search(r"[A-Z]{3}\s\d{1,2}\s\d{4}", text):
        main_entry_pts += 2

    # Determine card classification (ties count as a main entry)
    return main_entry_pts >= narrative_pts

In [3]:
def sort_collection(df):
    """Number the cards of one drawer by collection and mark collection headers.

    Every non-blank card is classified with ``is_main_entry``; each main entry
    starts a new collection. Three columns are added to ``df`` (which is
    modified in place and also returned):

    * ``Collection`` - 0-based collection number, or -1 for a blank card
      (missing/empty 'Text' or 'Name').
    * ``Page_No_Within_Drawer`` - the frame's index value for the card.
    * ``Collection_Head`` - the card's 'Name' on the first card of each
      collection, "" on every other card (including blank cards).

    Parameters
    ----------
    df : pandas.DataFrame
        One drawer with 'Name' and 'Text' columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.
    """
    collection_num, collection_list = -1, []
    # is_main_entry looks at neighbouring cards with df.iloc, so it needs the
    # card's position; the iterrows label only equals it for a 0..n-1 index.
    for position, (_label, row) in enumerate(df.iterrows()):
        # Blank cards belong to no collection
        if pd.isna(row['Text']) or pd.isna(row['Name']) or row['Text'] == "" or row['Name'] == "":
            collection_list.append(-1)
            continue
        # Main entries signify a new collection
        if is_main_entry(position, row, df):
            collection_num += 1
        collection_list.append(collection_num)

    # Add column to dataframe with collections numbered
    df['Collection'] = collection_list
    # Keep the original index as the page number within the drawer
    df["Page_No_Within_Drawer"] = df.index

    # Collection numbers never decrease within a drawer, so the first card that
    # carries a number is the collection's header. Blank cards (-1) are never
    # headers and do not make the card after them look like a new collection.
    collection = df['Collection']
    starts_collection = (collection != -1) & ~collection.duplicated()
    df["Collection_Head"] = df["Name"].where(starts_collection, "")
    return df

In [None]:
# Sort all drawers and combine them
# Number of drawer files to process; set to None to process every drawer.
DRAWER_LIMIT = 2
# Forward slashes work on every platform. glob returns matches in arbitrary
# order, so the names are sorted to make the drawer order reproducible.
# NOTE(review): the link/drawer-number cell assumes the drawers are in the same
# order as identifier_list - confirm the file names sort that way.
file_list_name = sorted(glob.glob("catalog/file_name_checked/*"))
# Slice the names before reading so that skipped drawers are never loaded
file_list = [pd.read_csv(i) for i in file_list_name[:DRAWER_LIMIT]]
sorted_list = [sort_collection(i) for i in file_list]
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each drawer's own row numbers
all_text = pd.concat(sorted_list, ignore_index=True)


In [8]:
# Checkpoint the sorted cards (without the index) before the Year/Loc columns are derived
all_text.to_csv(path_or_buf="all_sorted_before_year_column.csv", index=False)

In [11]:
# Reload the checkpoint; the frame comes back with a fresh 0..n-1 index
all_text = pd.read_csv(filepath_or_buffer="all_sorted_before_year_column.csv")

In [12]:
# add an is_collection_header simple column to label easily
# A card is a collection head (1) when it is not blank (Collection != -1) and
# its collection number differs from that of the closest non-blank card above
# it. Blank cards are always 0 and are skipped when looking backwards, so a
# blank card neither becomes a head nor hides a new collection that follows it.
collection_ids = all_text["Collection"]
is_valid_card = collection_ids != -1
# Collection number of the last non-blank card strictly above each row
previous_valid_id = collection_ids.where(is_valid_card).ffill().shift()
all_text["Is_collection_head"] = (is_valid_card & (collection_ids != previous_valid_id)).astype(int)
all_text.head()


Unnamed: 0,Name,Text,Collection,Page_No_Within_Drawer,Collection_Head,Is_collection_head
0,A. B. Davis and Company,"A. B. Davis and Company (Philadelphia, Pa. ) See Davis (A. B.) and Company",0,0,A. B. Davis and Company,1
1,A.H. Roscoe,"AeHe Roscoe (Firm: Nashville, Tenne )e Journal, 1853, Septe-1857, Dece 1 item( 800 ppe )e Wholesale and retail druggist and dealer in paints, oils, and dyestuffse Summary: Journal (account book ) documents the sale of chemical and herbal drugs, paint and painting supplies, dyestuffs, personal and household supplies, and garden seeds to individuals and businessese le Drugstores--Tennesseee 2e Paint shops--Equipment and supplies-—- Tennesseee 3e Dyes and dyeinge 46 Household supplies--Tennesseee 5c Herbs--Therapeutic usee 6+ Seed industry and trade--Tennessees 7e Nashville ite® nne )—--Commercee 20 MAY QO1 23804905 NDHYme",1,1,A.H. Roscoe,1
2,Abbeville District,Abbeville District (8.C.) See South Carolina. Abbeville District,2,2,Abbeville District,1
3,"Abbott, William B","Abbott, William B. Papers, 1862-1864 Frederick Co., Va. Section A 5-16-57 GUIDE 10 items",3,3,"Abbott, William B",1
4,"Abbott, William B","Abbott, William B. Papers, 1862-1864, Fre- Gerick Co., Va. 10 items. Sketch These are the papers of William B. Abbott, evidently a well-to-do farmer of Frederick Co., Va. There are several documents concerned with the evaluation of damage done to his property by C. S. A. troops in 1862, and.receipts in 1864 for hay bought from Abbott at various times in Aug., 1864 by the C. S. A. Army.",3,4,,0


In [13]:
# Create a year column for main entries
main_entries = all_text[all_text["Is_collection_head"] == 1]
ind_main_entries = main_entries.index

# Date patterns, tried in order; a later pattern is only used for cards on
# which every earlier one found nothing. Raw strings keep the regex escapes
# intact. The open-ended "1850-ca." form is tried before the bare year,
# because the bare-year pattern matches every text that form can match.
year_patterns = [
    r"(1\d{3}[-]\s?\d{2,4})",    # year range, e.g. 1862-1864
    r"(1\d{3}.{0,9}1\d{0,3})",   # two dates up to nine characters apart, e.g. "1853, Septe-1857"
    r"(1\d{3}[-]c(?:a)?\.)",     # open-ended range such as "1850-c." / "1850-ca."
    r"(1\d{3})",                 # single year
    r"(n\.\sd\.)",               # "n. d." (no date)
]

year_series = None
for pattern in year_patterns:
    found = main_entries.Text.str.extract(pattern)[0]
    # keep what earlier patterns found, fill the gaps with this pattern
    year_series = found if year_series is None else year_series.fillna(found)
year_vals = year_series.values

# Object dtype so the extracted strings can be stored next to the NaN rows
all_text["Year"] = pd.Series(np.nan, index=all_text.index, dtype=object)
all_text.loc[ind_main_entries, "Year"] = year_vals
all_text.head(10)

Unnamed: 0,Name,Text,Collection,Page_No_Within_Drawer,Collection_Head,Is_collection_head,Year
0,A. B. Davis and Company,"A. B. Davis and Company (Philadelphia, Pa. ) See Davis (A. B.) and Company",0,0,A. B. Davis and Company,1,
1,A.H. Roscoe,"AeHe Roscoe (Firm: Nashville, Tenne )e Journal, 1853, Septe-1857, Dece 1 item( 800 ppe )e Wholesale and retail druggist and dealer in paints, oils, and dyestuffse Summary: Journal (account book ) documents the sale of chemical and herbal drugs, paint and painting supplies, dyestuffs, personal and household supplies, and garden seeds to individuals and businessese le Drugstores--Tennesseee 2e Paint shops--Equipment and supplies-—- Tennesseee 3e Dyes and dyeinge 46 Household supplies--Tennesseee 5c Herbs--Therapeutic usee 6+ Seed industry and trade--Tennessees 7e Nashville ite® nne )—--Commercee 20 MAY QO1 23804905 NDHYme",1,1,A.H. Roscoe,1,"1853, Septe-1857"
2,Abbeville District,Abbeville District (8.C.) See South Carolina. Abbeville District,2,2,Abbeville District,1,
3,"Abbott, William B","Abbott, William B. Papers, 1862-1864 Frederick Co., Va. Section A 5-16-57 GUIDE 10 items",3,3,"Abbott, William B",1,1862-1864
4,"Abbott, William B","Abbott, William B. Papers, 1862-1864, Fre- Gerick Co., Va. 10 items. Sketch These are the papers of William B. Abbott, evidently a well-to-do farmer of Frederick Co., Va. There are several documents concerned with the evaluation of damage done to his property by C. S. A. troops in 1862, and.receipts in 1864 for hay bought from Abbott at various times in Aug., 1864 by the C. S. A. Army.",3,4,,0,
5,Abbott & Company,"Abbott & Company Papers, 1856-1871 Philadelphia, Pennsylvania a Section A 66 items OCT 10 ""49",4,5,Abbott & Company,1,1856-1871
6,Abbott & Company,"Abderhalden, Emil Papers, 1919 Halle, Germany Josiah C. Trent Collection in the History of Medicine--, Div, 4-8-60",5,6,Abbott & Company,1,1919
7,Abbott & Company,"Abbott & Company. Papers 1856-1871 Philadelphia, Pennsylvania, € Miscellaneous letters concerning scales sold by Abbott & Company.",6,7,Abbott & Company,1,1856-1871
8,"Abel, Ernest L.","Abel, Ernest L. Papers, 1922-1952 West Palm Beach, Palm Beach Co., Fla. Saat; 550 items & 8 vols. Box 1 of Labor Archives",7,8,"Abel, Ernest L.",1,1922-1952
9,"Abel, Ernest L.","Abel, Ernest L. Papers. West Palm Beach, Palm Beach Co., Fla. _’ : Ernest L. Abel was involved in postal union activity at least from the mid-1920s to the early 1950s. He served from 1925-1926 as Secretary-Treasurer for Local Union No. 749 of the National Federation of Post Office Clerks (AFL) and was State Representative for the National Federation at the same time. In 1927, Abel became the first president of the Florida Federation of Post Office Clerks, His union service included being President Union No. 749 in 1945, and State Legislative Representative of the’ ® Florida",7,9,,0,


In [15]:
# Get locations
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
loc_vals = []

# nlp.pipe processes the texts as a stream and yields one Doc per text, in
# the same order as main_entries, which is faster than calling nlp() per card.
for doc in nlp.pipe(str(text) for text in main_entries.Text):
    # Keep geopolitical (GPE) and other location (LOC) entities, dropping
    # repeats while preserving the order of first appearance.
    places = dict.fromkeys(
        ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")
    )
    loc_vals.append(", ".join(places))

# Object dtype so the location strings can be stored next to the NaN rows
all_text["Loc"] = pd.Series(np.nan, index=all_text.index, dtype=object)
all_text.loc[ind_main_entries, "Loc"] = loc_vals
all_text.head()

Unnamed: 0,Name,Text,Collection,Page_No_Within_Drawer,Collection_Head,Is_collection_head,Year,Loc
0,A. B. Davis and Company,"A. B. Davis and Company (Philadelphia, Pa. ) See Davis (A. B.) and Company",0,0,A. B. Davis and Company,1,,Pa.
1,A.H. Roscoe,"AeHe Roscoe (Firm: Nashville, Tenne )e Journal, 1853, Septe-1857, Dece 1 item( 800 ppe )e Wholesale and retail druggist and dealer in paints, oils, and dyestuffse Summary: Journal (account book ) documents the sale of chemical and herbal drugs, paint and painting supplies, dyestuffs, personal and household supplies, and garden seeds to individuals and businessese le Drugstores--Tennesseee 2e Paint shops--Equipment and supplies-—- Tennesseee 3e Dyes and dyeinge 46 Household supplies--Tennesseee 5c Herbs--Therapeutic usee 6+ Seed industry and trade--Tennessees 7e Nashville ite® nne )—--Commercee 20 MAY QO1 23804905 NDHYme",1,1,A.H. Roscoe,1,"1853, Septe-1857","Nashville, Tenne"
2,Abbeville District,Abbeville District (8.C.) See South Carolina. Abbeville District,2,2,Abbeville District,1,,"Abbeville District, South Carolina"
3,"Abbott, William B","Abbott, William B. Papers, 1862-1864 Frederick Co., Va. Section A 5-16-57 GUIDE 10 items",3,3,"Abbott, William B",1,1862-1864,"Abbott, Va."
4,"Abbott, William B","Abbott, William B. Papers, 1862-1864, Fre- Gerick Co., Va. 10 items. Sketch These are the papers of William B. Abbott, evidently a well-to-do farmer of Frederick Co., Va. There are several documents concerned with the evaluation of damage done to his property by C. S. A. troops in 1862, and.receipts in 1864 for hay bought from Abbott at various times in Aug., 1864 by the C. S. A. Army.",3,4,,0,,


In [16]:
# Link identifiers, size of each drawer, and drawer numbers.
# The three lists are parallel: entry i of each list describes the same drawer.

# Identifiers that a later cell appends to "https://archive.org/details/" to
# build the base URL of each drawer.
identifier_list = ['rubensteinmanuscriptcatalog_A_to_Amer', 'rubensteinmanuscriptcatalog_Ames_to_Armh', 'rubensteinmanuscriptcatalog_Armi_to_Bal', 'rubensteinmanuscriptcatalog_Bam_to_Bedh', 'rubensteinmanuscriptcatalog_Bedi_to_Bem', 'rubensteinmanuscriptcatalog_Ben_to_Blac', 'rubensteinmanuscriptcatalog_Blad_to_Q._Boyd', 'rubensteinmanuscriptcatalog_R._Boyd_to_I._Brown', 'rubensteinmanuscriptcatalog_J._Brown_to_L._Butler', 'rubensteinmanuscriptcatalog_M._Butler_to_Caq', 'rubensteinmanuscriptcatalog_Car_to_Chaq', 'rubensteinmanuscriptcatalog_Char_to_Clav', 'rubensteinmanuscriptcatalog_Claw_to_Com', 'rubensteinmanuscriptcatalog_Con_to_Conk', 'rubensteinmanuscriptcatalog_Conl_to_Crd', 'rubensteinmanuscriptcatalog_Cre_to_I._Davis', 'rubensteinmanuscriptcatalog_J._Davis_to_Dek', 'rubensteinmanuscriptcatalog_Del_to_Dov', 'rubensteinmanuscriptcatalog_Dow_to_Durg', 'rubensteinmanuscriptcatalog_Durh_to_Els', 'rubensteinmanuscriptcatalog_Elt_to_Fie', 'rubensteinmanuscriptcatalog_Fif_to_Frn', 'rubensteinmanuscriptcatalog_Fro_to_Geq', 'rubensteinmanuscriptcatalog_Ger_to_Gorl', 'rubensteinmanuscriptcatalog_Gorm_to_Grd', 'rubensteinmanuscriptcatalog_Gre_to_Grh', 'rubensteinmanuscriptcatalog_Gri_to_Hand', 'rubensteinmanuscriptcatalog_Hane_to_Harv', 'rubensteinmanuscriptcatalog_Harw_to_Hem', 'rubensteinmanuscriptcatalog_Hen_to_Holc', 'rubensteinmanuscriptcatalog_Hold_to_Huba', 'rubensteinmanuscriptcatalog_Hubb_to_I._Jackson', 'rubensteinmanuscriptcatalog_J._Jackson_to_I._Johnson', 'rubensteinmanuscriptcatalog_J._Johnson_to_Jz', 'rubensteinmanuscriptcatalog_K_to_Kira', 'rubensteinmanuscriptcatalog_Kirb_to_Lano', 'rubensteinmanuscriptcatalog_Lanp_to_Ler', 'rubensteinmanuscriptcatalog_Les_to_Lowq', 'rubensteinmanuscriptcatalog_Lowr_to_Mack', 'rubensteinmanuscriptcatalog_Macl_to_Manh', 'rubensteinmanuscriptcatalog_Mani_to_Maw', 'rubensteinmanuscriptcatalog_Max_to_Metg', 'rubensteinmanuscriptcatalog_Meth_to_Mh', 'rubensteinmanuscriptcatalog_Mi_to_Mord', 
'rubensteinmanuscriptcatalog_More_to_Mur', 'rubensteinmanuscriptcatalog_Mus_to_Nn', 'rubensteinmanuscriptcatalog_No_to_Oz', 'rubensteinmanuscriptcatalog_P_to_Peo', 'rubensteinmanuscriptcatalog_Pep_to_Pn', 'rubensteinmanuscriptcatalog_Po_to_Puk', 'rubensteinmanuscriptcatalog_Pul_to_Rh', 'rubensteinmanuscriptcatalog_Ri_to_Rooj', 'rubensteinmanuscriptcatalog_Rook_to_Sam', 'rubensteinmanuscriptcatalog_San_to_Sem', 'rubensteinmanuscriptcatalog_Sen_to_Simo', 'rubensteinmanuscriptcatalog_Simp_to_Wh._Smith', 'rubensteinmanuscriptcatalog_Wi._Smith_to_So', 'rubensteinmanuscriptcatalog_Sp_to_Std', 'rubensteinmanuscriptcatalog_Ste_to_Sv', 'rubensteinmanuscriptcatalog_Sw_to_I._Thomas', 'rubensteinmanuscriptcatalog_J._Thomas_to_Tom', 'rubensteinmanuscriptcatalog_Ton_to_Tz', 'rubensteinmanuscriptcatalog_U', 'rubensteinmanuscriptcatalog_V_to_Ward', 'rubensteinmanuscriptcatalog_Ware_to_H._White', 'rubensteinmanuscriptcatalog_I._White_to_R._Williams', 'rubensteinmanuscriptcatalog_S._Williams_to_Wood', 'rubensteinmanuscriptcatalog_T._Wood_to_Wz', 'rubensteinmanuscriptcatalog_X_to_Z']
# Number of page links generated for each drawer (pages n0 .. n<size-1>);
# presumably the number of scanned cards in that drawer - TODO confirm.
real_size = [838, 680, 935, 720, 639, 787, 765, 776, 730, 681, 707, 777, 798, 645, 790, 741, 654, 760, 581, 658, 772, 766, 757, 782, 660, 704, 761, 780, 736, 737, 750, 923, 730, 706, 668, 691, 720, 753, 844, 686, 713, 339, 604, 787, 751, 764, 796, 858, 762, 877, 839, 833, 778, 823, 653, 662, 692, 702, 873, 733, 713, 689, 659, 774, 815, 734, 727, 543, 284]
# Drawer number (as a string) repeated for every page of the drawer; the
# numbering is not contiguous (e.g. there is no '168').
drawer_no = ['157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '193', '194', '195', '196', '197', '198', '199', '200', '201', '202', '203', '205', '206', '207', '208',
 '209', '210', '211', '212', '213', '214', '215', '217', '218', '219', '220', '221', '222', '223', '224', '225', '226', '227', '229', '230', '231']

In [17]:
# Build one archive.org page link and one drawer number per card
link_identifier_list = ["https://archive.org/details/" + ident for ident in identifier_list]

# For drawer k, emit <base url>/page/n0 ... <base url>/page/n<real_size[k]-1>
link_list = [
    base_url + "/page/n{}".format(page)
    for drawer_idx, base_url in enumerate(link_identifier_list)
    for page in range(real_size[drawer_idx])
]

# Repeat each drawer number once per page of that drawer
drawer_no_list = [
    drawer_no[drawer_idx]
    for drawer_idx, page_count in enumerate(real_size)
    for _ in range(page_count)
]

all_text["Link"] = link_list
all_text["Drawer_No"] = drawer_no_list

In [18]:
# Index by collection header, shorten two column names, and fix the column order.
# Done as one non-mutating chain so that a failure in any step (e.g. a missing
# column) leaves the previous all_text untouched instead of half-transformed.
all_text = (
    all_text
    .set_index("Collection_Head")
    .rename(columns={"Page_No_Within_Drawer": "Page_drawer", "Is_collection_head": "Coll_head"})
    [["Name", "Text", "Year", "Loc", "Page_drawer", "Drawer_No", "Link", "Coll_head", "Collection"]]
)

In [19]:
# Preview the first five rows of the final layout
all_text.head(n=5)

Unnamed: 0_level_0,Name,Text,Year,Loc,Page_drawer,Drawer_No,Link,Coll_head,Collection
Collection_Head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A. B. Davis and Company,A. B. Davis and Company,"A. B. Davis and Company (Philadelphia, Pa. ) See Davis (A. B.) and Company",,Pa.,0,157,https://archive.org/details/rubensteinmanuscriptcatalog_A_to_Amer/page/n0,1,0
A.H. Roscoe,A.H. Roscoe,"AeHe Roscoe (Firm: Nashville, Tenne )e Journal, 1853, Septe-1857, Dece 1 item( 800 ppe )e Wholesale and retail druggist and dealer in paints, oils, and dyestuffse Summary: Journal (account book ) documents the sale of chemical and herbal drugs, paint and painting supplies, dyestuffs, personal and household supplies, and garden seeds to individuals and businessese le Drugstores--Tennesseee 2e Paint shops--Equipment and supplies-—- Tennesseee 3e Dyes and dyeinge 46 Household supplies--Tennesseee 5c Herbs--Therapeutic usee 6+ Seed industry and trade--Tennessees 7e Nashville ite® nne )—--Commercee 20 MAY QO1 23804905 NDHYme","1853, Septe-1857","Nashville, Tenne",1,157,https://archive.org/details/rubensteinmanuscriptcatalog_A_to_Amer/page/n1,1,1
Abbeville District,Abbeville District,Abbeville District (8.C.) See South Carolina. Abbeville District,,"Abbeville District, South Carolina",2,157,https://archive.org/details/rubensteinmanuscriptcatalog_A_to_Amer/page/n2,1,2
"Abbott, William B","Abbott, William B","Abbott, William B. Papers, 1862-1864 Frederick Co., Va. Section A 5-16-57 GUIDE 10 items",1862-1864,"Abbott, Va.",3,157,https://archive.org/details/rubensteinmanuscriptcatalog_A_to_Amer/page/n3,1,3
,"Abbott, William B","Abbott, William B. Papers, 1862-1864, Fre- Gerick Co., Va. 10 items. Sketch These are the papers of William B. Abbott, evidently a well-to-do farmer of Frederick Co., Va. There are several documents concerned with the evaluation of damage done to his property by C. S. A. troops in 1862, and.receipts in 1864 for hay bought from Abbott at various times in Aug., 1864 by the C. S. A. Army.",,,4,157,https://archive.org/details/rubensteinmanuscriptcatalog_A_to_Amer/page/n4,0,3


In [10]:
# Write the final table; the Collection_Head index is written as the first column
all_text.to_csv(path_or_buf="all_sorted_collection.csv")