In [16]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_row", None)
import numpy as np
import re
import nltk

In [17]:
# Function that calculates "points" for a card to be a "main entry" vs. "narrative," returns True if it's a "main entry"
def is_main_entry(index, row):
    # Check for new author -> automatic "main entry"
    if index == 0 or row['Name'] != df.iloc[index-1]['Name']:
        return True
    
    # Check if previous and subsequent card authors match -> mispelled name -> main entry
    if df.iloc[index-1]['Name'] == df.iloc[index+1]['Name']:
        return True
    
    # Calculate "points" for classification
    main_entry_pts, narrative_pts = 0, 0
    
    # Check length, longer -> narrative, shorter -> main entry
    if len(row['Text']) > 300:
        narrative_pts += 1
    elif len(row['Text']) < 200:
        main_entry_pts += 1
        
    # Check for misc. main entry identifiers ("GUIDE", "Section X", "X items") -> main entry
    if "Section" in row['Text']:
        main_entry_pts += 1
    if "items" in row['Text']:
        main_entry_pts += 1
    if "GUIDE" in row['Text'].upper() or "GUID" in row['Text'].upper() or "GUDE" in row['Text'].upper():
        main_entry_pts += 2
        
    # Check for misc. narrative identifiers
    if "FOR INFORMATION ABOUT THIS COLLECTION" in row['Text']:
        narrative_pts += 5
    if "Addition" in row['Text'] or "added" in row['Text']:
        narrative_pts += 1
    # Matches begining of previous card
    if row['Text'][:50] == df.iloc[index-1]['Text'][:50]:
        narrative_pts += 1
    # Previous card indicates continuation of narrative onto current card
    if "(Continued on next card)" in df.iloc[index-1]['Text'] or "SEE NEXT CRD" in df.iloc[index-1]['Text']:
        narrative_pts += 5
        
    # Check for capitalized last name & card number at beginning of text (Card 2, JONES, Smith, John 3, etc.) -> narrative
    if re.search(r"Card [0-9]+", row['Text']):
        narrative_pts += 2
    elif (row['Name'].split(",")[0]).upper() in row['Text']:
        narrative_pts += 1
    elif re.search(r"" + row['Name'] + "\d+", row['Text']):
        narrative_pts += 1
    
    # Tokenize card
    word_tokens = nltk.word_tokenize(row['Text'])
    tagged_text = nltk.pos_tag(word_tokens)
    
    # Check for "sentence format" (pos only present in narrative [verb, adj.]) -> narrative
    narr_pos = ["VB", "JJ", "PRP", "RB", "IN", "DT"]
    for word in tagged_text:
        if word[1] in narr_pos:
            narrative_pts += .3
    
    # Check for accessioned date (e.g., 13 MAR 97, 10-10-69) -> main entry
    if re.search(r"\d{1,2}-\d{1,2}-\d{2}", row['Text']):
        main_entry_pts += 1
    elif re.search(r"\d{1,2}\s[A-Z]{3}\s\d{2}", row['Text']):
        main_entry_pts += 2
    elif re.search(r"[A-Z]{3}\s\d{1,2}\s\d{4}", row['Text']):
        main_entry_pts += 2
        
#     print("{}: {} {}".format(index, main_entry_pts, narrative_pts))
        
    # Determine card classification
    if main_entry_pts >= narrative_pts:
        return True
    return False

In [18]:
df = pd.read_csv("catalog/file_name_checked/drawer_230.csv")

In [19]:
# Check every card and classify as "main entry" or "narrative", number by collection
collection_num, collection_list = -1, []
for index, row in df.iterrows():
    # Check for NaN values
    if pd.isna(row['Text']) or pd.isna(row['Name']) or row['Text'] == "" or row['Name'] == "":
        collection_list.append(-1)
    else:
        # Main entries signify a new collection
        if is_main_entry(index, row):
            collection_num += 1
        collection_list.append(collection_num)

# Add column to dataframe with collections numbered
df['Collection'] = collection_list

In [20]:
# Label by collection headers as index, with original index
df["Page_No_Within_Drawer"] = df.index
curr = -100
collection_header = []
for index,row in df.iterrows():
    if(row["Collection"]!= curr):
        collection_header.append(row["Name"])
    else:
        collection_header.append("")
    curr = row["Collection"]
df["Collection_Head"] = collection_header
df.set_index("Collection_Head",inplace=True)

In [21]:
df.head(100)

Unnamed: 0_level_0,Name,Text,Collection,Page_No_Within_Drawer
Collection_Head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Wood, Thomas F.","Wood, Thomas F.","Wood, Thomas F., Inc, Papers, 1913-1923 Wilmington, New Hanover County, N. C. 6997 1 vol. 41-70",0,0
,"Wood, Thomas F.","Wood, Thomas F., Inc, Papers. Wilmington, New Hanover County, N. C. The business of Thomas F. Wood, Inc., whole- sale and retail ship chandlers and provisioners, is represented by its Ledger, 1913-1923. The earlier accounts probably belonged to the partnership of Maffitt & Wood which was listed in the city directory of 1913-1914. At that time the ledger showed capital stock accounts for Clarence D, Maffitt and Thomas F. Wood. The city directory of 1922 has’ a listing for Thomas F. Wood, Inc.",0,1
"Wood, Thomas Fanning","Wood, Thomas Fanning","Wood, Thomas Fanning Papers, 1885 Wilmington, New Hanover Co., N. C. Section A 1 item 8-28-70",1,2
"Wood, Thomas Fanning","Wood, Thomas Fanning","Wood, Thomas Fanning FOR INFORMATION ABOUT THIS COLLECTION OR ADDITIONS TO THIS COLLECTION, PLEASE ASK A STAFF MEMBER TO CONSULT THE ACCESSION RECORDS.",2,3
"Wood, Thomas Fanning","Wood, Thomas Fanning","Wood, Thomas Fanning. Papers, 1885 Wilmington, N. C. Thomas Fanning Wood was an eminent physician of Wilmington. In 1877, chiefly through his efforts, the General Assembly established the State Board of Health. In 1879 the Board elect- ed Dr. Wood its first secretary, and he served in that capacity until he died in 1892. He also edited the North Carolina Medical Journal. A letter of May 20, 1885, from Alvan Went- worth Chapman to Dr. Wood concerns the Reverend Moses Ashley Curtis, botanist and minister, whose botanical studies led him to specialize in fungi and become a noted authority on that",3,4
,"Wood, Thomas Fanning","Wood, Thomas Fanning 2 subject. Chapman was responding to an inquiry by Wood about materials for his writing a sketch of Curtis. Chapman says he corresponded with Curtis for many years and comments on his per- sonality as reflected in his letters. He says that he has destroyed all the letters he re- ceived from Curtis and tells why. He goes on to list several of Curtis's printed works on the botany of North Carolina and to say that Henry William Ravenel was probably more intimate with Curtis than any other botanist.",3,5
"Wood, William","Wood, William","Wood, William Daybook (General Store), 1819-1831 Winchester,Frederick Co., Va. 244 po. Calf 9-20-51 F- 6718 cd",4,6
,,,-1,7
,,,-1,8
,,,-1,9
