In [1]:
import pandas as pd
# Show full card text and every requested row when displaying DataFrames.
# Option names are spelled out in full: pandas resolves abbreviated names by
# pattern matching, which breaks if a later release adds a similarly named option.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
import numpy as np
import re
import nltk

In [56]:
# Function that calculates "points" for a card to be a "main entry" vs. "narrative," returns True if it's a "main entry"
def is_main_entry(index, row, frame=None):
    """Classify a catalog card as a "main entry" (True) or a "narrative" card (False).

    The card is compared with its neighbours and then scored with a set of
    heuristics; it is a main entry when its main-entry score is at least as
    large as its narrative score.

    Parameters
    ----------
    index : int
        Position of the card in ``frame``. It is used with ``.iloc`` to look up
        the previous and next cards, so it must be positional (it equals the
        label only for a default 0..n-1 index).
    row : mapping / pandas.Series
        The card itself; ``row['Name']`` and ``row['Text']`` must be strings
        (the caller is expected to filter out missing values first).
    frame : pandas.DataFrame, optional
        Table holding all cards, with 'Name' and 'Text' columns. Defaults to
        the notebook-level ``df``.

    Returns
    -------
    bool
        True when the card is classified as a main entry, False for narrative.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level DataFrame

    name, text = row['Name'], row['Text']

    # Check for new author -> automatic "main entry"
    if index == 0:
        return True
    prev_row = frame.iloc[index - 1]
    if name != prev_row['Name']:
        return True

    # Check if previous and subsequent card authors match -> misspelled name -> main entry
    # The last card has no successor, so the comparison is skipped there
    # instead of indexing past the end of the frame.
    # NOTE(review): this point is only reached when the current name equals the
    # previous card's name, so the test below is true for every interior card of
    # a run of three or more same-author cards -- confirm that this matches the
    # "misspelled name" intent described above.
    if index + 1 < len(frame) and prev_row['Name'] == frame.iloc[index + 1]['Name']:
        return True

    # The previous card's text may be missing (NaN); in that case the checks
    # that inspect it are skipped rather than raising a TypeError.
    prev_text = prev_row['Text'] if isinstance(prev_row['Text'], str) else None
    upper_text = text.upper()

    # Calculate "points" for classification
    main_entry_pts, narrative_pts = 0, 0

    # Check length, longer -> narrative, shorter -> main entry
    if len(text) > 300:
        narrative_pts += 1
    elif len(text) < 200:
        main_entry_pts += 1

    # Check for misc. main entry identifiers ("GUIDE", "Section X", "X items") -> main entry
    if "Section" in text:
        main_entry_pts += 1
    if "items" in text:
        main_entry_pts += 1
    # "GUID" also covers "GUIDE"; "GUDE" catches a common misreading of the word
    if "GUID" in upper_text or "GUDE" in upper_text:
        main_entry_pts += 2

    # Check for misc. narrative identifiers
    if "FOR INFORMATION ABOUT THIS COLLECTION" in text:
        narrative_pts += 5
    if "Addition" in text or "added" in text:
        narrative_pts += 1
    if prev_text is not None:
        # Matches beginning of previous card
        if text[:50] == prev_text[:50]:
            narrative_pts += 1
        # Previous card indicates continuation of narrative onto current card
        if "(Continued on next card)" in prev_text or "SEE NEXT CRD" in prev_text:
            narrative_pts += 5

    # Check for capitalized last name & card number at beginning of text (Card 2, JONES, Smith, John 3, etc.) -> narrative
    if re.search(r"Card [0-9]+", text):
        narrative_pts += 2
    elif name.split(",")[0].upper() in text:
        narrative_pts += 1
    # The name is escaped so that characters such as "." or "(" are matched
    # literally instead of being interpreted as regex syntax.
    elif re.search(re.escape(name) + r"\d+", text):
        narrative_pts += 1

    # Tokenize card
    word_tokens = nltk.word_tokenize(text)
    tagged_text = nltk.pos_tag(word_tokens)

    # Check for "sentence format" (pos only present in narrative [verb, adj.]) -> narrative
    narr_pos = ["VB", "JJ", "PRP", "RB", "IN", "DT"]
    for _, tag in tagged_text:
        if tag in narr_pos:
            narrative_pts += .3

    # Check for accessioned date (e.g., 13 MAR 97, 10-10-69) -> main entry
    if re.search(r"\d{1,2}-\d{1,2}-\d{2}", text):
        main_entry_pts += 1
    elif re.search(r"\d{1,2}\s[A-Z]{3}\s\d{2}", text):
        main_entry_pts += 2
    elif re.search(r"[A-Z]{3}\s\d{1,2}\s\d{4}", text):
        main_entry_pts += 2

    # Determine card classification (ties go to "main entry")
    return main_entry_pts >= narrative_pts

In [53]:
# Load the card records from the drawer_158 CSV; the cells below read its
# 'Name' and 'Text' columns and add a 'Collection' column to this frame.
df = pd.read_csv("catalog/file_name_checked/drawer_158.csv")

In [57]:
# Check every card and classify as "main entry" or "narrative", number by collection.
# Cards with a missing name or text get -1; every other card gets the running
# number of the most recent main entry.
collection_num, collection_list = -1, []
# Enumerate positionally: is_main_entry looks up neighbouring cards with .iloc,
# so it needs the row's position rather than its index label.
for position, (_, row) in enumerate(df.iterrows()):
    # Check for missing values (pd.isna also covers None / pd.NA, not only NaN)
    if pd.isna(row['Text']) or pd.isna(row['Name']):
        collection_list.append(-1)
        continue

    # Main entries signify a new collection
    if is_main_entry(position, row):
        collection_num += 1
    collection_list.append(collection_num)

# Add column to dataframe with collections numbered
df['Collection'] = collection_list

In [58]:
# Display the first 100 cards to spot-check the assigned 'Collection' numbers.
df.head(100)

Unnamed: 0,Name,Text,Collection
0,"Ames, Electa E.","Ames, Electa E. (Ray), and Fordyce wW. Papers, 1849-1931 DeRuyter, Madison Co., N.Y. 246 items 12-C 5-10-71",0
1,"Ames, Electa E.","Ames, Electa E. (Ray), and Fordyce W. Papers. DeRuyter, Madison Co., N.Y. Fordyce W. Ames, a farmer and one-time school director, was born in Plymouth, New York, in 1820. He and his wife, Electa E. (Ray) Ames (1822-1878), the daughter of Robert and Betsy Ray, were married in 1843. They made their home in DeRuyter, New York, and had eight child- ren. The children are Hartwell, Fred, Henry, Frank N., Willie, John F., and probably Warren W. and Whitford. The majority of the letters in this collec-",0
2,"Ames, Electa E.","Ames, Electa E. (Ray), and Fordyce W. rene tion were written by Electa Ames” son, Frank N. Ames, and by her sister, Jane C. (Ray) Warren. Jane and her husband Jared W. Warren were both teachers in Rutherford Co., Tennessee. In a letter of August 27, 1863, Mrs. Warren describes the treatment of slaves before and after Union occupation, a battle which took place on and near her property, and conditions in Tennessee during the Civil War. The letters of Jane and Jared frequently discuss schools and teaching in Tennessee. There are a couple of letters from Electa Ames' brother, J.M. Ray,",0
3,"Ames, Electa E.","Ames, Electa E. (Ray), and Fordyce W. 3 who was a Union soldier during the Civil War.",1
4,"Ames, Fisher","Ames, Fisher Papers, 1790-1801 Dedham, Norfolk Co., Mass. XII-D 1 item 1 item added, 8-25-72 1-28-57",2
5,"Ames, Fisher","Ames, Fisher. Papers, 1790. Dedham, Norfolk Co., Mass. 1 item, Sketch Fisher Ames (1758-1808), lawyer, Federal- ist leader, and Congressman, wrote on June 5, 1790, to U. S. Judge John Lowell of Mass. abw payments to veterans of the N. C. Line in the Revolution. The letter concerns legislation before the House of Representatives to prevent frauds in assignments of back pay. Senate proposals are noted, as also the roles of Vice President John Adams and President Washington. The character of John Jay is reviewed, This session of Congress is described as the ""most",2
6,"Ames, Fisher","Ames, Fisher 2 embarrassing that can be conceived."" A pic- ture of Ames and a biographical clipping are included. 1 item added, 8-25-72: A letter from Ames to Benjamin Bourne concerning an unidentified appli- cant for an editorial position with a Federalist newspaper (February 2, 1801).",2
7,"Ames, James B.","Ames, James, Bey fle 18394. Letter, 1834. 1 iteme Commission merchante Letter (1834 Dece 22) with a completed printed form to Te De Bowen of Rhode Island, relating to the export of cctton to the North and to Europee Includes typescripte 1e Cotton trade--Alabama--Mobitlee 19 MAY 88 17975721 NDHYme",3
8,"Ames, James Tyler","Ames, James Tyler Papers, 1865 Chicopee, Hampden Co., Massachusetts Section A 2 items 2-1-62",4
9,"Ames, James Tyler","Ames, James Tyler. Yapers, 1865. Chicopee, Hampden Co., Massachusetts James Tyler Ames (1810-1883), mechanic and manufacturer, had one of the largest munitions factories in the North during the Civil War. On Nov. 2 and 18, 1865, W. M. Mitchell writes from Milledgeville and Dougherty County, Georgia concerning investment in cotton plantation land which he wished Ames to make.",4
