In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_row", None)
import numpy as np
import re
import nltk

In [31]:
def _text_or_empty(value):
    """Return ``value`` unchanged when it is a string, otherwise an empty string.

    Blank CSV cells are read by pandas as NaN (a float); treating them as empty
    text keeps the string checks below from raising.
    """
    return value if isinstance(value, str) else ""


def is_main_entry(index, row, frame=None):
    """Classify a catalog card as a "main entry" (True) or a "narrative" card (False).

    The card collects "points" for each category from a set of heuristics
    (length, keywords, continuation markers, part-of-speech tags, accession
    dates); it is a main entry when its main-entry points are at least equal
    to its narrative points.

    Parameters
    ----------
    index : int
        Position of the card within ``frame``. It is used as ``frame.iloc[index - 1]``
        to reach the previous card, so it must be positional (this holds for the
        default RangeIndex produced by ``pd.read_csv``).
    row : pandas.Series or mapping
        The card itself; the 'Name' and 'Text' fields are read.
    frame : pandas.DataFrame, optional
        Table the card belongs to. When omitted, the module-level ``df`` is used,
        which keeps existing two-argument calls working.

    Returns
    -------
    bool
        True for a main entry, False for a narrative card.
    """
    if frame is None:
        frame = df

    # The first card, or a card by a new author, is automatically a "main entry"
    if index == 0:
        return True
    prev_row = frame.iloc[index - 1]
    if row['Name'] != prev_row['Name']:
        return True

    text = _text_or_empty(row['Text'])
    prev_text = _text_or_empty(prev_row['Text'])
    name = _text_or_empty(row['Name'])
    last_name = name.split(",")[0].strip().upper()

    # Calculate "points" for classification
    main_entry_pts, narrative_pts = 0, 0

    # Check length, longer -> narrative, shorter -> main entry
    if len(text) > 300:
        narrative_pts += 1
    elif len(text) < 200:
        main_entry_pts += 1

    # Check for misc. main entry identifiers ("GUIDE", "Section X", "X items") -> main entry
    for marker in ("Section", "items", "GUIDE"):
        if marker in text:
            main_entry_pts += 1

    # Check for misc. narrative identifiers
    if "ACCESSION RECORDS" in text:
        narrative_pts += 1
    # Matches beginning of previous card
    if text[:50] == prev_text[:50]:
        narrative_pts += 1
    # The previous card announces a continuation -> strong narrative signal
    if "(Continued on next card)" in prev_text or "SEE NEXT CRD" in prev_text:
        narrative_pts += 5

    # Check for capitalized last name & card number in the text (Card 2, JONES, Smith, John3, etc.) -> narrative
    if re.search(r"Card [0-9]+", text):
        narrative_pts += 1
    elif last_name and last_name in text:
        # Guard against an empty last name: "" is a substring of every string
        narrative_pts += 1
    elif name and re.search(re.escape(name) + r"\d+", text):
        # The name is escaped so characters such as "." or "(" are matched literally
        # instead of being interpreted as regex syntax (which could also raise re.error)
        narrative_pts += 1

    # Tokenize card and tag parts of speech
    tagged_text = nltk.pos_tag(nltk.word_tokenize(text))

    # Check for "sentence format": every token carrying one of these tags adds half a point -> narrative
    narr_pos = ("VB", "JJ", "PRP", "RB", "IN", "DT")
    narrative_pts += 0.5 * sum(1 for _, tag in tagged_text if tag in narr_pos)

    # Check for accessioned date (e.g., 13 MAR 97, 10-10-69) -> main entry
    if re.search(r"\d{1,2}-\d{1,2}-\d{2}", text):
        main_entry_pts += 1
    elif re.search(r"\d{1,2}\s[A-Z]{3}\s\d{2}", text):
        main_entry_pts += 2
    elif re.search(r"[A-Z]{3}\s\d{1,2}\s\d{4}", text):
        main_entry_pts += 2

    # Determine card classification (ties count as main entry)
    return main_entry_pts >= narrative_pts

In [25]:
# Load one drawer of catalog cards into the module-level `df`.
# `is_main_entry` and the collection-numbering loop both read this global,
# so this cell must have been run before either of them is executed.
df = pd.read_csv("catalog/file_name_checked/drawer_201.csv")

In [32]:
# Walk the cards in order and give each one a collection number: the running
# counter advances whenever a card is classified as a "main entry" (which marks
# the start of a new collection); "narrative" cards inherit the current number.
collection_list = []
collection_num = -1
for index, row in df.iterrows():
    # True counts as 1, False as 0
    collection_num += int(is_main_entry(index, row))
    collection_list.append(collection_num)

# Store the per-card collection numbers as a new dataframe column
df['Collection'] = collection_list

In [33]:
# Display up to the first 100 rows of `df` to inspect the assigned 'Collection' numbers.
df.head(100)

Unnamed: 0,Name,Text,Collection
0,"Maxcy, Jonathan ,","Maxcy, Jonathan, 1768-18206 Letter, 1800. 1 iteme President of Rhode Island Collegee Collection consists of a letter from Maxcy to Jonathan Edwards, Jre, president of Union Collegee nae from Guidee cs le Maxcy, Jonathan, 1768-18204 2e College presidents~-Correspondencee 21 JUL 98 39522003 NDHYme",0
1,"Maxcy, Vere","Maxcy, Vere ts Papers, 1834-1838. 6 itemse Charleston, SeCe residente Collection contains Maxcy's financial papers, including an account of: the public auction of a slave family in Charleston, SeCe in 1837-6 Cataloged from Guidee *les le Manan, Veet: 2e Slave-trade-—- South Carolina~-Charlestone Je Slaves --Prices--South Carolinae 4e Slavery-—- United Statese 21 JUL 98 39522055 NDHYme",1
2,"Maxcy, Virgil .","Maxcy, Virgil. Papers, 1834-1838. Charleston, s. ¢, 6 items. Sketch. Bills, receipts, and a, check pertaining to Virgil Maxcy. There is aah icount of the public auction of a Negro family in Charleston, S. C., in 1837. (These items apparently relate not to Virgil Maxcy of Maryland, who lived in Washington, D. C., from 1830 to 1837 and in Belgium from 1837 to 1842, but to his nephew Virgil Maxcy, son of President Jonathan Maxcy of South Carolina College.)",2
3,"Maxey, S. B.","Maxey, Se Be (Samuel Bell), 1825-18956 Letter, 18786 1 iteme UeSe Senator from Texase Coltection consists of a routine administrative letter of.Maxeye oe from Guidee *lcs le Maxey, Se Be (Samuel Bell), 1825- 1895-e¢ 2e Legislators~-United States-- Correspondencee 3e Texas--Politics and government——1865-1950- 21 JUL 98 39522014 NDHYme",3
4,"Maxey, Samuel Bell","Maxey, Samuel Bell. Papers, 1878. Paris, Texas. litem, Sketch. A letter of Oct. 30, 1978 from U. 9. Senator Samuel Bell Maxey (1825-1895) of Texas to S. D. Merchant, Superintendent of the U. S. Senate Folding Room, thanking him for sending documents and arranging to have more sent up to the open- ing of the next session of Congress.",4
5,"Maxwell, Sarah P.","Maxwell, Sarah Pe Papers, 1779-1801- 2 itemse Savannah, Gae residente Collection consists of two letters concerning the seizure and return of thirty slaves by British troops, and giving recipes for home remediese [ore eues from Guidee cs le Slave-trade--United Statese 2e Medicine, Popular--Early works to 1800e¢ 3e Medicine——-Formulae, receipts, prescriptionse 10 DEC 97 38065800 NDHYme",5
6,"Maxwell, Sir William","Maxwell, Sir William (d. 1947) Papers, 1915-1939 Aberdeen, Aberdeenshire, XVILI-E 29 items",6
7,"Maxwell, Sir William","e 6th 10:°B Maxwell, William, Sire Papers, 1915-1939. 29 itemse Newpaper executive and member of the Unionist Partye Collection contains letters: to . Maxwell concerning his activities in ‘Unionist and Conservative party politicse Cataloged from Guidee *lcs le Maxwell, William, Sire 2. ‘Unionist Party (Great Britain) Je Conservative Party (Great Britain) 4e Great Britain--Politics and governszent -—-20th centurye 21 JUL 98 39522084 NDHYme",6
8,"Maxwell, Sir William","Maxwell, Sir William (d. 1947). Papers, 1915- 1939. Aberdeen, Aberdeenshire, Scotland Sir William Maxwell (d. 1947) was a newspaper executive in Aberdeen, Scotland, and an active member of the Unionist Party. His early career included work on the literary staffs of the London Standard, Pall Mall Gazette, St. Jame's Gazette, and the Scotsman. He was editor of the Aberdeen Journal and the Aberdeen Press and Journal, 1910-1927, and in 1928 he became director of the Aberdeen Journals.",6
9,"Maxwell, Sir William","Maxwell, Sir William (d. 1947) 2 During 1934-1936 Maxwell was chairman of the Eastern Divisional Council of the Scottish Unionist Association, and his activities in Unionist and Conservative party politics are reflected in miscellaneous correspondence, 1915-1939, from a variety of individuals. The Unionists or Liberal Unionists merged into the Conservative Party. On April 26, 1915, Lord Aberdeen responded to statements in the Aberdeen Journal about his retirement as Lord Lieutenant of Ireland. He",6
