In [3]:
import pandas as pd
# Display configuration: show the full text of every cell and never truncate rows.
# Full option keys are used on purpose; pandas also accepts unambiguous abbreviations
# (e.g. "display.max_row"), but those can break if similarly named options are added.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
import numpy as np
import re
import nltk

In [31]:
# Function that tabulates "points" for a card to be a "main entry" vs. "narrative," returns True if it's a "main entry"
def is_main_entry(index, row, frame=None):
    """Classify a catalog card as a "main entry" (True) or a "narrative" card (False).

    A card whose 'Name' differs from the previous card's (or the very first card)
    is always a main entry. Otherwise a handful of text heuristics each award one
    point to either side, and ties are resolved in favour of "main entry".

    Parameters
    ----------
    index : int
        Zero-based position of the card; the previous card is read with
        ``.iloc[index - 1]``.
    row : mapping / pandas.Series
        The card itself; must provide 'Name' and 'Text'.
    frame : pandas.DataFrame, optional
        Table used to look up the previous card. Defaults to the notebook-level
        ``df`` so existing two-argument calls keep working.

    Returns
    -------
    bool
        True for a main entry, False for a narrative card.
    """
    # Check for new author -> automatic "main entry"
    if index == 0:
        return True
    cards = df if frame is None else frame
    if row['Name'] != cards.iloc[index - 1]['Name']:
        return True

    # Missing values are read by pandas as float NaN; treat them as empty strings
    # so the len()/regex checks below do not raise.
    text = row['Text'] if isinstance(row['Text'], str) else ""
    name = row['Name'] if isinstance(row['Name'], str) else ""

    # Calculate "points" for classification
    main_entry_pts, narrative_pts = 0, 0

    # Check length, longer -> narrative, shorter -> main entry
    if len(text) > 300:
        narrative_pts += 1
    elif len(text) < 200:
        main_entry_pts += 1

    # Check for misc. main entry identifiers ("MSS.", "Section X", "X items") -> main entry
    for marker in ("MSS", "Section", "items"):
        if marker in text:
            main_entry_pts += 1

    # Check for capitalized last name & card number (Card 2, JONES 3, YANCEY -2-, etc.) -> narrative
    # 'Name' is stored as "Last, First ...", so only the part before the first comma
    # is the surname; the uppercased full name never appears in a continuation header.
    # The card number is limited to 1-3 digits so that a year following the surname
    # ("JONES 1846") is not mistaken for a card number.
    last_name = name.split(',')[0].strip().upper()
    if re.search(r"Card [0-9]+", text):
        narrative_pts += 1
    elif last_name and re.search(re.escape(last_name) + r"\W{0,3}[0-9]{1,3}(?![0-9])", text):
        narrative_pts += 1

    # TODO: Check for "sentence format" (pos only present in narrative [verb, adj.?]) -> narrative
    # (The nltk tokenizing/POS tagging that used to run here was removed because its
    # result was never used; reintroduce it together with this check.)

    # TODO: Check for NO title, date, location, etc. -> narrative
    # Contains no date (any 4-digit run starting with 1 counts as a year)
    if not re.search(r"[1][0-9]{3}", text):
        narrative_pts += 1

    # TODO: Check for date range and accessioned date -> main entry
    if re.search(r"([1][0-9]{3})-([0-9]{1,4})", text):
        main_entry_pts += 1

    # Determine card classification (ties count as main entry)
    return main_entry_pts >= narrative_pts

In [24]:
# Load the cards of one catalog drawer into the notebook-level `df`, which
# is_main_entry() reads to look up the previous card.
# NOTE(review): the displayed table has an "Unnamed: 0" column, so the CSV appears to
# carry a saved index column; consider index_col=0 — confirm before changing.
df = pd.read_csv("catalog/file_name_checked/drawer_231.csv")

In [32]:
# Check every card and classify as "main entry" or "narrative", number by collection
collection_num, collection_list = -1, []
# is_main_entry() treats its first argument as a position (it reads df.iloc[index-1]),
# so iterate with a fresh 0..n-1 index rather than df's own labels; the two only
# coincide while df still has its default RangeIndex.
for index, row in df.reset_index(drop=True).iterrows():
    # Main entries signify a new collection
    if is_main_entry(index, row):
        collection_num += 1
    collection_list.append(collection_num)

# Add column to dataframe with collections numbered (list is assigned row by row, in order)
df['Collection'] = collection_list

In [34]:
# Display the first 100 cards, including the 'Collection' column, for a visual check
df.head(100)

Unnamed: 0,Name,Text,Collection
0,Yadkin Falls Manufacturing Company,Yadkin Falls Manufacturing Company Papers See William Alexander Smith Papers,0
1,"Yancey, Benjamin Cudworth","YANCEY, Benjamin Cudworth Letters, 1846-82 Charleston, 8S. C. Cab. 45 102 pieces",1
2,"Yancey, Benjamin Cudworth","YANCEY, Benjamin Cudworth Letters 1846-82 Charleston, 8S. C. ""102 pieces Benjamin Cudworth Yancey was born in Charles ton, S. 0., in 1817, His father died when he was only a few months 01d and his mother returned to Ga. There they remained until 1822 when the mother married N. S. S$. Beman and went to Troy, N.Y. Benjamin was educated at the U. of Ga. and at York, Apout 1838 he joined his brother, Wm. Lowndes, in Alabama where he farmed and published the Wetumpka Gazette. (For more detailed sketch see National Cyclopedia of American Biography, XIII, 560.) aes",2
3,"Yancey, Benjamin Cudworth","YANCEY -2- This collection begins in 1846 after B. G¢ Yancey had married Sarah Paris, daughter of Thos. Napier Hamilton, Columbia Co., @a., and after he had returned to S. C. and was practicing law at Hamburg and was in the state legislature. After about ten years at Hamburg, he went (1851) to hte Coosa river plantation (probably in Dallas Co., Ala.). The letters of this period deal with plantation affairs. There is aletter from James Hamilton, Feb, 27, 1859, which deals with the settlement of the estate of T. N, Hamilton and whdeh reveals that good slaves were at that time",2
4,"Yancey, Benjamin Cudworth","YANCEY -3- bringing from $1,000 - $1,700. In 1858 Yancey went to Argentina as minister resident. These are ezceptional letters tn that they tell of the difficulties that Yancey had in negothating a treaty with that South American state (1859). By Aug. 1861, Yancey w&s with Cobb's Legion at Petersburg, Va. The Civil War letters of this collection are from Mrs, Yancey, who lived in Atlanta at least during the first year of the war, Her letters have a great deal to say about business affairs, and wspecially the trouble people wk¥KX were having with slave labor. After",2
5,"Yancey, Benjamin Cudworth","YANCEY -4- , the war Yancey lived on a plantation near Albany, Ga. The letters of this period are few and little can be gained from them, Yancey's son, Hamilton (c. 1848) was a stu- dent at the U. of Ga, in 1866 and at the U. of Va in 1868, His daughter, Mary Lohise (b. 1861) probably went to school in Staunton, Va, In addition to the letters of Benjamin C. and Sarah (Hamilton) Yancey there are seme from their children Caro, Hamilton, Mary Louise, (Caro Yancey was a daughter by a former marriage),",2
6,"Yancey, Benjamin Cudworth","YANCEY -5- from cousins and from Eye, Jeannie and Rebecca Hamilton, sisters of Mrs. Yancey. There is one from Wm. Lowndes Yancey. The collection is especially valuable for the information that 1t contains on plantation life and for Yancey's part in negotiation ghe treaty with Argentina,",2
7,"Yancey, William Lowndes","Yancey, William Lowndes Letters. 1846, Wetumpka, Elmore Co., Alabama ‘Section A 2 pieces July 5, 1959,",3
8,"Yancey, William Lowndes","YANCEY, William Lowndes, Letters, 1846, W William Lowndes Yancey (Aug.10,1814—July 28, 1863) was born in Ogeechee Shoals,Ga,,the son of Benjamin C,Yancey who was a lawyer of Abbeville, §.C. Trained as a lawyer, W.L.Yancey moved to Ala, in 1836, There he edited the Cahaba Demo- crat and the Wetumpka Argus. He served in the legislature and in 1844 was elected to Congress but resigned in 1847. He was an ardent state rights advocate,opposed the compromise of 1850, and urged secession. He was a member of the Con federate Congress at the time of his death, The two letters are m= with politics,",3
9,"Yantis, Solomon Vance","Yantis, Solomon Vance Papers, 1863-1896 Harper's Ferry, Jefferson Co., W. Va. Section Aa 56 items 5-3-61",4
