# Pre-Processing Step
Note: grabbed list of Catalog's state_abbr from previous python data-cleaning set (renderer state index)

In [1]:
# Dictionary mapping custom state abbreviations to state names
state_abbr_dict = {
    "Conn": "Connecticut",
    "Ct": "Connecticut",
    "Me": "Maine",
    "Ma": "Massachusetts",
    "Mass": "Massachusetts",
    "NH": "New Hampshire",
    "RI": "Rhode Island",
    "Vt": "Vermont",
    "NJ": "New Jersey",
    "NYS": "New York State",
    "NYC": "New York City",
    "Pa": "Pennsylvania",
    "IN": "Indiana",
    "Ill": "Illinois",
    "Mich": "Michigan",
    "Ohio": "Ohio",
    "Wis": "Wisconsin",
    "Iowa": "Iowa",
    "Ka": "Kansas",
    "Minn": "Minnesota",
    "Mn": "Minnesota",
    "Mo": "Missouri",
    "NE": "Nebraska",
    "ND": "North Dakota",
    "SD": "South Dakota",
    "Del": "Delaware",
    "DC": "District of Columbia",
    "Fla": "Florida",
    "Ga": "Georgia",
    "Md": "Maryland",
    "NC": "North Carolina",
    "SC": "South Carolina",
    "Va": "Virginia",
    "WV": "West Virginia",
    "Ala": "Alabama",
    "Ky": "Kentucky",
    "MS": "Mississippi",
    "Tenn": "Tennessee",
    "AR": "Arkansas",
    "La": "Louisiana",
    "OK": "Oklahoma",
    "Tex": "Texas",
    "Ariz": "Arizona",
    "Col": "Colorado",
    "Colo": "Colorado",
    "ID": "Idaho",
    "NM": "New Mexico",
    "MT": "Montana",
    "Utah": "Utah",
    "NV": "Nevada",
    "WY": "Wyoming",
    "AK": "Alaska",
    "Ca": "California",
    "Cal": "California",
    "So Cal": "Southern California",
    "HI": "Hawaii",
    "OR": "Oregon",
    "Wash": "Washington"
}

In [2]:
# # Example usage:
# state_abbr = "NYC"
# state_name = state_abbr_dict.get(state_abbr)
# print(state_name)  # Output: New York City

### define _state_abbreviations_

In [3]:
# Extract state abbreviations from the dictionary
state_abbreviations = list(state_abbr_dict.keys())

# # Example usage:
# print(state_abbreviations)

# Divide Sample into Sections
Note: valid_sections were capitalized in txt file

In [4]:
import pandas as pd

# Function to parse the text and extract section headers
def parse_catalog(text):
    # Initialize variables
    section_data = {}
    current_section = None
    current_section_data = []
    
    # Define the list of valid section headers
    valid_sections = ['EARTHENWARE', 'MAJOLICA', 'LUSTERWARE', 'STONEWARE', 'IRONSTONE', 'PORCELAIN', 'TILES AND PLAQUES', 'CERAMIC FIGURES', 'CHALKWARE']

    # Split the text into lines
    lines = text.strip().split('\n')

    # Iterate over each line
    for line in lines:
        line = line.strip()
        
        # Check if the line is a section header (uppercase) and is in the valid_sections array
        if line.isupper() and line in valid_sections:
            if current_section:
                # Convert the current_section_data to a DataFrame and store it
                section_data[current_section] = pd.DataFrame(current_section_data, columns=['Data'])
                current_section_data = []  # Reset current_section_data for the new section
            current_section = line
        elif current_section:
            # Append the line to the current_section_data
            current_section_data.append(line)

    # Convert the data of the last section to a DataFrame and store it
    if current_section:
        section_data[current_section] = pd.DataFrame(current_section_data, columns=['Data'])

    return section_data

# Read the contents of the text file
with open('CatCeram_sample.txt', 'r') as file:
    text = file.read()

# Parse the catalog data
section_data = parse_catalog(text)

# # Display the DataFrames for each section
# for section, df in section_data.items():
#     print("Section:", section)
#     print(df)
#     print()

In [5]:
# section_data.items()

# Pre-Check Functions

In [6]:
# # Determine if line contains a state abbr
# def has_state_abbreviation(text):
#     # Split the hyphenated alphanumeric by hyphen and check if the first part is in state_abbreviations
#     parts = text.split('-')
#     if parts[0] in state_abbreviations:
#         return True
#     return False

In [7]:
# # Example usage:
# hyphenated_alphanumeric = "NYC-123"
# if has_state_abbreviation(hyphenated_alphanumeric):
#     print("The hyphenated alphanumeric contains a state abbreviation.")
# else:
#     print("The hyphenated alphanumeric does not contain a state abbreviation.")

## Check for lines with hyphenation
### 1. Accession Numbers
Note: determining if the line contains a state_abbreviation helps differentiate the Catalog's accession number from the microfiche (in this case, copied into the txt as an excess line to track random samples)
### 2. Date Ranges
### 3. Microfiche Location Numbers
Note: need this function to check again object names with 3 characters, i.e., "Jar"

In [8]:
# Function to check if a hyphenated alphanumeric contains a state abbreviation
def has_state_abbreviation(text):
    # Convert text to uppercase for case-insensitive comparison
    upper_text = text.upper()
    
    # Check if the text contains any of the state abbreviations
    for abbreviation in state_abbreviations:
        if abbreviation.upper() in upper_text:
            return True
    
    return False

# Function to check if a hyphenated alphanumeric contains a date range
def has_date_range(text):
    parts = text.split('-')
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return True
    return False

# Function to check if a hyphenated alphanumeric contains a combination of numbers and letters
def has_combination(line):
    """
    Check if the line matches the combination type pattern.

    Parameters:
        line (str): The line of text to check.

    Returns:
        bool: True if the line matches the combination type pattern, False otherwise.
    """
    # Remove quotation marks if they exist
    line = line.strip('"')
    parts = line.split('-')
    if len(parts) == 3:
        part1, part2, part3 = parts
        if part1.isdigit() and 1 <= int(part1) <= 20 and part2.isalpha() and len(part2) == 1 and 'A' <= part2 <= 'E' and part3.isdigit() and 1 <= int(part3) <= 12:
            return True
    return False

In [9]:
# # EXAMPLE: Iterate through the Earthenware DataFrame
# for index, row in earthenware_df.iterrows():
#     line = row['Data']
#     if '-' in line:
#         if has_state_abbreviation(line):
#             # Process state abbreviation type
#             print("State Abbreviation Type:", line)
#         elif has_date_range(line):
#             # Process date range type
#             print("Date Range Type:", line)
#         elif has_combination(line):
#             # Process combination type
#             print("Combination Type:", line)
#     else:
#         # Process other types of lines
#         print("Other Type:", line)

# Main Functions
Note: each section was made into a data frame to test and refine overall functions (helped determine nuances on smaller sets of data)

**[Next step: test main functions on section_data]**

**--------- EARTHENWARE ---------**

In [10]:
# --- EARTHENWARE ---
earthenware_df = section_data['EARTHENWARE']
# print(earthenware_df)

### Step 1: use _has_combination_ function
Note: this weeds out the extra microfiche line (sometimes a hyphenated line 1-D-8, other times a hyphenated line surrounded by quotation marks "1-D-8")

In [11]:
#STEP 1 (EARTHENWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
earthenware_df_filtered = earthenware_df[~earthenware_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Earthenware DataFrame after filtering out combination types:")
print(earthenware_df_filtered)

Earthenware DataFrame after filtering out combination types:
                                                 Data
0                                                    
2                                                 1D8
3                                          Flower pot
4      M: Kohler Pottery 1865-1915 Pensacola, Florida
5                                                Clay
..                                                ...
81                                        Earthenware
82                                 R: Frank Fumagalli
83  O: (1938) New York Historical Society, New Yor...
84                                         NYC-Cer-45
85                                                   

[76 rows x 1 columns]


### Step 2: split sections into individual objects
Note: ID (or clean microfiche location number) used to split section, lines remaining are bucketed as metadata

In [12]:
#STEP 2 (EARTHENWARE)
# Function to split the DataFrame by ID number
def split_by_id(df):
    # Initialize variables
    split_data = []
    current_id = None
    current_metadata = []

    # Iterate through the DataFrame
    for index, row in df.iterrows():
        line = row['Data']
        words = line.split()
        if len(words) == 1 and words[0].isalnum() and 3 <= len(words[0]) <= 5 and any(char.isdigit() for char in words[0]):
            # If a valid ID is found, append the previous ID and metadata to split_data
            if current_id is not None:
                split_data.append((current_id, current_metadata))
            # Update current_id and reset current_metadata
            current_id = words[0].strip()
            current_metadata = []
        else:
            # Append the line to current_metadata
            current_metadata.append(line.strip())
    
    # Append the last ID and metadata to split_data
    if current_id is not None:
        split_data.append((current_id, current_metadata))
    
    return split_data

# Split the filtered DataFrame by ID number
earthenware_split_data = split_by_id(earthenware_df_filtered)

# Display the split data
for id_number, metadata in earthenware_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 1D8
Metadata:
Flower pot
M: Kohler Pottery 1865-1915 Pensacola, Florida
Clay
R: Annie B. Johnston
O: (1937) Florida State Museum, Gainesville, Fla
Fla-Cer-34


ID: 2B12
Metadata:
Jar
M: Burr Frost 1847 Missouri
Clay pottery
R: Clyde L. Cheney
O: (1937) Judy Lund, Salt Lake City, Utah
Utah-Cer-3


ID: 3A6
Metadata:
Jar with cover
M: early 19th c Long Island, New York State
Glazed brown earthenware
R: Alvin Shiren
O: (1939) Metropolitan Museum of Art, New York City, NY
NYC-Cer-51


ID: 3E5
Metadata:
Jar
M: Pennsylvania
Earthenware
R: Yolande Delasser
O: (1938) Alfred B. Maclay, New York City, NY
NYC-Cer-68

ID: 4C12
Metadata:
Jar
M: John Eardley 186O’s St. George, Utah
Glazed clay
R: Clyde L. Cheney
O: Miss Fern Seegmiller, St. George, Utah
Utah-Cer-22

ID: 5A3
Metadata:
Pitcher
M: Bell late 18th c Strasburg, Virginia
Glazed earthenware
R: Hugh Ryan
O: (1940) Mrs Bess H. Muller, Chester County, Pa
Del-Cer-29

ID: 5A12
Metadata:
Jug
M: T. Grim 1866-70 Strasburg, Virginia
Coil pottery,

### Step 3: Parse metadata
Note: this step required the most fine tuning, especially for edge-cases like: accessions numbers that included "no class #" and if OCR pulled in semicolons instead of colons

**[Next step: will have to clean up accession numbers; maybe more successful after particular key-value pair is called]**

In [13]:
#STEP 3 (EARTHENWARE)
# Function to parse metadata and assign key-value pairs
def parse_metadata(metadata):
    parsed_data = {}
    name = None
    maker = None
    materials = []
    renderer = None
    owner = None
    code = None

# Iterate over each line in the metadata
    for line_index, line in enumerate(metadata):
        if name is None:
            # The first line without a colon or semicolon is considered as the object name
            name = line.strip()
        elif ':' in line or ';' in line:
            if ':' in line:
                separator = ':'
            elif ';' in line:
                separator = ';'

            key, value = line.split(separator, 1)
            key = key.strip()
            value = value.strip()
            if key == 'M':
                maker = value
            elif key == 'R':
                renderer = value
            elif key == 'O':
                owner = value
            else:
                parsed_data[key] = value
        elif line.strip():  # Check if the line is not empty
            # Check if the line fits the has_state_abbreviation type and "No class #"
            if has_state_abbreviation(line) and ("No class #" in line or "no class #" in line):
                code = line.strip()
            elif has_state_abbreviation(line):
                code = line.strip()
            elif "No class #" in line or "no class #" in line:  # Keep "No class #" in code
                code = line.strip()
            else:
                # Add the line to materials if it doesn't match any of the conditions above
                materials.append(line.strip())


    # Assign the collected metadata to the parsed data
    parsed_data['Name'] = name
    parsed_data['Maker'] = maker
    parsed_data['Materials'] = materials
    parsed_data['Renderer'] = renderer
    parsed_data['Owner'] = owner
    parsed_data['Code'] = code

    return parsed_data

# Split the filtered DataFrame by ID number
earthenware_split_data = split_by_id(earthenware_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in earthenware_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 1D8
Parsed Metadata:
Name: Flower pot
Maker: Kohler Pottery 1865-1915 Pensacola, Florida
Materials: 
Renderer: Annie B. Johnston
Owner: (1937) Florida State Museum, Gainesville, Fla
Code: Fla-Cer-34

ID: 2B12
Parsed Metadata:
Name: Jar
Maker: Burr Frost 1847 Missouri
Materials: 
Renderer: Clyde L. Cheney
Owner: (1937) Judy Lund, Salt Lake City, Utah
Code: Utah-Cer-3

ID: 3A6
Parsed Metadata:
Name: Jar with cover
Maker: early 19th c Long Island, New York State
Materials: 
Renderer: Alvin Shiren
Owner: (1939) Metropolitan Museum of Art, New York City, NY
Code: NYC-Cer-51

ID: 3E5
Parsed Metadata:
Name: Jar
Maker: Pennsylvania
Materials: 
Renderer: Yolande Delasser
Owner: (1938) Alfred B. Maclay, New York City, NY
Code: NYC-Cer-68

ID: 4C12
Parsed Metadata:
Name: Jar
Maker: John Eardley 186O’s St. George, Utah
Materials: 
Renderer: Clyde L. Cheney
Owner: Miss Fern Seegmiller, St. George, Utah
Code: Utah-Cer-22

ID: 5A3
Parsed Metadata:
Name: Pitcher
Maker: Bell late 18th c Strasburg, 

### Repeat Steps 1-3 for each data frame

**--------- MAJOLICA ---------**

In [14]:
# --- MAJOLICA ---
majolica_df = section_data['MAJOLICA']
# print(majolica_df)

In [15]:
#STEP 1 (MAJOLICA)
# Filter out rows with hyphenated alphanumerics of the combination type
majolica_df_filtered = majolica_df[~majolica_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Majolica DataFrame after filtering out combination types:")
print(majolica_df_filtered)

Majolica DataFrame after filtering out combination types:
                                         Data
0                                            
1                                         6D3
2                                     Pitcher
3                             R: Della Button
4                                  No class #
..                                        ...
69                                      Plate
70                          R: Dorothy Posten
71  O: (1936) Mrs William Gibson, Crafton, Pa
72                                 Pa-Ccr-425
73                                           

[74 rows x 1 columns]


In [16]:
#STEP 2 (MAJOLICA)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
majolica_split_data = split_by_id(majolica_df_filtered)

# Display the split data
for id_number, metadata in majolica_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 6D3
Metadata:
Pitcher
R: Della Button
No class #


ID: 6D4
Metadata:
Apple butter jar
M: before 1820 Lebanon, Pennsylvania
R: Harry Mann Waddell
O: (1937) Dora B. Talaferro, San Diego, Ca
So Cal-Cer-15a


ID: 6D5
Metadata:
Dog pitcher
M: Griffin, Smith and Hill, c. 1860 Phoenixville, Pennsylvania
American majolica
R: Ernest A. Towers
O: (1938) Theodore H. Buckalew, Wilmington, Del
Del-Cer-23


ID: 6D6
Metadata:
Pitcher
M: Griffin, Smith and Hill c. 1880-90 Phoenixville, Pennsylvania
R: Ernest A. Towers
O: (1938) Theodore H. Buckalew, Wilmington, Del
Del-Cer-24


ID: 6D7
Metadata:
Pitcher
M: Griffin, Smith and Hill c. 1880-90 Phoenixville, Pennsylvania
American majolica
R: Amos Brinton
O: (1938) Mrs T.H. Buckalew, Wilmington, Del
Del-Cer-26


ID: 6D8
Metadata:
Pitcher
M: Preston Moore c. 1820 Wilmington, Delaware
American majolica
R: Edward Loper
O: (1940) Mrs Mary Cabali, Wilmington, Del
Del-Cer-30


ID: 6D9
Metadata:
Pitcher
M: c. 1885
Crockery
R: Doris Hollingsworth
O: (1940) Sio

In [17]:
#STEP 3 (MAJOLICA)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
majolica_split_data = split_by_id(majolica_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in majolica_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 6D3
Parsed Metadata:
Name: Pitcher
Maker: None
Materials: 
Renderer: Della Button
Owner: None
Code: No class #

ID: 6D4
Parsed Metadata:
Name: Apple butter jar
Maker: before 1820 Lebanon, Pennsylvania
Materials: 
Renderer: Harry Mann Waddell
Owner: (1937) Dora B. Talaferro, San Diego, Ca
Code: So Cal-Cer-15a

ID: 6D5
Parsed Metadata:
Name: Dog pitcher
Maker: Griffin, Smith and Hill, c. 1860 Phoenixville, Pennsylvania
Materials: 
Renderer: Ernest A. Towers
Owner: (1938) Theodore H. Buckalew, Wilmington, Del
Code: Del-Cer-23

ID: 6D6
Parsed Metadata:
Name: Pitcher
Maker: Griffin, Smith and Hill c. 1880-90 Phoenixville, Pennsylvania
Materials: 
Renderer: Ernest A. Towers
Owner: (1938) Theodore H. Buckalew, Wilmington, Del
Code: Del-Cer-24

ID: 6D7
Parsed Metadata:
Name: Pitcher
Maker: Griffin, Smith and Hill c. 1880-90 Phoenixville, Pennsylvania
Materials: 
Renderer: Amos Brinton
Owner: (1938) Mrs T.H. Buckalew, Wilmington, Del
Code: Del-Cer-26

ID: 6D8
Parsed Metadata:
Name: Pitcher


**--------- LUSTERWARE ---------**

In [18]:
# --- LUSTERWARE ---
lusterware_df = section_data['LUSTERWARE']
# print(lusterware_df)

In [19]:
#STEP 1 (LUSTERWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
lusterware_df_filtered = lusterware_df[~lusterware_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Lusterware DataFrame after filtering out combination types:")
print(lusterware_df_filtered)

Lusterware DataFrame after filtering out combination types:
                                                 Data
0                                                    
1                                                 6E2
2                                              Plates
3   M: Benjamin Tucker 1832-38 Philadelphia, Penns...
4                                          Soft paste
5                                      R: Cora Parker
6            O: (1938) Cora Parker, Coral Gables, Fla
7                                          Fla-Cer-46
8                                                    
9                                                 6E3
10                                                Mug
11                                M: 1848 New England
12                                 R: Robert Scheurer
13  O: (1937) Mr and Mrs W.H. Richardson, Jersey C...
14                                           NJ-Cer-9
15                                                   
16                    

In [20]:
#STEP 2 (LUSTERWARE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
lusterware_split_data = split_by_id(lusterware_df_filtered)

# Display the split data
for id_number, metadata in lusterware_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 6E2
Metadata:
Plates
M: Benjamin Tucker 1832-38 Philadelphia, Pennsylvania
Soft paste
R: Cora Parker
O: (1938) Cora Parker, Coral Gables, Fla
Fla-Cer-46


ID: 6E3
Metadata:
Mug
M: 1848 New England
R: Robert Scheurer
O: (1937) Mr and Mrs W.H. Richardson, Jersey City, NJ
NJ-Cer-9


ID: 6E4
Metadata:
Bowl
M: c. 1800
Luster and pottery
R: Henry Marsh
NJ-Cer-34


ID: 6E5
Metadata:
Pitcher
M: c. 1850-60
Copper luster
R: Samuel O. Klein
O: (1936) Mrs Gladys Segar, Montclair, NJ
NJ-Cer-47


ID: 6E6
Metadata:
Creamer
R: Arthur Wegg
O: (1937) Mr and Mrs W.R. Richardson, Jersey City, NJ
NJ-Cer-133


ID: 6E7
Metadata:
Plate
M: Chelsea Ceramic Art Works 1872-89 Chelsea, Massachusetts
Stoneware
R: J. Howard lams
O: (1936) Mrs Lee Knode, Washington, Pa
Pa-Cer-148


ID: 6E8
Metadata:
Pitcher
R: Willoughby Ions
O: in Fairfax County, Va
Va-Cer-7


ID: 6E9
Metadata:
Shaving mug
M: 1860
Luster and china
R: Mary Ann Burton
O: (1940) Lily M. Moncure, Richmond, Va
Va-Ccr-8




In [21]:
#STEP 3 (LUSTERWARE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
lusterware_split_data = split_by_id(lusterware_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in lusterware_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 6E2
Parsed Metadata:
Name: Plates
Maker: Benjamin Tucker 1832-38 Philadelphia, Pennsylvania
Materials: 
Renderer: Cora Parker
Owner: (1938) Cora Parker, Coral Gables, Fla
Code: Fla-Cer-46

ID: 6E3
Parsed Metadata:
Name: Mug
Maker: 1848 New England
Materials: 
Renderer: Robert Scheurer
Owner: (1937) Mr and Mrs W.H. Richardson, Jersey City, NJ
Code: NJ-Cer-9

ID: 6E4
Parsed Metadata:
Name: Bowl
Maker: c. 1800
Materials: 
Renderer: Henry Marsh
Owner: None
Code: NJ-Cer-34

ID: 6E5
Parsed Metadata:
Name: Pitcher
Maker: c. 1850-60
Materials: Copper luster 
Renderer: Samuel O. Klein
Owner: (1936) Mrs Gladys Segar, Montclair, NJ
Code: NJ-Cer-47

ID: 6E6
Parsed Metadata:
Name: Creamer
Maker: None
Materials: 
Renderer: Arthur Wegg
Owner: (1937) Mr and Mrs W.R. Richardson, Jersey City, NJ
Code: NJ-Cer-133

ID: 6E7
Parsed Metadata:
Name: Plate
Maker: Chelsea Ceramic Art Works 1872-89 Chelsea, Massachusetts
Materials: 
Renderer: J. Howard lams
Owner: (1936) Mrs Lee Knode, Washington, Pa
Code: P

**--------- STONEWARE ---------**

In [22]:
# --- STONEWARE ---
stoneware_df = section_data['STONEWARE']
# print(stoneware_df)

In [23]:
#STEP 1 (STONEWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
stoneware_df_filtered = stoneware_df[~stoneware_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Stoneware DataFrame after filtering out combination types:")
print(stoneware_df_filtered)

Stoneware DataFrame after filtering out combination types:
                                                 Data
0                                                    
2                                                 7A6
3                          Two handled preserve crock
4   M: Swan and States 1800-30 Stonington, Connect...
5                                           Stoneware
..                                                ...
77                            Greyish brown stoneware
78                                R: Yolande Delasser
79  O: (1936) Elie Nadelman Collection, New York H...
80                                     NYC-Cer-St-164
81                                                   

[72 rows x 1 columns]


In [24]:
#STEP 2 (STONEWARE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
stoneware_split_data = split_by_id(stoneware_df_filtered)

# Display the split data
for id_number, metadata in stoneware_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 7A6
Metadata:
Two handled preserve crock
M: Swan and States 1800-30 Stonington, Connecticut
Stoneware
R: Jerome Hoxie
O: (1936) Dr C.P. Williams, Stonington, Ct
Conn-Cer-St-11

ID: 9C3
Metadata:
Jar: detail
M: possibly William A. Macquoid and Company 1864-76 New York City, New York State
Greyish tan stoneware, salt glaze
R: Yolande Delasser
O: (1937) Israel Putnam, The Cobweb Shop, Brooklyn, NY
NYC-Cer-St-l00d

ID: 9D7
Metadata:
Crock
M: C. Crolius c. 1800 Manhattan Wells, New York State
Grey stoneware
R: Yolande Delasser
O: (1936) Elie Nadelman, Museum of Folk Arts, New York City, NY
NYC-Cer-St-174d

ID: 10A4
Metadata:
Jar
M: C. Crolius 1815-48 New York City, New York State
Stoneware
R: John Tarantino
O: (1940) Walter H. Powers, New York City, NY
NYC-Cer-St-300

ID: 10A5
Metadata:
Jar
M: C. Crolius 1799 New York City, New York State
Grey stoneware. High glaze
R: John Garay
O: (1940) New York Historical Society, New York City, NY
NYC-Ccr-St-301

ID: 11BI0
Metadata:
Jar
M: Reidinger

In [25]:
#STEP 3 (STONEWARE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
stoneware_split_data = split_by_id(stoneware_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in stoneware_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 7A6
Parsed Metadata:
Name: Two handled preserve crock
Maker: Swan and States 1800-30 Stonington, Connecticut
Materials: 
Renderer: Jerome Hoxie
Owner: (1936) Dr C.P. Williams, Stonington, Ct
Code: Conn-Cer-St-11

ID: 9C3
Parsed Metadata:
Name: Jar: detail
Maker: possibly William A. Macquoid and Company 1864-76 New York City, New York State
Materials: 
Renderer: Yolande Delasser
Owner: (1937) Israel Putnam, The Cobweb Shop, Brooklyn, NY
Code: NYC-Cer-St-l00d

ID: 9D7
Parsed Metadata:
Name: Crock
Maker: C. Crolius c. 1800 Manhattan Wells, New York State
Materials: 
Renderer: Yolande Delasser
Owner: (1936) Elie Nadelman, Museum of Folk Arts, New York City, NY
Code: NYC-Cer-St-174d

ID: 10A4
Parsed Metadata:
Name: Jar
Maker: C. Crolius 1815-48 New York City, New York State
Materials: 
Renderer: John Tarantino
Owner: (1940) Walter H. Powers, New York City, NY
Code: NYC-Cer-St-300

ID: 10A5
Parsed Metadata:
Name: Jar
Maker: C. Crolius 1799 New York City, New York State
Materials: 
Render

**--------- IRONSTONE ---------**

In [26]:
# --- IRONSTONE ---
ironstone_df = section_data['IRONSTONE']
# print(ironstone_df)

In [27]:
#STEP 1 (IRONSTONE)
# Filter out rows with hyphenated alphanumerics of the combination type
ironstone_df_filtered = ironstone_df[~ironstone_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Ironstone DataFrame after filtering out combination types:")
print(ironstone_df_filtered)

Ironstone DataFrame after filtering out combination types:
                                              Data
0                                                 
2                                             16A7
3                                          Platter
4   M: Glascow Pottery c. 1875 Trenton, New Jersey
5                                        Ironstone
..                                             ...
76                                       Stoneware
77                               R: J. Howard Iams
78         O: (1937) Mrs Lee Knode, Washington, Pa
79                                      Pa-Cer-233
80                                                

[72 rows x 1 columns]


In [28]:
#STEP 2 (IRONSTONE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
ironstone_split_data = split_by_id(ironstone_df_filtered)

# Display the split data
for id_number, metadata in ironstone_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 16A7
Metadata:
Platter
M: Glascow Pottery c. 1875 Trenton, New Jersey
Ironstone
R: Joseph Sudek
O: (1937) Passaic County Historical Society, Paterson, NJ
NJ-Cer-72

ID: 16A8
Metadata:
Platter
M: Crescent Pottery 19th c Trenton, New Jersey
China
R: Joseph Sudek
O: (1937) Joseph Sudek, East Paterson, NJ
NJ-Cer-82

ID: 16A9
Metadata:
Pitcher
M: Ott and Brewer 19th c Trenton, New Jersey
Whiteware
R: Joseph Sudek
O: (1937) Joseph Sudek, East Paterson, NJ
NJ-Ccr-87

ID: 16A10
Metadata:
Soup tureen
M: Willets Manufacturing Company 19th c Trenton, New Jersey
Whiteware
R; Joseph Sudek
O: (1937) Joseph Sudek, East Paterson, NJ
NJ-Ccr-89

ID: 16B6
Metadata:
Pitcher
M: Empire Pottery c. 1884 Trenton, New Jersey
Ironstone china
R: Roberta Spicer
O: (1939) James McCreery, Brooklyn, NY
NYC-Cer-13

ID: 16B7
Metadata:
Mug
M: late 19th c
Ironstone
R: Roberta Spicer
O: (1939) James McCreery, Brooklyn, NY
NYC-Cer-25
“16-B-8"

ID: 16B8
Metadata:
Bowl and pitcher
M: United States Pottery 1853-58 Benning

In [29]:
#STEP 3 (IRONSTONE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
ironstone_split_data = split_by_id(ironstone_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in ironstone_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 16A7
Parsed Metadata:
Name: Platter
Maker: Glascow Pottery c. 1875 Trenton, New Jersey
Materials: 
Renderer: Joseph Sudek
Owner: (1937) Passaic County Historical Society, Paterson, NJ
Code: NJ-Cer-72

ID: 16A8
Parsed Metadata:
Name: Platter
Maker: Crescent Pottery 19th c Trenton, New Jersey
Materials: 
Renderer: Joseph Sudek
Owner: (1937) Joseph Sudek, East Paterson, NJ
Code: NJ-Cer-82

ID: 16A9
Parsed Metadata:
Name: Pitcher
Maker: Ott and Brewer 19th c Trenton, New Jersey
Materials: 
Renderer: Joseph Sudek
Owner: (1937) Joseph Sudek, East Paterson, NJ
Code: NJ-Ccr-87

ID: 16A10
Parsed Metadata:
Name: Soup tureen
Maker: Willets Manufacturing Company 19th c Trenton, New Jersey
Materials: 
Renderer: Joseph Sudek
Owner: (1937) Joseph Sudek, East Paterson, NJ
Code: NJ-Ccr-89

ID: 16B6
Parsed Metadata:
Name: Pitcher
Maker: Empire Pottery c. 1884 Trenton, New Jersey
Materials: 
Renderer: Roberta Spicer
Owner: (1939) James McCreery, Brooklyn, NY
Code: NYC-Cer-13

ID: 16B7
Parsed Metadata

**--------- PORCELAIN ---------**

In [30]:
# --- PORCELAIN ---
porcelain_df = section_data['PORCELAIN']
# print(porcelain_df)

In [31]:
#STEP 1 (PORCELAIN)
# Filter out rows with hyphenated alphanumerics of the combination type
porcelain_df_filtered = porcelain_df[~porcelain_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Porcelain DataFrame after filtering out combination types:")
print(porcelain_df_filtered)

Porcelain DataFrame after filtering out combination types:
                                              Data
0                                                 
2                                            16C11
3                                          Pitcher
4   M: Felix Hodt 1793 Connecticut or Rhode Island
5                                        R: Barnes
..                                             ...
75                                       Porcelain
76                             R: Byron A. Dingman
77          O: (1938) Mrs Smoots, Allison Park, Pa
78                                      Pa-Cer-395
79                                                

[70 rows x 1 columns]


In [32]:
#STEP 2 (PORCELAIN)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
porcelain_split_data = split_by_id(porcelain_df_filtered)

# Display the split data
for id_number, metadata in porcelain_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 16C11
Metadata:
Pitcher
M: Felix Hodt 1793 Connecticut or Rhode Island
R: Barnes
O: (1936) Ethnology Museum, Washington, DC
DC-Cer-12a

ID: 16D6
Metadata:
Mug
M: c. 1889 probably New Jersey
Porcelain
R: Samuel O. Klein
O: (1936) Florence Stevenson, East Orange, NJ
NJ-Cer-36

ID: 16D8
Metadata:
Pitcher
M: Bridgwood and Son 1892 Trenton, New Jersey
White porcelain
R: Joseph Sudek
O: (1936) Passaic County Historical Society, Paterson, NJ
NJ-Cer-54

ID: 17A5
Metadata:
Vase
M: c, 1870 New York City, New York State
Soft paste porcelain
R: Josephine Lindley
O: (1937) Mrs Austin E. Allen, West Palm Beach, Fla
Fla-Cer-l

ID: 17A12
Metadata:
Tie back
M: William H. Bloor, 1860 East Liverpool, Ohio
Parian ware
R: Richard Barnett
O: (1938) American Ceramic Society, Columbus Ohio
Ohio-Cer-25

ID: 17C12
Metadata:
Vase
M: c. 1871 purchased in Midway, Utah
Porcelain, hand painted
R: Gerald Transpota
O: (1937) Jane G. Corless, Los Angeles, Ca
So Cal-Cer-17

ID: 17E3
Metadata:
Pitcher
M: 1856-60 near

In [33]:
#STEP 3 (PORCELAIN)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
porcelain_split_data = split_by_id(porcelain_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in porcelain_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 16C11
Parsed Metadata:
Name: Pitcher
Maker: Felix Hodt 1793 Connecticut or Rhode Island
Materials: 
Renderer: Barnes
Owner: (1936) Ethnology Museum, Washington, DC
Code: DC-Cer-12a

ID: 16D6
Parsed Metadata:
Name: Mug
Maker: c. 1889 probably New Jersey
Materials: 
Renderer: Samuel O. Klein
Owner: (1936) Florence Stevenson, East Orange, NJ
Code: NJ-Cer-36

ID: 16D8
Parsed Metadata:
Name: Pitcher
Maker: Bridgwood and Son 1892 Trenton, New Jersey
Materials: 
Renderer: Joseph Sudek
Owner: (1936) Passaic County Historical Society, Paterson, NJ
Code: NJ-Cer-54

ID: 17A5
Parsed Metadata:
Name: Vase
Maker: c, 1870 New York City, New York State
Materials: 
Renderer: Josephine Lindley
Owner: (1937) Mrs Austin E. Allen, West Palm Beach, Fla
Code: Fla-Cer-l

ID: 17A12
Parsed Metadata:
Name: Tie back
Maker: William H. Bloor, 1860 East Liverpool, Ohio
Materials: 
Renderer: Richard Barnett
Owner: (1938) American Ceramic Society, Columbus Ohio
Code: Ohio-Cer-25

ID: 17C12
Parsed Metadata:
Name: Va

**--------- TILES AND PLAQUES ---------**

In [34]:
# --- TILES AND PLAQUES ---
tilesandplaques_df = section_data['TILES AND PLAQUES']
# print(tilesandplaques_df)

In [35]:
#STEP 1 (TILES AND PLAQUES)
# Filter out rows with hyphenated alphanumerics of the combination type
tilesandplaques_df_filtered = tilesandplaques_df[~tilesandplaques_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Tiles and Plaques DataFrame after filtering out combination types:")
print(tilesandplaques_df_filtered)

Tiles and Plaques DataFrame after filtering out combination types:
                                                 Data
0                                                    
1                                                18E2
2                         Tile: view of New Amsterdam
3                                             M: 1636
4                                         R: A. Zimet
..                                                ...
74                                     Glazed pottery
75                                      R: John Dixon
76  O: (1936) Historical Society of Pennsylvania, ...
77                                         Pa-Mscl-4b
78                                                   

[79 rows x 1 columns]


Note: testing due to "NYC-no class #" bug

In [36]:
# print(tilesandplaques_df_filtered.head(15))

In [37]:
#STEP 2 (TILES AND PLAQUES)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
tilesandplaques_split_data = split_by_id(tilesandplaques_df_filtered)

# Display the split data
for id_number, metadata in tilesandplaques_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 18E2
Metadata:
Tile: view of New Amsterdam
M: 1636
R: A. Zimet
O: New York Historical Society, New York City, NY
NYC-no class #


ID: 18E3
Metadata:
Ornament
M: c. 1860 Connecticut
Pottery
R: Dana Bartlett
O: (1937) A.B. Wheeler, Los Angeles, Ca
So Cal-Cer-4


ID: 18E4
Metadata:
Tile drain
M: Kohler Pottery 1868-1915 Pensacola. Florida
Earthenware clay
R: Annie B. Johnston
O: (1937) Florida State Museum. Gainesville. Fla
Fla-Ccr-37


ID: 18E5
Metadata:
Floor tile
M: M.A.W. and Company and Brosley, Salop 1756 (from The Isaac Drake House Washington Headquarters)
R: W.VI- Jennings
O: (1936) Original Drake House Washington, DC
NJ-Cer-3


ID: 18E6
Metadata:
Tiles
M: Zoar Pottery 1818-98 Zoar, Ohio
Glazed pottery
R: Angelo Bulone
O: (1938) Zoar Museum, Zoar, Ohio
Ohio-Cer-49


ID: 18E7
Metadata:
Fireplace
M: Baron Stiegel late 18th c
Glazed pottery
R: John Dixon
O: (1936) Historical Society of Pennsylvania, Philadelphia, Pa
Pa-Cer-59a


ID: 18E8
Metadata:
Fireplace tiles
M: Baron Stiegel

In [38]:
#STEP 3 (TILES AND PLAQUES)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
tilesandplaques_split_data = split_by_id(tilesandplaques_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in tilesandplaques_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 18E2
Parsed Metadata:
Name: Tile: view of New Amsterdam
Maker: 1636
Materials: 
Renderer: A. Zimet
Owner: New York Historical Society, New York City, NY
Code: NYC-no class #

ID: 18E3
Parsed Metadata:
Name: Ornament
Maker: c. 1860 Connecticut
Materials: Pottery 
Renderer: Dana Bartlett
Owner: (1937) A.B. Wheeler, Los Angeles, Ca
Code: So Cal-Cer-4

ID: 18E4
Parsed Metadata:
Name: Tile drain
Maker: Kohler Pottery 1868-1915 Pensacola. Florida
Materials: 
Renderer: Annie B. Johnston
Owner: (1937) Florida State Museum. Gainesville. Fla
Code: Fla-Ccr-37

ID: 18E5
Parsed Metadata:
Name: Floor tile
Maker: M.A.W. and Company and Brosley, Salop 1756 (from The Isaac Drake House Washington Headquarters)
Materials: 
Renderer: W.VI- Jennings
Owner: (1936) Original Drake House Washington, DC
Code: NJ-Cer-3

ID: 18E6
Parsed Metadata:
Name: Tiles
Maker: Zoar Pottery 1818-98 Zoar, Ohio
Materials: 
Renderer: Angelo Bulone
Owner: (1938) Zoar Museum, Zoar, Ohio
Code: Ohio-Cer-49

ID: 18E7
Parsed Metad

**--------- CERAMIC FIGURES ---------**

In [39]:
# --- CERAMIC FIGURES ---
ceramicfigures_df = section_data['CERAMIC FIGURES']
# print(ceramicfigures_df)

In [40]:
#STEP 1 (CERAMIC FIGURES)
# Filter out rows with hyphenated alphanumerics of the combination type
ceramicfigures_df_filtered = ceramicfigures_df[~ceramicfigures_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Ceramic Figures DataFrame after filtering out combination types:")
print(ceramicfigures_df_filtered)

Ceramic Figures DataFrame after filtering out combination types:
                                                 Data
0                                                    
2                                                19A3
3                                                Lion
4   M: John Sanders, Connecticut dated on back: 18...
5                                      Glazed ceramic
..                                                ...
75                                          Stoneware
76                                R: Yolande Delasser
77    O: (1939) George S. McKearin, Hoosick Falls, NY
78                                    NYC-Cer-St-261b
79                                                   

[70 rows x 1 columns]


In [41]:
#STEP 2 (CERAMIC FIGURES)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
ceramicfigures_split_data = split_by_id(ceramicfigures_df_filtered)

# Display the split data
for id_number, metadata in ceramicfigures_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 19A3
Metadata:
Lion
M: John Sanders, Connecticut dated on back: 1817 or 1877
Glazed ceramic
R: John Matulis
O: (1937) Wadsworth Atheneum, Hartford, Ct
Conn-Cer-38

ID: 19A4
Metadata:
Potpourri jar
M: Winkle Terra Cotta Company c. 1890 St Louis, Mo
R: Adolph Opstad
O: (1941) Paul Joseph, Chicago, 111
lll-Cer-28

ID: 19A6
Metadata:
Statuette
M: 1899 Bedford, Ohio
Porcelain
R: Ralph Atkinson
O: (1937) Mrs D. Zeber, Bellevue, Pa
Pa-Cer-282

ID: 19A9
Metadata:
Squirrel statuette
M: early 19th c Pennsylvania
Glazed earthenware
R: Yolande Delasser
O: (1937) Elie Nadelman, Museum of Folk Arts, New York City, NY
NYC-Cer-no class #

ID: 19A11
Metadata:
Statuette
M: 19 th c Pennsylvania
Earthenware
R: Frank Fumagalli
O: (1937) Elie Nadelman Collection, New York City, NY
NYC-Cer-41

ID: 19B3
Metadata:
Ceramic coach dog
M: Gallatin Works c. 1804 Greensboro, Pennsylvania
Unglazed potter’s clay
R: George Yanosko
O: (1940) Miss P. Abraham, Smithfield, Pa
Pa-Cer-177a

ID: 19B4
Metadata:
Lion and ba

In [42]:
#STEP 3 (CERAMIC FIGURES)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
ceramicfigures_split_data = split_by_id(ceramicfigures_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in ceramicfigures_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 19A3
Parsed Metadata:
Name: Lion
Maker: John Sanders, Connecticut dated on back: 1817 or 1877
Materials: 
Renderer: John Matulis
Owner: (1937) Wadsworth Atheneum, Hartford, Ct
Code: Conn-Cer-38

ID: 19A4
Parsed Metadata:
Name: Potpourri jar
Maker: Winkle Terra Cotta Company c. 1890 St Louis, Mo
Materials: lll-Cer-28 
Renderer: Adolph Opstad
Owner: (1941) Paul Joseph, Chicago, 111
Code: None

ID: 19A6
Parsed Metadata:
Name: Statuette
Maker: 1899 Bedford, Ohio
Materials: 
Renderer: Ralph Atkinson
Owner: (1937) Mrs D. Zeber, Bellevue, Pa
Code: Pa-Cer-282

ID: 19A9
Parsed Metadata:
Name: Squirrel statuette
Maker: early 19th c Pennsylvania
Materials: 
Renderer: Yolande Delasser
Owner: (1937) Elie Nadelman, Museum of Folk Arts, New York City, NY
Code: NYC-Cer-no class #

ID: 19A11
Parsed Metadata:
Name: Statuette
Maker: 19 th c Pennsylvania
Materials: 
Renderer: Frank Fumagalli
Owner: (1937) Elie Nadelman Collection, New York City, NY
Code: NYC-Cer-41

ID: 19B3
Parsed Metadata:
Name: Cer

**--------- CHALKWARE ---------**

In [43]:
# --- CHALKWARE ---
chalkware_df = section_data['CHALKWARE']
# print(chalkware_df)

In [44]:
#STEP 1 (CHALKWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
chalkware_df_filtered = chalkware_df[~chalkware_df['Data'].apply(has_combination)]

# Display the filtered DataFrame
print("Chalkware DataFrame after filtering out combination types:")
print(chalkware_df_filtered)

Chalkware DataFrame after filtering out combination types:
                                                 Data
0                                                    
2                                               19C10
3                         Dove: plaster-of-Paris bank
4   M: c. 1860 New Jersey Plaster-of-Paris cast in...
5                                  R: Elisabeth Fulda
..                                                ...
75                                      M: mid 19th c
76                                          Chalkware
77                                      R: Mina Lowry
78  O: (1940) Miss Elena Wade Jack, New York City, NY
79                                     NYC-Mscl-Ch-49

[70 rows x 1 columns]


In [45]:
#STEP 2 (CHALKWARE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
chalkware_split_data = split_by_id(chalkware_df_filtered)

# Display the split data
for id_number, metadata in chalkware_split_data:
    print("ID:", id_number)
    print("Metadata:")
    for line in metadata:
        print(line)
    print()

ID: 19C10
Metadata:
Dove: plaster-of-Paris bank
M: c. 1860 New Jersey Plaster-of-Paris cast in the round; painted
R: Elisabeth Fulda
O: (1937) Bruce Buttfield, New York City, NY
NYC-Mscl-Vic-1

ID: 19C12
Metadata:
Pigeon bank
M: 19th c Pennsylvania
Chalkwarc
R: Emanuel Jacobson
O; (1938) Mrs B.F. Ramsdell, Geneva, Ill
Ill-Ca-39

ID: 19D1
Metadata:
Deer figurine
M: early 19th c Pennsylvania
Painted chalkware
R; Milton Bevier
O: (1939) Mrs James Cady Ewell, Highland Park, Ill
Ill-Ca-47

ID: 19D3
Metadata:
Portrait bust
M; 19th c Pennsylvania
Chalkware
R; Mina Lowry
O; (1936) Mrs J.D. Rockefeller, New York City, NY
NYC-Mscl-Ch-7

ID: 19D7
Metadata:
Figurine: ‘Queen Victoria, Empress of India’
M: 19th c Pennsylvania
Chalkware
R; Mina Lowry
O: (1936) American Folk Art Gallery, New York City, NY
NYC-Mscl-Ch-37

ID: 19D12
Metadata:
Chalkware rooster
M; c. 1860 probably York County, Pennsylvania
Painted plaster
R: Elmer R. Kottcamp
O: (1940) Mrs W.H. Wierman, York, Pa
Pa-Mscl-304

ID: 19E12
Me

In [46]:
#STEP 3 (CHALKWARE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
chalkware_split_data = split_by_id(chalkware_df_filtered)

# Display the split data with parsed metadata
for id_number, metadata in chalkware_split_data:
    print("ID:", id_number)
    print("Parsed Metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'Materials':
            print(f"{key.capitalize()}: ", end="")  # Print "Materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key.capitalize()}: {value}")
    print()

ID: 19C10
Parsed Metadata:
Name: Dove: plaster-of-Paris bank
Maker: c. 1860 New Jersey Plaster-of-Paris cast in the round; painted
Materials: 
Renderer: Elisabeth Fulda
Owner: (1937) Bruce Buttfield, New York City, NY
Code: NYC-Mscl-Vic-1

ID: 19C12
Parsed Metadata:
Name: Pigeon bank
Maker: 19th c Pennsylvania
Materials: 
Renderer: Emanuel Jacobson
Owner: (1938) Mrs B.F. Ramsdell, Geneva, Ill
Code: Ill-Ca-39

ID: 19D1
Parsed Metadata:
Name: Deer figurine
Maker: early 19th c Pennsylvania
Materials: 
Renderer: Milton Bevier
Owner: (1939) Mrs James Cady Ewell, Highland Park, Ill
Code: Ill-Ca-47

ID: 19D3
Parsed Metadata:
Name: Portrait bust
Maker: 19th c Pennsylvania
Materials: 
Renderer: Mina Lowry
Owner: (1936) Mrs J.D. Rockefeller, New York City, NY
Code: NYC-Mscl-Ch-7

ID: 19D7
Parsed Metadata:
Name: Figurine: ‘Queen Victoria, Empress of India’
Maker: 19th c Pennsylvania
Materials: 
Renderer: Mina Lowry
Owner: (1936) American Folk Art Gallery, New York City, NY
Code: NYC-Mscl-Ch-37

I

# End Test
Note: need to follow next steps, 1) use main functions on section_data sample as a whole; 2) determine when to bring in other functions for cleaning OCR errors, i.e, "Ccr" should be "Cer" (this will be more important when extracting data from other sections of the Catalog)