# Exporting CeramSample to JSON
Note: amending steps from CeramSample analysis to export into JSON for data visualization testing

Exported JSONs:

chalkware_parsed_metadata.json (testing a data frame)

combined_data.json (all data frames into one json)

## Pre-Processing Step
Note: grabbed list of Catalog's state_abbr from previous python data-cleaning set (renderer state index)

In [7]:
# Dictionary mapping the Catalog's custom abbreviations to full place names.
# Several places appear under more than one spelling (e.g. "Conn"/"Ct",
# "Ma"/"Mass", "Ca"/"Cal"), and a few entries are not states
# ("NYC", "So Cal", "DC").
state_abbr_dict = {
    "Conn": "Connecticut",
    "Ct": "Connecticut",
    "Me": "Maine",
    "Ma": "Massachusetts",
    "Mass": "Massachusetts",
    "NH": "New Hampshire",
    "RI": "Rhode Island",
    "Vt": "Vermont",
    "NJ": "New Jersey",
    "NYS": "New York State",
    "NYC": "New York City",
    "Pa": "Pennsylvania",
    "IN": "Indiana",
    "Ill": "Illinois",
    "Mich": "Michigan",
    "Ohio": "Ohio",
    "Wis": "Wisconsin",
    "Iowa": "Iowa",
    "Ka": "Kansas",
    "Minn": "Minnesota",
    "Mn": "Minnesota",
    "Mo": "Missouri",
    "NE": "Nebraska",
    "ND": "North Dakota",
    "SD": "South Dakota",
    "Del": "Delaware",
    "DC": "District of Columbia",
    "Fla": "Florida",
    "Ga": "Georgia",
    "Md": "Maryland",
    "NC": "North Carolina",
    "SC": "South Carolina",
    "Va": "Virginia",
    "WV": "West Virginia",
    "Ala": "Alabama",
    "Ky": "Kentucky",
    "MS": "Mississippi",
    "Tenn": "Tennessee",
    "AR": "Arkansas",
    "La": "Louisiana",
    "OK": "Oklahoma",
    "Tex": "Texas",
    "Ariz": "Arizona",
    "Col": "Colorado",
    "Colo": "Colorado",
    "ID": "Idaho",
    "NM": "New Mexico",
    "MT": "Montana",
    "Utah": "Utah",
    "NV": "Nevada",
    "WY": "Wyoming",
    "AK": "Alaska",
    "Ca": "California",
    "Cal": "California",
    "So Cal": "Southern California",
    "HI": "Hawaii",
    "OR": "Oregon",
    "Wash": "Washington"
}

In [8]:
# # Example usage:
# state_abbr = "NYC"
# state_name = state_abbr_dict.get(state_abbr)
# print(state_name)  # Output: New York City

### define _state_abbreviations_

In [9]:
# Collect the abbreviation keys (in dictionary order) into a plain list
state_abbreviations = list(state_abbr_dict)

# # Example usage:
# print(state_abbreviations)

## Divide Sample into Sections
Note: valid_sections were capitalized in txt file

In [11]:
import pandas as pd

def parse_catalog(text, valid_sections=None):
    """Split the catalog text into one DataFrame per section.

    A line counts as a section header when, after stripping surrounding
    whitespace, it exactly equals one of ``valid_sections``.  Every following
    line (stripped, blank lines included) belongs to that section until the
    next header.  Lines before the first header are discarded.

    Parameters:
        text (str): Full contents of the catalog text file.
        valid_sections (iterable of str, optional): Header lines to recognise.
            Defaults to the nine capitalised ceramic section names.

    Returns:
        dict: Maps each header found to a DataFrame with a single 'Data'
        column holding that section's lines.  If a header occurs more than
        once, only the lines after its last occurrence are kept.
    """
    if valid_sections is None:
        valid_sections = (
            'EARTHENWARE', 'MAJOLICA', 'LUSTERWARE', 'STONEWARE', 'IRONSTONE',
            'PORCELAIN', 'TILES AND PLAQUES', 'CERAMIC FIGURES', 'CHALKWARE',
        )
    headers = set(valid_sections)

    lines_by_section = {}
    current_section = None

    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if line in headers:
            # Start (or restart) the buffer for this section
            current_section = line
            lines_by_section[current_section] = []
        elif current_section is not None:
            lines_by_section[current_section].append(line)

    return {
        section: pd.DataFrame(section_lines, columns=['Data'])
        for section, section_lines in lines_by_section.items()
    }

# Read the contents of the text file
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used — confirm it matches how CatCeram_sample.txt was saved.
with open('CatCeram_sample.txt', 'r') as file:
    text = file.read()

# Parse the catalog data: section_data maps each section header found in the
# text to a one-column ('Data') DataFrame of that section's lines.
section_data = parse_catalog(text)

# # Display the DataFrames for each section
# for section, df in section_data.items():
#     print("Section:", section)
#     print(df)
#     print()

In [12]:
# section_data.items()

## Pre-Check Functions

In [13]:
# # Determine if line contains a state abbr
# def has_state_abbreviation(text):
#     # Split the hyphenated alphanumeric by hyphen and check if the first part is in state_abbreviations
#     parts = text.split('-')
#     if parts[0] in state_abbreviations:
#         return True
#     return False

In [14]:
# # Example usage:
# hyphenated_alphanumeric = "NYC-123"
# if has_state_abbreviation(hyphenated_alphanumeric):
#     print("The hyphenated alphanumeric contains a state abbreviation.")
# else:
#     print("The hyphenated alphanumeric does not contain a state abbreviation.")

### Check for lines with hyphenation
#### 1. Accession Numbers
Note: determining if the line contains a state_abbreviation helps differentiate the Catalog's accession number from the microfiche (in this case, copied into the txt as an excess line to track random samples)
#### 2. Date Ranges
#### 3. Microfiche Location Numbers
Note: this function is needed to check against object names with 3 characters, e.g., "Jar"

In [15]:
def has_state_abbreviation(text, abbreviations=None):
    """Check whether a hyphenated code starts with a known state abbreviation.

    Only the segment before the first hyphen is compared (case-insensitively,
    ignoring surrounding whitespace), e.g. "Fla" in "Fla-Cer-34" or "So Cal"
    in "So Cal-Cer-15a".  A plain substring search is deliberately avoided:
    short abbreviations such as "IN", "OR", "La" or "Me" occur inside ordinary
    words, so it would flag almost any line of text.

    Parameters:
        text (str): The line of text to check.
        abbreviations (iterable of str, optional): Abbreviations to accept.
            Defaults to the module-level ``state_abbreviations`` list.

    Returns:
        bool: True if the text contains a hyphen and the part before the first
        hyphen is one of the abbreviations, False otherwise.
    """
    if abbreviations is None:
        abbreviations = state_abbreviations

    prefix, hyphen, _ = text.partition('-')
    if not hyphen:
        # No hyphen at all: this is not a hyphenated accession number
        return False

    known = {abbreviation.upper() for abbreviation in abbreviations}
    return prefix.strip().upper() in known

def has_date_range(text):
    """Return True when text is exactly two all-digit parts joined by one hyphen."""
    first, hyphen, second = text.partition('-')
    # Exactly one hyphen: one was found and none remains in the tail
    exactly_one_hyphen = bool(hyphen) and '-' not in second
    return exactly_one_hyphen and first.isdigit() and second.isdigit()

def has_combination(line):
    """
    Tell whether a line is a three-part number-letter-number code.

    Surrounding double quotes are ignored.  The line must then split on
    hyphens into exactly three parts: a number from 1 to 20, a single capital
    letter from A to E, and a number from 1 to 12 (for example 1-D-8).

    Parameters:
        line (str): The line of text to check.

    Returns:
        bool: True if the line has that shape, False otherwise.
    """
    pieces = line.strip('"').split('-')
    if len(pieces) != 3:
        return False

    leading, letter, trailing = pieces

    # The checks run in the same order as the three parts appear
    if not leading.isdigit() or not 1 <= int(leading) <= 20:
        return False
    if letter not in ('A', 'B', 'C', 'D', 'E'):
        return False
    if not trailing.isdigit():
        return False
    return 1 <= int(trailing) <= 12

In [16]:
# # EXAMPLE: Iterate through the Earthenware DataFrame
# for index, row in earthenware_df.iterrows():
#     line = row['Data']
#     if '-' in line:
#         if has_state_abbreviation(line):
#             # Process state abbreviation type
#             print("State Abbreviation Type:", line)
#         elif has_date_range(line):
#             # Process date range type
#             print("Date Range Type:", line)
#         elif has_combination(line):
#             # Process combination type
#             print("Combination Type:", line)
#     else:
#         # Process other types of lines
#         print("Other Type:", line)

## Main Functions
Note: each section was made into a data frame to test and refine overall functions (helped determine nuances on smaller sets of data)

**[Next step: test main functions on section_data]**

**--------- EARTHENWARE ---------**

In [17]:
# --- EARTHENWARE ---
# Raw lines of the EARTHENWARE section as a one-column ('Data') DataFrame, built by parse_catalog
earthenware_df = section_data['EARTHENWARE']
# print(earthenware_df)

### Step 1: use _has_combination_ function
Note: this weeds out the extra microfiche line (sometimes a hyphenated line 1-D-8, other times a hyphenated line surrounded by quotation marks "1-D-8")

In [82]:
#STEP 1 (EARTHENWARE)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
earthenware_df_filtered = earthenware_df[~earthenware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Earthenware DataFrame after filtering out combination types:")
# print(earthenware_df_filtered)

### Step 2: split sections into individual objects
Note: the ID (the cleaned microfiche location number) is used to split each section into objects; the remaining lines are bucketed as metadata. Addition: an object type is assigned to each object.

In [136]:
#STEP 2 (EARTHENWARE)
def split_by_id(df, object_type="new_type"):
    """Group the rows of a section DataFrame into one record per object ID.

    A row is treated as an ID when it is a single alphanumeric token of 3 to 5
    characters containing at least one digit.  Every other row is attached
    (stripped) to the most recent ID; rows before the first ID are dropped.

    Parameters:
        df (DataFrame): Section data with the lines in a 'Data' column.
        object_type (str): Label stored with every record.

    Returns:
        list: (id, object_type, metadata_lines) tuples in order of appearance.
    """
    records = []
    active_id = None
    bucket = []

    for raw_line in df['Data']:
        tokens = raw_line.split()
        candidate = tokens[0] if len(tokens) == 1 else None
        is_id = (
            candidate is not None
            and candidate.isalnum()
            and 3 <= len(candidate) <= 5
            and any(ch.isdigit() for ch in candidate)
        )

        if not is_id:
            bucket.append(raw_line.strip())
            continue

        # A new ID closes the record that was being collected
        if active_id is not None:
            records.append((active_id, object_type, bucket))
        active_id = candidate
        bucket = []

    # Flush the record still open when the rows run out
    if active_id is not None:
        records.append((active_id, object_type, bucket))

    return records

# Split the filtered Earthenware DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "earthenware".
earthenware_split_data = split_by_id(earthenware_df_filtered, object_type="earthenware")

# # Display the split data
# for id_number, object_type, metadata in earthenware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

### Step 3: Parse metadata
Note: this step required the most fine-tuning, especially for edge cases such as accession numbers that include "no class #" and lines where OCR pulled in semicolons instead of colons

**[Next step: will have to clean up accession numbers; maybe more successful after particular key-value pair is called]**

In [162]:
#STEP 3 (EARTHENWARE)
def parse_metadata(metadata):
    """Turn one object's metadata lines into a dictionary of named fields.

    Lines are interpreted in order:

    * Blank lines are ignored.
    * The first non-blank line is the object name, taken verbatim even when
      it contains a colon (e.g. "Jar: detail").
    * A later line containing ':' (or ';' when there is no colon, which
      covers OCR misreads) is split once into key and value.  Keys 'M', 'R'
      and 'O' fill maker, renderer and owner; any other key is stored under
      its own name.
    * A remaining line is the accession code when has_state_abbreviation
      accepts it or when it contains "No class #" / "no class #"; a later
      such line replaces an earlier one.
    * Anything else is collected as a material.

    Parameters:
        metadata (list of str): The lines belonging to one object.

    Returns:
        dict: Any extra key/value pairs, followed by 'name', 'maker',
        'materials' (list), 'renderer', 'owner' and 'code'.  Fields that were
        not found are None ('materials' is an empty list).
    """
    parsed_data = {}
    name = None
    maker = None
    materials = []
    renderer = None
    owner = None
    code = None

    for line in metadata:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no information and must not become the name
            continue

        if name is None:
            name = stripped
            continue

        if ':' in line or ';' in line:
            # Prefer the colon; fall back to a semicolon the OCR produced
            separator = ':' if ':' in line else ';'
            key, value = (part.strip() for part in line.split(separator, 1))
            if key == 'M':
                maker = value
            elif key == 'R':
                renderer = value
            elif key == 'O':
                owner = value
            else:
                parsed_data[key] = value
        elif has_state_abbreviation(line) or "No class #" in line or "no class #" in line:
            code = stripped
        else:
            materials.append(stripped)

    # Assign the collected metadata to the parsed data
    parsed_data['name'] = name
    parsed_data['maker'] = maker
    parsed_data['materials'] = materials
    parsed_data['renderer'] = renderer
    parsed_data['owner'] = owner
    parsed_data['code'] = code

    return parsed_data

# Split the filtered DataFrame by ID number (same call as in Step 2)
earthenware_split_data = split_by_id(earthenware_df_filtered, object_type="earthenware")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in earthenware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 1D8
type: earthenware
metadata:
name: Flower pot
maker: Kohler Pottery 1865-1915 Pensacola, Florida
materials: 
renderer: Annie B. Johnston
owner: (1937) Florida State Museum, Gainesville, Fla
code: Fla-Cer-34

ID: 2B12
type: earthenware
metadata:
name: Jar
maker: Burr Frost 1847 Missouri
materials: 
renderer: Clyde L. Cheney
owner: (1937) Judy Lund, Salt Lake City, Utah
code: Utah-Cer-3

ID: 3A6
type: earthenware
metadata:
name: Jar with cover
maker: early 19th c Long Island, New York State
materials: 
renderer: Alvin Shiren
owner: (1939) Metropolitan Museum of Art, New York City, NY
code: NYC-Cer-51

ID: 3E5
type: earthenware
metadata:
name: Jar
maker: Pennsylvania
materials: 
renderer: Yolande Delasser
owner: (1938) Alfred B. Maclay, New York City, NY
code: NYC-Cer-68

ID: 4C12
type: earthenware
metadata:
name: Jar
maker: John Eardley 186O’s St. George, Utah
materials: 
renderer: Clyde L. Cheney
owner: Miss Fern Seegmiller, St. George, Utah
code: Utah-Cer-22

ID: 5A3
type: earth

### Repeat Steps 1-3 for each data frame

**--------- MAJOLICA ---------**

In [21]:
# --- MAJOLICA ---
# Raw lines of the MAJOLICA section as a one-column ('Data') DataFrame, built by parse_catalog
majolica_df = section_data['MAJOLICA']
# print(majolica_df)

In [83]:
#STEP 1 (MAJOLICA)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
majolica_df_filtered = majolica_df[~majolica_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Majolica DataFrame after filtering out combination types:")
# print(majolica_df_filtered)

In [176]:
#STEP 2 (MAJOLICA)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "majolica".
majolica_split_data = split_by_id(majolica_df_filtered, object_type="majolica")

# # Display the split data
# for id_number, object_type, metadata in majolica_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [164]:
#STEP 3 (MAJOLICA)
# parse_metadata is defined in the EARTHENWARE Step 3 cell and reused here

# Split the filtered DataFrame by ID number (same call as in Step 2)
majolica_split_data = split_by_id(majolica_df_filtered, object_type="majolica")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in majolica_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 6D3
type: majolica
metadata:
name: Pitcher
maker: None
materials: 
renderer: Della Button
owner: None
code: No class #

ID: 6D4
type: majolica
metadata:
name: Apple butter jar
maker: before 1820 Lebanon, Pennsylvania
materials: 
renderer: Harry Mann Waddell
owner: (1937) Dora B. Talaferro, San Diego, Ca
code: So Cal-Cer-15a

ID: 6D5
type: majolica
metadata:
name: Dog pitcher
maker: Griffin, Smith and Hill, c. 1860 Phoenixville, Pennsylvania
materials: 
renderer: Ernest A. Towers
owner: (1938) Theodore H. Buckalew, Wilmington, Del
code: Del-Cer-23

ID: 6D6
type: majolica
metadata:
name: Pitcher
maker: Griffin, Smith and Hill c. 1880-90 Phoenixville, Pennsylvania
materials: 
renderer: Ernest A. Towers
owner: (1938) Theodore H. Buckalew, Wilmington, Del
code: Del-Cer-24

ID: 6D7
type: majolica
metadata:
name: Pitcher
maker: Griffin, Smith and Hill c. 1880-90 Phoenixville, Pennsylvania
materials: 
renderer: Amos Brinton
owner: (1938) Mrs T.H. Buckalew, Wilmington, Del
code: Del-Cer-26


**--------- LUSTERWARE ---------**

In [25]:
# --- LUSTERWARE ---
# Raw lines of the LUSTERWARE section as a one-column ('Data') DataFrame, built by parse_catalog
lusterware_df = section_data['LUSTERWARE']
# print(lusterware_df)

In [86]:
#STEP 1 (LUSTERWARE)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
lusterware_df_filtered = lusterware_df[~lusterware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Lusterware DataFrame after filtering out combination types:")
# print(lusterware_df_filtered)

In [140]:
#STEP 2 (LUSTERWARE)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "lusterware".
lusterware_split_data = split_by_id(lusterware_df_filtered, object_type="lusterware")

# # Display the split data
# for id_number, object_type, metadata in lusterware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [165]:
#STEP 3 (LUSTERWARE)
# parse_metadata is defined in the EARTHENWARE Step 3 cell and reused here

# Split the filtered DataFrame by ID number (same call as in Step 2)
lusterware_split_data = split_by_id(lusterware_df_filtered, object_type="lusterware")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in lusterware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 6E2
type: lusterware
metadata:
name: Plates
maker: Benjamin Tucker 1832-38 Philadelphia, Pennsylvania
materials: 
renderer: Cora Parker
owner: (1938) Cora Parker, Coral Gables, Fla
code: Fla-Cer-46

ID: 6E3
type: lusterware
metadata:
name: Mug
maker: 1848 New England
materials: 
renderer: Robert Scheurer
owner: (1937) Mr and Mrs W.H. Richardson, Jersey City, NJ
code: NJ-Cer-9

ID: 6E4
type: lusterware
metadata:
name: Bowl
maker: c. 1800
materials: 
renderer: Henry Marsh
owner: None
code: NJ-Cer-34

ID: 6E5
type: lusterware
metadata:
name: Pitcher
maker: c. 1850-60
materials: Copper luster 
renderer: Samuel O. Klein
owner: (1936) Mrs Gladys Segar, Montclair, NJ
code: NJ-Cer-47

ID: 6E6
type: lusterware
metadata:
name: Creamer
maker: None
materials: 
renderer: Arthur Wegg
owner: (1937) Mr and Mrs W.R. Richardson, Jersey City, NJ
code: NJ-Cer-133

ID: 6E7
type: lusterware
metadata:
name: Plate
maker: Chelsea Ceramic Art Works 1872-89 Chelsea, Massachusetts
materials: 
renderer: J. How

**--------- STONEWARE ---------**

In [29]:
# --- STONEWARE ---
# Raw lines of the STONEWARE section as a one-column ('Data') DataFrame, built by parse_catalog
stoneware_df = section_data['STONEWARE']
# print(stoneware_df)

In [89]:
#STEP 1 (STONEWARE)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
stoneware_df_filtered = stoneware_df[~stoneware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Stoneware DataFrame after filtering out combination types:")
# print(stoneware_df_filtered)

In [166]:
#STEP 2 (STONEWARE)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "stoneware".
stoneware_split_data = split_by_id(stoneware_df_filtered, object_type="stoneware")

# # Display the split data
# for id_number, object_type, metadata in stoneware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [167]:
#STEP 3 (STONEWARE)
# parse_metadata is defined in the EARTHENWARE Step 3 cell and reused here

# Split the filtered DataFrame by ID number (same call as in Step 2)
stoneware_split_data = split_by_id(stoneware_df_filtered, object_type="stoneware")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in stoneware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 7A6
type: stoneware
metadata:
name: Two handled preserve crock
maker: Swan and States 1800-30 Stonington, Connecticut
materials: 
renderer: Jerome Hoxie
owner: (1936) Dr C.P. Williams, Stonington, Ct
code: Conn-Cer-St-11

ID: 9C3
type: stoneware
metadata:
name: Jar: detail
maker: possibly William A. Macquoid and Company 1864-76 New York City, New York State
materials: 
renderer: Yolande Delasser
owner: (1937) Israel Putnam, The Cobweb Shop, Brooklyn, NY
code: NYC-Cer-St-l00d

ID: 9D7
type: stoneware
metadata:
name: Crock
maker: C. Crolius c. 1800 Manhattan Wells, New York State
materials: 
renderer: Yolande Delasser
owner: (1936) Elie Nadelman, Museum of Folk Arts, New York City, NY
code: NYC-Cer-St-174d

ID: 10A4
type: stoneware
metadata:
name: Jar
maker: C. Crolius 1815-48 New York City, New York State
materials: 
renderer: John Tarantino
owner: (1940) Walter H. Powers, New York City, NY
code: NYC-Cer-St-300

ID: 10A5
type: stoneware
metadata:
name: Jar
maker: C. Crolius 1799 New

**--------- IRONSTONE ---------**

In [33]:
# --- IRONSTONE ---
# Raw lines of the IRONSTONE section as a one-column ('Data') DataFrame, built by parse_catalog
ironstone_df = section_data['IRONSTONE']
# print(ironstone_df)

In [92]:
#STEP 1 (IRONSTONE)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
ironstone_df_filtered = ironstone_df[~ironstone_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Ironstone DataFrame after filtering out combination types:")
# print(ironstone_df_filtered)

In [168]:
#STEP 2 (IRONSTONE)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "ironstone".
ironstone_split_data = split_by_id(ironstone_df_filtered, object_type="ironstone")

# # Display the split data
# for id_number, object_type, metadata in ironstone_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [169]:
#STEP 3 (IRONSTONE)
# parse_metadata is defined in the EARTHENWARE Step 3 cell and reused here

# Split the filtered DataFrame by ID number (same call as in Step 2)
ironstone_split_data = split_by_id(ironstone_df_filtered, object_type="ironstone")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in ironstone_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 16A7
type: ironstone
metadata:
name: Platter
maker: Glascow Pottery c. 1875 Trenton, New Jersey
materials: 
renderer: Joseph Sudek
owner: (1937) Passaic County Historical Society, Paterson, NJ
code: NJ-Cer-72

ID: 16A8
type: ironstone
metadata:
name: Platter
maker: Crescent Pottery 19th c Trenton, New Jersey
materials: 
renderer: Joseph Sudek
owner: (1937) Joseph Sudek, East Paterson, NJ
code: NJ-Cer-82

ID: 16A9
type: ironstone
metadata:
name: Pitcher
maker: Ott and Brewer 19th c Trenton, New Jersey
materials: 
renderer: Joseph Sudek
owner: (1937) Joseph Sudek, East Paterson, NJ
code: NJ-Ccr-87

ID: 16A10
type: ironstone
metadata:
name: Soup tureen
maker: Willets Manufacturing Company 19th c Trenton, New Jersey
materials: 
renderer: Joseph Sudek
owner: (1937) Joseph Sudek, East Paterson, NJ
code: NJ-Ccr-89

ID: 16B6
type: ironstone
metadata:
name: Pitcher
maker: Empire Pottery c. 1884 Trenton, New Jersey
materials: 
renderer: Roberta Spicer
owner: (1939) James McCreery, Brooklyn, 

**--------- PORCELAIN ---------**

In [37]:
# --- PORCELAIN ---
# Raw lines of the PORCELAIN section as a one-column ('Data') DataFrame, built by parse_catalog
porcelain_df = section_data['PORCELAIN']
# print(porcelain_df)

In [95]:
#STEP 1 (PORCELAIN)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
porcelain_df_filtered = porcelain_df[~porcelain_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Porcelain DataFrame after filtering out combination types:")
# print(porcelain_df_filtered)

In [170]:
#STEP 2 (PORCELAIN)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "porcelain".
porcelain_split_data = split_by_id(porcelain_df_filtered, object_type="porcelain")

# # Display the split data
# for id_number, object_type, metadata in porcelain_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [171]:
#STEP 3 (PORCELAIN)
# parse_metadata is defined in the EARTHENWARE Step 3 cell and reused here

# Split the filtered DataFrame by ID number (same call as in Step 2)
porcelain_split_data = split_by_id(porcelain_df_filtered, object_type="porcelain")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in porcelain_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 16C11
type: porcelain
metadata:
name: Pitcher
maker: Felix Hodt 1793 Connecticut or Rhode Island
materials: 
renderer: Barnes
owner: (1936) Ethnology Museum, Washington, DC
code: DC-Cer-12a

ID: 16D6
type: porcelain
metadata:
name: Mug
maker: c. 1889 probably New Jersey
materials: 
renderer: Samuel O. Klein
owner: (1936) Florence Stevenson, East Orange, NJ
code: NJ-Cer-36

ID: 16D8
type: porcelain
metadata:
name: Pitcher
maker: Bridgwood and Son 1892 Trenton, New Jersey
materials: 
renderer: Joseph Sudek
owner: (1936) Passaic County Historical Society, Paterson, NJ
code: NJ-Cer-54

ID: 17A5
type: porcelain
metadata:
name: Vase
maker: c, 1870 New York City, New York State
materials: 
renderer: Josephine Lindley
owner: (1937) Mrs Austin E. Allen, West Palm Beach, Fla
code: Fla-Cer-l

ID: 17A12
type: porcelain
metadata:
name: Tie back
maker: William H. Bloor, 1860 East Liverpool, Ohio
materials: 
renderer: Richard Barnett
owner: (1938) American Ceramic Society, Columbus Ohio
code: Ohi

**--------- TILES AND PLAQUES ---------**

In [41]:
# --- TILES AND PLAQUES ---
# Raw lines of the TILES AND PLAQUES section as a one-column ('Data') DataFrame, built by parse_catalog
tilesandplaques_df = section_data['TILES AND PLAQUES']
# print(tilesandplaques_df)

In [98]:
#STEP 1 (TILES AND PLAQUES)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
tilesandplaques_df_filtered = tilesandplaques_df[~tilesandplaques_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Tiles and Plaques DataFrame after filtering out combination types:")
# print(tilesandplaques_df_filtered)

Note: testing due to "NYC-no class #" bug

In [43]:
# print(tilesandplaques_df_filtered.head(15))

In [172]:
#STEP 2 (TILES AND PLAQUES)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "tilesandplaques".
tilesandplaques_split_data = split_by_id(tilesandplaques_df_filtered, object_type="tilesandplaques")

# # Display the split data
# for id_number, object_type, metadata in tilesandplaques_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [173]:
#STEP 3 (TILES AND PLAQUES)
# parse_metadata is defined in the EARTHENWARE Step 3 cell and reused here

# Split the filtered DataFrame by ID number (same call as in Step 2)
tilesandplaques_split_data = split_by_id(tilesandplaques_df_filtered, object_type="tilesandplaques")

# Display the split data with parsed metadata: one block per object, with the
# materials list printed on a single line and every other field as "key: value"
for id_number, object_type, metadata in tilesandplaques_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 18E2
type: tilesandplaques
metadata:
name: Tile: view of New Amsterdam
maker: 1636
materials: 
renderer: A. Zimet
owner: New York Historical Society, New York City, NY
code: NYC-no class #

ID: 18E3
type: tilesandplaques
metadata:
name: Ornament
maker: c. 1860 Connecticut
materials: Pottery 
renderer: Dana Bartlett
owner: (1937) A.B. Wheeler, Los Angeles, Ca
code: So Cal-Cer-4

ID: 18E4
type: tilesandplaques
metadata:
name: Tile drain
maker: Kohler Pottery 1868-1915 Pensacola. Florida
materials: 
renderer: Annie B. Johnston
owner: (1937) Florida State Museum. Gainesville. Fla
code: Fla-Ccr-37

ID: 18E5
type: tilesandplaques
metadata:
name: Floor tile
maker: M.A.W. and Company and Brosley, Salop 1756 (from The Isaac Drake House Washington Headquarters)
materials: 
renderer: W.VI- Jennings
owner: (1936) Original Drake House Washington, DC
code: NJ-Cer-3

ID: 18E6
type: tilesandplaques
metadata:
name: Tiles
maker: Zoar Pottery 1818-98 Zoar, Ohio
materials: 
renderer: Angelo Bulone
own

**--------- CERAMIC FIGURES ---------**

In [46]:
# --- CERAMIC FIGURES ---
# Raw lines of the CERAMIC FIGURES section as a one-column ('Data') DataFrame, built by parse_catalog
ceramicfigures_df = section_data['CERAMIC FIGURES']
# print(ceramicfigures_df)

In [101]:
#STEP 1 (CERAMIC FIGURES)
# Drop the extra microfiche-location lines (e.g. 1-D-8 or "1-D-8"): has_combination
# flags them, and ~ inverts the boolean mask so every other row is kept.
ceramicfigures_df_filtered = ceramicfigures_df[~ceramicfigures_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Ceramic Figures DataFrame after filtering out combination types:")
# print(ceramicfigures_df_filtered)

In [174]:
#STEP 2 (CERAMIC FIGURES)
# split_by_id is defined in the EARTHENWARE Step 2 cell and reused here

# Split the filtered DataFrame by ID number; each result is an
# (id, object_type, metadata_lines) tuple tagged with "ceramicfigures".
ceramicfigures_split_data = split_by_id(ceramicfigures_df_filtered, object_type="ceramicfigures")

# # Display the split data
# for id_number, object_type, metadata in ceramicfigures_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [175]:
#STEP 3 (CERAMIC FIGURES)
# Parse each record's metadata into key-value pairs and print it.

# Same split as STEP 2, repeated in this cell.
ceramicfigures_split_data = split_by_id(ceramicfigures_df_filtered, object_type="ceramicfigures")

# One printed record per ID: header lines, then one "key: value" line per
# parsed metadata field, then a blank separator line.
for id_number, object_type, metadata in ceramicfigures_split_data:
    print(f"ID: {id_number}", f"type: {object_type}", "metadata:", sep="\n")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key != 'materials':
            print(f"{key}: {value}")
            continue
        # 'materials' is iterated item by item: all items go on a single
        # line, each one followed by a space.
        print(f"{key}: " + "".join(f"{item} " for item in value))
    print()

ID: 19A3
type: ceramicfigures
metadata:
name: Lion
maker: John Sanders, Connecticut dated on back: 1817 or 1877
materials: 
renderer: John Matulis
owner: (1937) Wadsworth Atheneum, Hartford, Ct
code: Conn-Cer-38

ID: 19A4
type: ceramicfigures
metadata:
name: Potpourri jar
maker: Winkle Terra Cotta Company c. 1890 St Louis, Mo
materials: lll-Cer-28 
renderer: Adolph Opstad
owner: (1941) Paul Joseph, Chicago, 111
code: None

ID: 19A6
type: ceramicfigures
metadata:
name: Statuette
maker: 1899 Bedford, Ohio
materials: 
renderer: Ralph Atkinson
owner: (1937) Mrs D. Zeber, Bellevue, Pa
code: Pa-Cer-282

ID: 19A9
type: ceramicfigures
metadata:
name: Squirrel statuette
maker: early 19th c Pennsylvania
materials: 
renderer: Yolande Delasser
owner: (1937) Elie Nadelman, Museum of Folk Arts, New York City, NY
code: NYC-Cer-no class #

ID: 19A11
type: ceramicfigures
metadata:
name: Statuette
maker: 19 th c Pennsylvania
materials: 
renderer: Frank Fumagalli
owner: (1937) Elie Nadelman Collection, N

**--------- CHALKWARE ---------**

In [50]:
# --- CHALKWARE ---
# Look up the 'CHALKWARE' entry of section_data. STEP 1 below indexes its
# 'Data' column, so the entry is presumably a pandas DataFrame
# (section_data is built earlier in the notebook -- confirm there).
chalkware_df = section_data['CHALKWARE']
# print(chalkware_df)  # uncomment to inspect the raw section

In [104]:
#STEP 1 (CHALKWARE)
# Flag every row whose 'Data' value has_combination() reports as a
# combination-type hyphenated alphanumeric, then keep only the unflagged rows.
_chalkware_combination_mask = chalkware_df['Data'].apply(has_combination)
chalkware_df_filtered = chalkware_df[~_chalkware_combination_mask]

# # Display the filtered DataFrame
# print("Chalkware DataFrame after filtering out combination types:")
# print(chalkware_df_filtered)

In [177]:
#STEP 2 (CHALKWARE)
# Group the filtered rows by object ID using split_by_id (defined earlier in
# the notebook, not in this cell).

# Split the filtered DataFrame by ID number; every record is tagged with the
# object type "chalkware". The result is iterated below as
# (id_number, object_type, metadata) tuples.
chalkware_split_data = split_by_id(chalkware_df_filtered, object_type="chalkware")

# Optional debug view: uncomment to print the raw (unparsed) metadata lines
# of every record.
# # Display the split data
# for id_number, object_type, metadata in chalkware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [178]:
#STEP 3 (CHALKWARE)
# Parse each record's metadata into key-value pairs and print it.

# Same split as STEP 2, repeated in this cell.
chalkware_split_data = split_by_id(chalkware_df_filtered, object_type="chalkware")

# One printed record per ID: header lines, then one "key: value" line per
# parsed metadata field, then a blank separator line.
for id_number, object_type, metadata in chalkware_split_data:
    print(f"ID: {id_number}", f"type: {object_type}", "metadata:", sep="\n")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key != 'materials':
            print(f"{key}: {value}")
            continue
        # 'materials' is iterated item by item: all items go on a single
        # line, each one followed by a space.
        print(f"{key}: " + "".join(f"{item} " for item in value))
    print()

ID: 19C10
type: chalkware
metadata:
name: Dove: plaster-of-Paris bank
maker: c. 1860 New Jersey Plaster-of-Paris cast in the round; painted
materials: 
renderer: Elisabeth Fulda
owner: (1937) Bruce Buttfield, New York City, NY
code: NYC-Mscl-Vic-1

ID: 19C12
type: chalkware
metadata:
name: Pigeon bank
maker: 19th c Pennsylvania
materials: 
renderer: Emanuel Jacobson
owner: (1938) Mrs B.F. Ramsdell, Geneva, Ill
code: Ill-Ca-39

ID: 19D1
type: chalkware
metadata:
name: Deer figurine
maker: early 19th c Pennsylvania
materials: 
renderer: Milton Bevier
owner: (1939) Mrs James Cady Ewell, Highland Park, Ill
code: Ill-Ca-47

ID: 19D3
type: chalkware
metadata:
name: Portrait bust
maker: 19th c Pennsylvania
materials: 
renderer: Mina Lowry
owner: (1936) Mrs J.D. Rockefeller, New York City, NY
code: NYC-Mscl-Ch-7

ID: 19D7
type: chalkware
metadata:
name: Figurine: ‘Queen Victoria, Empress of India’
maker: 19th c Pennsylvania
materials: 
renderer: Mina Lowry
owner: (1936) American Folk Art Galle

# Follow-up
Exporting the "JSON-like" split data (one list of records per data frame) into a single JSON file.

In [199]:
import json

def export_multiple_to_json(split_data_list, filename):
    """
    Flatten several split data sets into one JSON array and write it to disk.

    Each (ID, object_type, metadata) tuple becomes one object of the form
    {"ID": ..., "type": ..., "metadata": parse_metadata(metadata)}. Objects are
    emitted in input order: data set by data set, tuple by tuple.

    Args:
        split_data_list (list of lists): List of split data sets.
                                         Each split data set is a list of tuples
                                         of the form (ID, object_type, metadata).
        filename (str): Name of the JSON file to save the data.
    """
    # Build the complete record list first; the output file is only opened
    # once every tuple has been parsed.
    records = [
        {"ID": id_number, "type": object_type, "metadata": parse_metadata(metadata)}
        for split_data in split_data_list
        for id_number, object_type, metadata in split_data
    ]

    with open(filename, 'w') as json_file:
        json.dump(records, json_file, indent=4)

In [200]:
# Write every section's split data into one file, combined_data.json.
# Records appear in the file in the order the sections are listed here.
export_multiple_to_json([earthenware_split_data, 
                        majolica_split_data, 
                        lusterware_split_data, 
                        stoneware_split_data, 
                        ironstone_split_data, 
                        porcelain_split_data, 
                        tilesandplaques_split_data, 
                        ceramicfigures_split_data, 
                        chalkware_split_data,],
    'combined_data.json'
)

In [201]:
import json

def count_items_in_json(filename):
    """
    Count the number of top-level items in a JSON file.

    The file is opened in binary mode so that ``json.load`` detects the
    encoding itself (UTF-8, UTF-16 or UTF-32, with or without a byte-order
    mark) instead of decoding with the platform's default text encoding.

    Args:
        filename (str): Name of the JSON file.

    Returns:
        int: ``len()`` of the decoded top-level value: the number of elements
            for a JSON array, the number of keys for a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON (``json.JSONDecodeError``)
            or cannot be decoded (``UnicodeDecodeError``).
        TypeError: If the top-level value has no length (e.g. a bare number).
    """
    # Binary mode: json.load accepts bytes and picks the right Unicode codec.
    with open(filename, 'rb') as json_file:
        json_data = json.load(json_file)

    return len(json_data)

# Report how many records the combined export contains
count = count_items_in_json('combined_data.json')
print(f"Total number of items in the JSON file: {count}")


Total number of items in the JSON file: 88
