# Exporting CeramSample to JSON
Note: amending steps from CeramSample analysis to export into JSON for data visualization testing

Exported JSONs:

chalkware_parsed_metadata.json (testing a data frame)

combined_data.json (all data frames into one json)

## Pre-Processing Step
Note: grabbed list of Catalog's state_abbr from previous python data-cleaning set (renderer state index)

In [1]:
# Dictionary mapping custom state abbreviations to state names
state_abbr_dict = {
    "Conn": "Connecticut",
    "Ct": "Connecticut",
    "Me": "Maine",
    "Ma": "Massachusetts",
    "Mass": "Massachusetts",
    "NH": "New Hampshire",
    "RI": "Rhode Island",
    "Vt": "Vermont",
    "NJ": "New Jersey",
    "NYS": "New York State",
    "NYC": "New York City",
    "Pa": "Pennsylvania",
    "IN": "Indiana",
    "Ill": "Illinois",
    "Mich": "Michigan",
    "Ohio": "Ohio",
    "Wis": "Wisconsin",
    "Iowa": "Iowa",
    "Ka": "Kansas",
    "Minn": "Minnesota",
    "Mn": "Minnesota",
    "Mo": "Missouri",
    "NE": "Nebraska",
    "ND": "North Dakota",
    "SD": "South Dakota",
    "Del": "Delaware",
    "DC": "District of Columbia",
    "Fla": "Florida",
    "Ga": "Georgia",
    "Md": "Maryland",
    "NC": "North Carolina",
    "SC": "South Carolina",
    "Va": "Virginia",
    "WV": "West Virginia",
    "Ala": "Alabama",
    "Ky": "Kentucky",
    "MS": "Mississippi",
    "Tenn": "Tennessee",
    "AR": "Arkansas",
    "La": "Louisiana",
    "OK": "Oklahoma",
    "Tex": "Texas",
    "Ariz": "Arizona",
    "Col": "Colorado",
    "Colo": "Colorado",
    "ID": "Idaho",
    "NM": "New Mexico",
    "MT": "Montana",
    "Utah": "Utah",
    "NV": "Nevada",
    "WY": "Wyoming",
    "AK": "Alaska",
    "Ca": "California",
    "Cal": "California",
    "So Cal": "Southern California",
    "HI": "Hawaii",
    "OR": "Oregon",
    "Wash": "Washington"
}

In [2]:
# # Example usage:
# state_abbr = "NYC"
# state_name = state_abbr_dict.get(state_abbr)
# print(state_name)  # Output: New York City

### define _state_abbreviations_

In [3]:
# Extract state abbreviations from the dictionary
state_abbreviations = list(state_abbr_dict.keys())

# # Example usage:
# print(state_abbreviations)

## Divide Sample into Sections
Note: valid_sections were capitalized in txt file

In [4]:
import pandas as pd

# Function to parse the text and extract section headers
def parse_catalog(text):
    # Initialize variables
    section_data = {}
    current_section = None
    current_section_data = []
    
    # Define the list of valid section headers
    valid_sections = ['EARTHENWARE', 'MAJOLICA', 'LUSTERWARE', 'STONEWARE', 'IRONSTONE', 'PORCELAIN', 'TILES AND PLAQUES', 'CERAMIC FIGURES', 'CHALKWARE']

    # Split the text into lines
    lines = text.strip().split('\n')

    # Iterate over each line
    for line in lines:
        line = line.strip()
        
        # Check if the line is a section header (uppercase) and is in the valid_sections array
        if line.isupper() and line in valid_sections:
            if current_section:
                # Convert the current_section_data to a DataFrame and store it
                section_data[current_section] = pd.DataFrame(current_section_data, columns=['Data'])
                current_section_data = []  # Reset current_section_data for the new section
            current_section = line
        elif current_section:
            # Append the line to the current_section_data
            current_section_data.append(line)

    # Convert the data of the last section to a DataFrame and store it
    if current_section:
        section_data[current_section] = pd.DataFrame(current_section_data, columns=['Data'])

    return section_data

# Read the contents of the text file
with open('CatCeram_sample.txt', 'r') as file:
    text = file.read()

# Parse the catalog data
section_data = parse_catalog(text)

# # Display the DataFrames for each section
# for section, df in section_data.items():
#     print("Section:", section)
#     print(df)
#     print()

In [5]:
# section_data.items()

## Pre-Check Functions

In [6]:
# # Determine if line contains a state abbr
# def has_state_abbreviation(text):
#     # Split the hyphenated alphanumeric by hyphen and check if the first part is in state_abbreviations
#     parts = text.split('-')
#     if parts[0] in state_abbreviations:
#         return True
#     return False

In [7]:
# # Example usage:
# hyphenated_alphanumeric = "NYC-123"
# if has_state_abbreviation(hyphenated_alphanumeric):
#     print("The hyphenated alphanumeric contains a state abbreviation.")
# else:
#     print("The hyphenated alphanumeric does not contain a state abbreviation.")

### Check for lines with hyphenation
#### 1. Accession Numbers
Note: determining if the line contains a state_abbreviation helps differentiate the Catalog's accession number from the microfiche (in this case, copied into the txt as an excess line to track random samples)
#### 2. Date Ranges
#### 3. Microfiche Location Numbers
Note: need this function to check again object names with 3 characters, i.e., "Jar"

In [8]:
# Function to check if a hyphenated alphanumeric contains a state abbreviation
def has_state_abbreviation(text):
    # Convert text to uppercase for case-insensitive comparison
    upper_text = text.upper()
    
    # Check if the text contains any of the state abbreviations
    for abbreviation in state_abbreviations:
        if abbreviation.upper() in upper_text:
            return True
    
    return False

# Function to check if a hyphenated alphanumeric contains a date range
def has_date_range(text):
    parts = text.split('-')
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return True
    return False

# Function to check if a hyphenated alphanumeric contains a combination of numbers and letters
def has_combination(line):
    """
    Check if the line matches the combination type pattern.

    Parameters:
        line (str): The line of text to check.

    Returns:
        bool: True if the line matches the combination type pattern, False otherwise.
    """
    # Remove quotation marks if they exist
    line = line.strip('"')
    parts = line.split('-')
    if len(parts) == 3:
        part1, part2, part3 = parts
        if part1.isdigit() and 1 <= int(part1) <= 20 and part2.isalpha() and len(part2) == 1 and 'A' <= part2 <= 'E' and part3.isdigit() and 1 <= int(part3) <= 12:
            return True
    return False

In [9]:
# # EXAMPLE: Iterate through the Earthenware DataFrame
# for index, row in earthenware_df.iterrows():
#     line = row['Data']
#     if '-' in line:
#         if has_state_abbreviation(line):
#             # Process state abbreviation type
#             print("State Abbreviation Type:", line)
#         elif has_date_range(line):
#             # Process date range type
#             print("Date Range Type:", line)
#         elif has_combination(line):
#             # Process combination type
#             print("Combination Type:", line)
#     else:
#         # Process other types of lines
#         print("Other Type:", line)

## Main Functions
Note: each section was made into a data frame to test and refine overall functions (helped determine nuances on smaller sets of data)

**[Next step: test main functions on section_data]**

**--------- EARTHENWARE ---------**

In [10]:
# --- EARTHENWARE ---
earthenware_df = section_data['EARTHENWARE']
# print(earthenware_df)

### Step 1: use _has_combination_ function
Note: this weeds out the extra microfiche line (sometimes a hyphenated line 1-D-8, other times a hyphenated line surrounded by quotation marks "1-D-8")

In [11]:
#STEP 1 (EARTHENWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
earthenware_df_filtered = earthenware_df[~earthenware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Earthenware DataFrame after filtering out combination types:")
# print(earthenware_df_filtered)

### Step 2: split sections into individual objects
Note: ID (or clean microfiche location number) used to split section, lines remaining are bucketed as metadata; addition: type assigned)

In [12]:
#STEP 2 (EARTHENWARE)
# Function to split the DataFrame by ID number
def split_by_id(df, object_type="new_type"):
    # Initialize variables
    split_data = []
    current_id = None
    current_metadata = []

    # Iterate through the DataFrame
    for index, row in df.iterrows():
        line = row['Data']
        words = line.split()
        if len(words) == 1 and words[0].isalnum() and 3 <= len(words[0]) <= 5 and any(char.isdigit() for char in words[0]):
            # If a valid ID is found, append the previous ID, type, and metadata to split_data
            if current_id is not None:
                split_data.append((current_id, object_type, current_metadata))  # Include 'object_type' here
            # Update current_id and reset current_metadata
            current_id = words[0].strip()
            current_metadata = []
        else:
            # Append the line to current_metadata
            current_metadata.append(line.strip())
    
    # Append the last ID, type, and metadata to split_data
    if current_id is not None:
        split_data.append((current_id, object_type, current_metadata))
    
    return split_data

# Example usage:
# Split the new filtered DataFrame by ID number with a different object_type
earthenware_split_data = split_by_id(earthenware_df_filtered, object_type="earthenware")

# Display the split data
for id_number, object_type, metadata in earthenware_split_data:
    print("ID:", id_number)
    print("Type:", object_type)
    print("Metadata:") 
    for line in metadata:
        print(line)
    print()

ID: 1D8
Type: earthenware
Metadata:
Flower pot
M: Kohler Pottery 1865-1915 Pensacola, Florida
Clay
R: Annie B. Johnston
O: (1937) Florida State Museum, Gainesville, Fla
Fla-Cer-34


ID: 2B12
Type: earthenware
Metadata:
Jar
M: Burr Frost 1847 Missouri
Clay pottery
R: Clyde L. Cheney
O: (1937) Judy Lund, Salt Lake City, Utah
Utah-Cer-3


ID: 3A6
Type: earthenware
Metadata:
Jar with cover
M: early 19th c Long Island, New York State
Glazed brown earthenware
R: Alvin Shiren
O: (1939) Metropolitan Museum of Art, New York City, NY
NYC-Cer-51


ID: 3E5
Type: earthenware
Metadata:
Jar
M: Pennsylvania
Earthenware
R: Yolande Delasser
O: (1938) Alfred B. Maclay, New York City, NY
NYC-Cer-68

ID: 4C12
Type: earthenware
Metadata:
Jar
M: John Eardley 186O’s St. George, Utah
Glazed clay
R: Clyde L. Cheney
O: Miss Fern Seegmiller, St. George, Utah
Utah-Cer-22

ID: 5A3
Type: earthenware
Metadata:
Pitcher
M: Bell late 18th c Strasburg, Virginia
Glazed earthenware
R: Hugh Ryan
O: (1940) Mrs Bess H. Muller

### Step 3: Parse metadata
Note: this step required the most fine tuning, especially for edge-cases like: accessions numbers that included "no class #" and if OCR pulled in semicolons instead of colons

**[Next step: will have to clean up accession numbers; maybe more successful after particular key-value pair is called]**

In [13]:
#STEP 3 (EARTHENWARE)
# Function to parse metadata and assign key-value pairs

def parse_metadata(metadata):
    parsed_data = {}
    name = None
    maker = None
    materials = []
    renderer = None
    owner = None
    code = None

    for line_index, line in enumerate(metadata):
        if name is None:
            name = line.strip()
        elif ':' in line or ';' in line:
            if ':' in line:
                separator = ':'
            elif ';' in line:
                separator = ';'

            key, value = line.split(separator, 1)
            key = key.strip()
            value = value.strip()
            if key == 'M':
                maker = value
            elif key == 'R':
                renderer = value
            elif key == 'O':
                owner = value
            else:
                parsed_data[key] = value
        elif line.strip():  # Check if the line is not empty
            # Check if the line fits the has_state_abbreviation type and "No class #"
            if has_state_abbreviation(line):
                code = line.strip()
                if not ("No class #" in line or "no class #" in line):
                    materials.append(line.strip())
#                     print("Added line to materials:", line.strip())  # Debugging statement
            else:
                # Add the line to materials if it doesn't match any of the conditions above
                materials.append(line.strip())
#                 print("Added line to materials:", line.strip())  # Debugging statement

    
    parsed_data['name'] = name
    parsed_data['maker'] = maker
    parsed_data['materials'] = materials
    parsed_data['renderer'] = renderer
    parsed_data['owner'] = owner
    parsed_data['code'] = code
    
    # Debugging print statements
    print("Parsed metadata:", parsed_data)
    
    return parsed_data


# Split the filtered DataFrame by ID number
earthenware_split_data = split_by_id(earthenware_df_filtered, object_type="earthenware")

# Display the split data with parsed metadata
for id_number, object_type, metadata in earthenware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
#     parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

ID: 1D8
type: earthenware
metadata:


NameError: name 'parsed_metadata' is not defined

### Repeat Steps 1-3 for each data frame

**--------- MAJOLICA ---------**

In [None]:
# --- MAJOLICA ---
majolica_df = section_data['MAJOLICA']
# print(majolica_df)

In [None]:
#STEP 1 (MAJOLICA)
# Filter out rows with hyphenated alphanumerics of the combination type
majolica_df_filtered = majolica_df[~majolica_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Majolica DataFrame after filtering out combination types:")
# print(majolica_df_filtered)

In [None]:
#STEP 2 (MAJOLICA)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
majolica_split_data = split_by_id(majolica_df_filtered, object_type="majolica")

# # Display the split data
# for id_number, object_type, metadata in majolica_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (MAJOLICA)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
majolica_split_data = split_by_id(majolica_df_filtered, object_type="majolica")

# Display the split data with parsed metadata
for id_number, object_type, metadata in majolica_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- LUSTERWARE ---------**

In [None]:
# --- LUSTERWARE ---
lusterware_df = section_data['LUSTERWARE']
# print(lusterware_df)

In [None]:
#STEP 1 (LUSTERWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
lusterware_df_filtered = lusterware_df[~lusterware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Lusterware DataFrame after filtering out combination types:")
# print(lusterware_df_filtered)

In [None]:
#STEP 2 (LUSTERWARE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
lusterware_split_data = split_by_id(lusterware_df_filtered, object_type="lusterware")

# # Display the split data
# for id_number, object_type, metadata in lusterware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (LUSTERWARE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
lusterware_split_data = split_by_id(lusterware_df_filtered, object_type="lusterware")

# Display the split data with parsed metadata
for id_number, object_type, metadata in lusterware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- STONEWARE ---------**

In [None]:
# --- STONEWARE ---
stoneware_df = section_data['STONEWARE']
# print(stoneware_df)

In [None]:
#STEP 1 (STONEWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
stoneware_df_filtered = stoneware_df[~stoneware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Stoneware DataFrame after filtering out combination types:")
# print(stoneware_df_filtered)

In [None]:
#STEP 2 (STONEWARE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
stoneware_split_data = split_by_id(stoneware_df_filtered, object_type="stoneware")

# # Display the split data
# for id_number, object_type, metadata in stoneware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (STONEWARE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
stoneware_split_data = split_by_id(stoneware_df_filtered, object_type="stoneware")

# Display the split data with parsed metadata
for id_number, object_type, metadata in stoneware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- IRONSTONE ---------**

In [None]:
# --- IRONSTONE ---
ironstone_df = section_data['IRONSTONE']
# print(ironstone_df)

In [None]:
#STEP 1 (IRONSTONE)
# Filter out rows with hyphenated alphanumerics of the combination type
ironstone_df_filtered = ironstone_df[~ironstone_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Ironstone DataFrame after filtering out combination types:")
# print(ironstone_df_filtered)

In [None]:
#STEP 2 (IRONSTONE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
ironstone_split_data = split_by_id(ironstone_df_filtered, object_type="ironstone")

# # Display the split data
# for id_number, object_type, metadata in ironstone_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (IRONSTONE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
ironstone_split_data = split_by_id(ironstone_df_filtered, object_type="ironstone")

# Display the split data with parsed metadata
for id_number, object_type, metadata in ironstone_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- PORCELAIN ---------**

In [None]:
# --- PORCELAIN ---
porcelain_df = section_data['PORCELAIN']
# print(porcelain_df)

In [None]:
#STEP 1 (PORCELAIN)
# Filter out rows with hyphenated alphanumerics of the combination type
porcelain_df_filtered = porcelain_df[~porcelain_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Porcelain DataFrame after filtering out combination types:")
# print(porcelain_df_filtered)

In [None]:
#STEP 2 (PORCELAIN)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
porcelain_split_data = split_by_id(porcelain_df_filtered, object_type="porcelain")

# # Display the split data
# for id_number, object_type, metadata in porcelain_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (PORCELAIN)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
porcelain_split_data = split_by_id(porcelain_df_filtered, object_type="porcelain")

# Display the split data with parsed metadata
for id_number, object_type, metadata in porcelain_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- TILES AND PLAQUES ---------**

In [None]:
# --- TILES AND PLAQUES ---
tilesandplaques_df = section_data['TILES AND PLAQUES']
# print(tilesandplaques_df)

In [None]:
#STEP 1 (TILES AND PLAQUES)
# Filter out rows with hyphenated alphanumerics of the combination type
tilesandplaques_df_filtered = tilesandplaques_df[~tilesandplaques_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Tiles and Plaques DataFrame after filtering out combination types:")
# print(tilesandplaques_df_filtered)

Note: testing due to "NYC-no class #" bug

In [None]:
# print(tilesandplaques_df_filtered.head(15))

In [None]:
#STEP 2 (TILES AND PLAQUES)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
tilesandplaques_split_data = split_by_id(tilesandplaques_df_filtered, object_type="tilesandplaques")

# # Display the split data
# for id_number, object_type, metadata in tilesandplaques_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (TILES AND PLAQUES)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
tilesandplaques_split_data = split_by_id(tilesandplaques_df_filtered, object_type="tilesandplaques")

# Display the split data with parsed metadata
for id_number, object_type, metadata in tilesandplaques_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- CERAMIC FIGURES ---------**

In [None]:
# --- CERAMIC FIGURES ---
ceramicfigures_df = section_data['CERAMIC FIGURES']
# print(ceramicfigures_df)

In [None]:
#STEP 1 (CERAMIC FIGURES)
# Filter out rows with hyphenated alphanumerics of the combination type
ceramicfigures_df_filtered = ceramicfigures_df[~ceramicfigures_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Ceramic Figures DataFrame after filtering out combination types:")
# print(ceramicfigures_df_filtered)

In [None]:
#STEP 2 (CERAMIC FIGURES)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
ceramicfigures_split_data = split_by_id(ceramicfigures_df_filtered, object_type="ceramicfigures")

# # Display the split data
# for id_number, object_type, metadata in ceramicfigures_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (CERAMIC FIGURES)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
ceramicfigures_split_data = split_by_id(ceramicfigures_df_filtered, object_type="ceramicfigures")

# Display the split data with parsed metadata
for id_number, object_type, metadata in ceramicfigures_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

**--------- CHALKWARE ---------**

In [None]:
# --- CHALKWARE ---
chalkware_df = section_data['CHALKWARE']
# print(chalkware_df)

In [None]:
#STEP 1 (CHALKWARE)
# Filter out rows with hyphenated alphanumerics of the combination type
chalkware_df_filtered = chalkware_df[~chalkware_df['Data'].apply(has_combination)]

# # Display the filtered DataFrame
# print("Chalkware DataFrame after filtering out combination types:")
# print(chalkware_df_filtered)

In [None]:
#STEP 2 (CHALKWARE)
# Function to split the DataFrame by ID number

# Split the filtered DataFrame by ID number
chalkware_split_data = split_by_id(chalkware_df_filtered, object_type="chalkware")

# # Display the split data
# for id_number, object_type, metadata in chalkware_split_data:
#     print("ID:", id_number)
#     print("Type:", object_type)
#     print("Metadata:") 
#     for line in metadata:
#         print(line)
#     print()

In [None]:
#STEP 3 (CHALKWARE)
# Function to parse metadata and assign key-value pairs

# Split the filtered DataFrame by ID number
chalkware_split_data = split_by_id(chalkware_df_filtered, object_type="chalkware")

# Display the split data with parsed metadata
for id_number, object_type, metadata in chalkware_split_data:
    print("ID:", id_number)
    print("type:", object_type)
    print("metadata:")
    parsed_metadata = parse_metadata(metadata)
    for key, value in parsed_metadata.items():
        if key == 'materials':
            print(f"{key}: ", end="")  # Print "materials:" without a newline
            for item in value:
                print(item, end=" ")  # Print each item on the same line
            print()  # Add a newline after printing all items
        else:
            print(f"{key}: {value}")
    print()

# Follow-up
Exporting "json-like" data (split data-frames) into a single JSON

In [None]:
import json

def export_multiple_to_json(split_data_list, filename):
    """
    Export multiple split data sets to a single JSON file.

    Args:
        split_data_list (list of lists): List of split data sets.
                                         Each split data set is a list of tuples containing the split data.
                                         Each tuple should contain (ID, object_type, metadata).
        filename (str): Name of the JSON file to save the data.
    """
    json_data = []

    for split_data in split_data_list:
        for id_number, object_type, metadata in split_data:
            parsed_metadata = parse_metadata(metadata)
            metadata_dict = {"ID": id_number, "type": object_type, "metadata": parsed_metadata}
            json_data.append(metadata_dict)

    with open(filename, 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

In [None]:
# Example usage:
export_multiple_to_json([earthenware_split_data, 
                        majolica_split_data, 
                        lusterware_split_data, 
                        stoneware_split_data, 
                        ironstone_split_data, 
                        porcelain_split_data, 
                        tilesandplaques_split_data, 
                        ceramicfigures_split_data, 
                        chalkware_split_data,],
    'combined_data.json'
)

In [None]:
import json

def count_items_in_json(filename):
    """
    Count the number of items in a JSON file.

    Args:
        filename (str): Name of the JSON file.

    Returns:
        int: Number of items in the JSON file.
    """
    with open(filename, 'r') as json_file:
        json_data = json.load(json_file)
    
    return len(json_data)

# Example usage:
count = count_items_in_json('combined_data.json')
print("Total number of items in the JSON file:", count)
