This script takes a plain text metadata file in FGDC format and extracts each attribute's label, definition, and definition source into a CSV file. It is useful only for FGDC records that were shared as plain text (a practice common mainly from the 1990s to the early 2000s).

In [16]:
import re
import csv

# Input: the FGDC metadata record, saved as plain text
text_file_path = 'Sand_Gravel_Pits_Abandoned_IN.txt'

# Output: where the extracted attribute table will be written
csv_file_path = 'output.csv'

# Load the whole record into memory as a single string
with open(text_file_path, mode='r', encoding='utf-8') as fgdc_file:
    text_data = fgdc_file.read()

# Guard used inside the Label/Definition captures: a line holding nothing but the
# "Attribute:" tag opens the next section, so a capture must never run past it.
# Without the guard, a section that lacks one of its elements lets the lazy
# DOTALL capture swallow the following section, merging two attributes into one
# bogus match and silently dropping the incomplete one.
section_break = r'(?!^[ \t]*Attribute:[ \t]*$)'

# Shape of a line that begins with an element tag: "Some_Element:" followed by
# whitespace or the end of the line.
tag_line = r'[ \t]*[A-Za-z_]+:(?:[ \t]|$)'

# Regular expression pattern to match each Attribute section.
#   group 1 - Attribute_Label
#   group 2 - Attribute_Definition (may span several lines)
#   group 3 - Attribute_Definition_Source (a single line; '' when the element is empty)
attribute_pattern = re.compile(
    r'Attribute:\s*'
    rf'Attribute_Label:\s*((?:{section_break}.)*?)\s*'
    rf'Attribute_Definition:\s*((?:{section_break}.)*?)\s*'
    r'Attribute_Definition_Source:'
    # Step onto the following line only when that line is not itself an element
    # tag; otherwise an empty source would capture e.g. "Attribute_Domain_Values:".
    rf'(?:\s*\n(?!{tag_line}))?[ \t]*'
    r'([^\n\r]*)',  # Capture everything up to the next newline
    # DOTALL lets '.' cross line breaks; MULTILINE makes ^ and $ in the guards
    # above match at line boundaries.
    re.DOTALL | re.MULTILINE
)

# Scan the whole record; every hit yields one (label, definition, source) tuple
matches = [hit.groups() for hit in attribute_pattern.finditer(text_data)]

# Debugging: report how many Attribute sections the pattern matched
print(f'\nTotal Attribute sections found: {len(matches)}')


Total Attribute sections found: 45


In [17]:
# Rows destined for the CSV: one [label, definition, source] list per match
data = []

# Loop through each match and extract the needed values
for number, match in enumerate(matches, start=1):
    # A captured value can span several lines of the source text. Collapse every
    # run of whitespace (line breaks and indentation included) into one space so
    # each CSV cell holds a single clean line; this also trims both ends.
    label, definition, source = (' '.join(field.split()) for field in match)

    # Print debugging information about the source field
    print(f'\nMatch #{number}:')
    print(f'Label: {label}')
    print(f'Definition: {definition}')
    print(f'Source: {source} (Captured Source)')

    # Append the extracted values to the list
    data.append([label, definition, source])

# Write the extracted data to a CSV file
# (newline='' leaves line-ending handling to the csv module)
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header
    csv_writer.writerow(['Attribute_Label', 'Attribute_Definition', 'Attribute_Definition_Source'])
    # Write the data rows
    csv_writer.writerows(data)

print(f'\nExtraction complete. Data saved to {csv_file_path}')


Match #1:
Label: Count
Definition: Auto number assigned by Access
Source: IGS Memorandum Report 98 (Captured Source)

Match #2:
Label: Sgnum
Definition: Database number
Source: IGS Memorandum Report 98 (Captured Source)

Match #3:
Label: Qtr
Definition: Quartering of section
Source: IGS Memorandum Report 98 (Captured Source)

Match #4:
Label: Qtr3
Definition: Smallest quarter of section
Source: IGS, Paul Irwin (Captured Source)

Match #5:
Label: Qtr2
Definition: Intermediate quarter of section
Source: IGS, Paul Irwin (Captured Source)

Match #6:
Label: Qtr1
Definition: Largest quarter of section
Source: IGS, Paul Irwin (Captured Source)

Match #7:
Label: Cnty
Definition: County
Source: IGS Memorandum Report 98 (Captured Source)

Match #8:
Label: Quad
Definition: Name of USGS 7.5-minute quadrangle
Source: IGS Memorandum Report 98 (Captured Source)

Match #9:
Label: Date_
Definition: Date data was recorded
Source: IGS Memorandum Report 98 (Captured Source)

Match #10:
Label: Styp
Defini