In [7]:
import os
import json
import csv
from datetime import datetime
import re

# Function to extract metadata from a JSON file
def extract_metadata(json_file):
    """Build one metadata row per entry in a JSON file's ``dataset`` list.

    The file is read as UTF-8 JSON. Every entry under the top-level
    ``'dataset'`` key is turned into a dict with the keys ``'Title'``,
    ``'Modified'``, ``'Landing'``, ``'Year'``, ``'Access Level'`` and
    ``'Description'``. Fields missing from an entry default to ``''``.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A list of row dicts, in the same order as the entries in the file.

    Raises:
        KeyError: If the parsed document has no ``'dataset'`` key.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        catalog = json.load(handle)

    def to_row(entry):
        # The title is needed twice: stored as-is and scanned for a year.
        heading = entry.get('title', '')
        return {
            'Title': heading,
            'Modified': extract_year(entry.get('modified', '')),
            'Landing': entry.get('landingPage', ''),
            'Year': extract_year_from_title(heading),
            'Access Level': entry.get('accessLevel', ''),
            'Description': entry.get('description', ''),
        }

    return [to_row(entry) for entry in catalog['dataset']]

# Function to extract year from ISO format datetime string
def extract_year(modified):
    """Return the year of an ISO-style timestamp string.

    Accepted shapes, tried in order:
    ``YYYY-MM-DDTHH:MM:SS.ffffffZ``, ``YYYY-MM-DDTHH:MM:SSZ`` and
    ``YYYY-MM-DD``.

    Args:
        modified: The timestamp text to parse.

    Returns:
        The year as an ``int``, or ``''`` when ``modified`` is not a string
        or matches none of the accepted shapes.
    """
    formats = (
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d',
    )
    for fmt in formats:
        try:
            return datetime.strptime(modified, fmt).year
        except (ValueError, TypeError):
            # ValueError: text does not match this format.
            # TypeError: value is not a string (e.g. None).
            # Anything else (KeyboardInterrupt, ...) is allowed to propagate.
            continue
    return ''

# Function to extract year from title using regex
def extract_year_from_title(title):
    """Pull a year out of a title string.

    A ``YYYY-YYYY`` range anywhere in the title takes priority, and its
    first year is returned. Otherwise the first four-digit run starting
    with 17, 18, 19 or 20 is returned.

    Args:
        title: The title text to scan.

    Returns:
        The year as a four-character string, or ``''`` when ``title`` is not
        a string or contains no year.
    """
    # The regex calls below only fail on non-string input, so check for that
    # explicitly instead of hiding it behind a bare ``except``.
    if not isinstance(title, str):
        return ''

    year_range = re.search(r'(\d{4})-(\d{4})', title)
    if year_range:
        return year_range.group(1)  # First year of the range

    single_year = re.search(r'(17\d{2}|18\d{2}|19\d{2}|20\d{2})', title)
    if single_year:
        return single_year.group(0)

    return ''  # No year found

# Path to the folder containing the JSON files
json_folder = 'ireland'

# List to store the metadata rows collected from every JSON file
all_metadata = []

# os.listdir() returns names in arbitrary order; sort them so the row order
# of the generated CSV is the same on every run.
for filename in sorted(os.listdir(json_folder)):
    if filename.endswith('.json'):
        json_path = os.path.join(json_folder, filename)
        metadata = extract_metadata(json_path)
        all_metadata.extend(metadata)

# Path to the output CSV file
csv_file = 'geohive-harvest.csv'

# Write metadata to CSV. newline='' lets the csv module control line endings.
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    # Column order of the CSV; the keys match those built by extract_metadata.
    fieldnames = ['Title', 'Description', 'Modified', 'Landing', 'Year', 'Access Level']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(all_metadata)

print("CSV file generated successfully!")


CSV file generated successfully!
