In [1]:
import pandas as pd
import json
import glob
import re

# --- Configuration -----------------------------------------------------------
# Column layout of the exported sheet; passed as `columns=` when the
# DataFrame is built, so it fixes both the selection and the order.
STANDARD_COLUMNS = [
    'REVIEW_STAGE',
    'Database',
    'DOI',
    'DOI_LINK',
    'Title',
    'Author',
    'Year',
    'Abstract',
]
# Placeholder written into a field when no value could be extracted for it.
NOT_FOUND_VALUE = 'NOT_FOUND'
# Review stage assigned to every freshly extracted record.
DEFAULT_REVIEW_STAGE = 'unchecked'

# Function to extract the year from publication_date
def extract_year(publication_date):
    """Return the first standalone four-digit number found in a date value.

    Args:
        publication_date: Date-like value. It is converted with ``str()``
            before searching, so non-string inputs are accepted.

    Returns:
        The matched four digits as a string, or ``NOT_FOUND_VALUE`` when the
        input is falsy or contains no word-bounded run of exactly four digits.
    """
    # Falsy input (None, '', 0, ...) carries no usable date information.
    if not publication_date:
        return NOT_FOUND_VALUE

    # The word boundaries keep longer digit runs (e.g. '12345') from matching.
    year_match = re.search(r'\b\d{4}\b', str(publication_date))
    return NOT_FOUND_VALUE if year_match is None else year_match.group()

# Function to process each document in the JSON
def process_document(doc):
    """Map one AIS result document onto the standardized review record.

    Args:
        doc: Mapping describing a single document. The keys read are
            ``publication_link``, ``url``, ``title``, ``author`` and
            ``publication_date``; all of them are optional.

    Returns:
        dict with the keys ``Database``, ``REVIEW_STAGE``, ``DOI``,
        ``DOI_LINK``, ``Title``, ``Author``, ``Year`` and ``Abstract``.
        Fields that are missing, null or empty in ``doc`` are set to
        ``NOT_FOUND_VALUE``. ``Abstract`` is always ``NOT_FOUND_VALUE``
        because this function does not read an abstract from ``doc``.
    """
    def _field(key):
        # A key that is present but null/empty is as useless as a missing
        # one, so all three cases collapse to the same placeholder.
        value = doc.get(key)
        return NOT_FOUND_VALUE if value is None or value == '' else value

    record = {}
    record['Database'] = 'AIS'
    record['REVIEW_STAGE'] = DEFAULT_REVIEW_STAGE
    record['DOI'] = _field('publication_link')
    record['DOI_LINK'] = _field('url')
    record['Title'] = _field('title')

    # 'Author' is normally a list of names. A bare string must be wrapped
    # first: joining it directly would iterate over its characters and
    # produce 'J; a; n; e; ...'.
    authors = doc.get('author') or []
    if isinstance(authors, str):
        authors = [authors]
    # Drop null/empty entries and stringify the rest so join cannot raise
    # TypeError on non-string items.
    names = [str(name) for name in authors if name is not None and name != '']
    record['Author'] = '; '.join(names) if names else NOT_FOUND_VALUE

    # 'Year' is derived from the free-form 'publication_date' value.
    record['Year'] = extract_year(doc.get('publication_date', ''))

    # No abstract is extracted from the document; the column is kept only so
    # the record matches the standard layout.
    record['Abstract'] = NOT_FOUND_VALUE

    return record



In [2]:
# NOTE: this cell writes a standalone export. It does not read or modify
# 'combined_results_cleaned.xlsx'.

# Input files and export target (adjust the pattern as needed).
JSON_GLOB_PATTERN = 'results_review_incl_RAG/ais_*.json'
OUTPUT_XLSX = 'ais_extracted.xlsx'

# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the row order of the export reproducible. The order is lexicographic, so
# 'ais_10' sorts before 'ais_2'.
json_files = sorted(glob.glob(JSON_GLOB_PATTERN))

# One standardized record per document, accumulated across all input files.
new_records = []

for json_file in json_files:
    print(f"Processing {json_file}")
    # Parse first and release the file handle before converting the records.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # `or []` also covers a payload that carries an explicit `"docs": null`.
    new_records.extend(process_document(doc) for doc in (data.get('docs') or []))

if new_records:
    # `columns=` enforces the standard column selection and order.
    new_df = pd.DataFrame(new_records, columns=STANDARD_COLUMNS)

    # Write the export; an existing file with this name is overwritten.
    new_df.to_excel(OUTPUT_XLSX, index=False)
    print(f"New records added to {OUTPUT_XLSX}")
else:
    print("No new records to add.")

Processing results_review_incl_RAG\ais_1.json
Processing results_review_incl_RAG\ais_2.json
Processing results_review_incl_RAG\ais_3.json
Processing results_review_incl_RAG\ais_4.json
Processing results_review_incl_RAG\ais_5.json
New records added to ais_extracted.xlsx
