In [None]:
import pandas as pd
import json
import re

# Input locations. These are machine-specific absolute paths; edit them to
# point at local copies of the data before running.
csv_file_path = '/Users/akankshagupta/Downloads/FORRT/fred_mini.csv'
json_file_path = '/Users/akankshagupta/Downloads/FORRT/first_15_preprints_with_doi_crossref.json'

# Load the CSV file. The matching code further down reads its `doi_o` and
# `doi_r` columns.
df = pd.read_csv(csv_file_path)

# Load the JSON file. The encoding is given explicitly so that decoding does
# not depend on the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Accumulator for the match records produced by the matching loop below
# (one dict per preprint reference / CSV row pair).
results = []

# Normalize DOIs 
def normalize_doi(doi):
    """
    Return the canonical form of a DOI, or None when no DOI can be found.

    The input is trimmed and lowercased, a leading ``http(s)://doi.org/`` or
    ``http(s)://dx.doi.org/`` prefix is dropped, and the first run of
    non-whitespace characters starting at '10.' is returned.
    Falsy values and non-string values yield None.
    """
    if not (doi and isinstance(doi, str)):
        return None

    cleaned = doi.strip().lower()
    without_resolver = re.sub(r'^https?://(dx\.)?doi\.org/', '', cleaned)

    found = re.search(r'10\.\S+', without_resolver)
    if found is None:
        return None
    return found.group(0)


# Cross-reference the DOIs cited by each preprint against the CSV's `doi_o`
# column. For every match, record the paired `doi_r` and whether that DOI is
# also among the same preprint's references (`cited_r`).

# Normalise the CSV DOIs once, with the same function used for the JSON side,
# so URL-prefixed or oddly-cased values still compare equal. Missing values
# normalise to None and therefore never match a reference DOI.
csv_doi_o = df['doi_o'].map(normalize_doi)

result_columns = [
    "preprint_title",
    "preprint_doi",
    "reference_doi",
    "doi_o",
    "doi_r",
    "cited_r",
]

# Iterate through JSON
for entry in json_data:
    # `or` also covers keys that are present but explicitly null.
    preprint_info = entry.get("preprint") or {}
    references = entry.get("references") or []

    preprint_title = preprint_info.get("title")
    preprint_doi = normalize_doi(preprint_info.get("doi"))

    # Every valid DOI cited by this preprint, built once per preprint.
    # Invalid DOIs (normalised to None) are left out so that a missing
    # `doi_r` can never be reported as cited.
    cited_dois = set()
    for ref in references:
        normalized = normalize_doi(ref.get("doi"))
        if normalized is not None:
            cited_dois.add(normalized)

    # Iterate through references
    for reference in references:
        ref_doi = normalize_doi(reference.get("doi"))

        # Skip null or unparseable DOIs
        if not ref_doi:
            continue

        # Match JSON reference DOI with CSV doi_o
        matched_rows = df[csv_doi_o == ref_doi]

        for _, row in matched_rows.iterrows():
            doi_r = row['doi_r']

            # Check whether doi_r is already cited in this preprint's references
            cited = normalize_doi(doi_r) in cited_dois

            # Add to results list
            results.append({
                "preprint_title": preprint_title,
                "preprint_doi": preprint_doi,
                "reference_doi": ref_doi,
                "doi_o": row['doi_o'],
                "doi_r": doi_r,
                "cited_r": cited
            })

# Create DataFrame. Passing the columns explicitly keeps the expected schema
# even when no reference matched and `results` is empty.
results_df = pd.DataFrame(results, columns=result_columns)

