In [1]:
import json
import os
import glob
from typing import Dict, Any, List

In [2]:
def has_required_openfda_fields(record: Dict[str, Any]) -> bool:
    """Check that the record's ``openfda`` section has every required field populated.

    Args:
        record: One label record (a dict parsed from JSON).

    Returns:
        True only when ``record['openfda']`` is a dict and each of the required
        keys maps to a truthy value (so ``None``, ``''`` and ``[]`` all count as
        missing). False when ``openfda`` is absent, null, or not a dict.
    """
    required_openfda_fields = (
        'brand_name', 'generic_name', 'manufacturer_name',
        'product_ndc', 'route', 'product_type'
    )

    openfda = record.get('openfda')
    # ``.get('openfda', {})`` would still hand back None (or a list/str) when the
    # key is present with such a value, so guard on the type rather than rely on
    # a default before calling ``.get`` on it.
    if not isinstance(openfda, dict):
        return False

    # Truthiness already rejects empty lists, so no separate length check is needed.
    return all(openfda.get(field) for field in required_openfda_fields)

In [3]:
def has_required_top_level_fields(record: Dict[str, Any]) -> bool:
    """Return True when the record has a ``set_id`` that is neither None nor ``''``."""
    identifier = record.get('set_id')
    if identifier is None:
        return False
    return identifier != ''

In [4]:
def has_searchable_content(record: Dict[str, Any]) -> bool:
    """Return True when at least one known text section holds a non-blank string.

    Only the section names listed below are inspected, and a section counts
    only if its value is a list containing a string with non-whitespace text.
    """
    candidate_sections = (
        'indications_and_usage', 'dosage_and_administration', 'warnings',
        'active_ingredient', 'inactive_ingredient', 'purpose', 'description',
        'adverse_reactions', 'contraindications', 'drug_interactions',
        'clinical_pharmacology', 'boxed_warning', 'stop_use', 'do_not_use',
    )

    for section in candidate_sections:
        entries = record.get(section)
        # Anything that is not a list (missing, None, bare string, ...) is ignored.
        if not isinstance(entries, list):
            continue
        if any(isinstance(entry, str) and entry.strip() for entry in entries):
            return True

    return False

In [5]:
def is_valid_record(record: Dict[str, Any]) -> bool:
    """Return True when the record passes all three validation checks.

    The checks run in a fixed order (openfda fields, top-level fields,
    searchable content) and evaluation stops at the first one that fails.
    """
    validators = (
        has_required_openfda_fields,
        has_required_top_level_fields,
        has_searchable_content,
    )
    return all(validate(record) for validate in validators)

In [6]:
def filter_openfda_file(input_file: str, output_file: str) -> Dict[str, int]:
    """Filter one label JSON file down to the records that pass validation.

    Reads ``input_file``, keeps the entries of its top-level ``results`` list
    that pass the openfda, top-level and searchable-content checks, and writes
    them to ``output_file`` as ``{"results": [...]}``. Progress and per-file
    counts are printed.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path the filtered JSON is written to (overwritten if it exists).

    Returns:
        Counts keyed by ``total``, ``valid``, ``filtered``, ``failed_openfda``,
        ``failed_top_level`` and ``failed_content``. All six keys are always
        present. They are all zero, and no output file is written, when the
        input cannot be read/parsed or does not have the expected shape.
    """
    print(f"Processing: {os.path.basename(input_file)}")

    # Returned as-is on any early exit so callers always see the same key set.
    empty_stats = {
        'total': 0,
        'valid': 0,
        'filtered': 0,
        'failed_openfda': 0,
        'failed_top_level': 0,
        'failed_content': 0
    }

    # Best-effort: a single unreadable or corrupt file is reported and skipped
    # rather than aborting a multi-file run.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
        return empty_stats

    # A missing 'results' key is treated as an empty list; any other shape
    # (top-level array, null 'results', ...) is reported instead of crashing.
    original_results = data.get('results', []) if isinstance(data, dict) else None
    if not isinstance(original_results, list):
        print(f"Error reading {input_file}: expected a JSON object with a 'results' list")
        return empty_stats

    total_records = len(original_results)

    valid_records = []
    failed_openfda = 0
    failed_top_level = 0
    failed_content = 0

    # Run each check once per record; a rejected record is attributed to the
    # first check it fails, in this order.
    for record in original_results:
        if not has_required_openfda_fields(record):
            failed_openfda += 1
        elif not has_required_top_level_fields(record):
            failed_top_level += 1
        elif not has_searchable_content(record):
            failed_content += 1
        else:
            valid_records.append(record)

    valid_count = len(valid_records)
    filtered_count = total_records - valid_count

    filtered_data = {
        "results": valid_records
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=2, ensure_ascii=False)

    stats = {
        'total': total_records,
        'valid': valid_count,
        'filtered': filtered_count,
        'failed_openfda': failed_openfda,
        'failed_top_level': failed_top_level,
        'failed_content': failed_content
    }

    print(f"  Total: {total_records}, Valid: {valid_count}, Filtered out: {filtered_count}")
    print(f"  Failed reasons - OpenFDA fields: {failed_openfda}, Top-level: {failed_top_level}, Content: {failed_content}")

    return stats

In [7]:
def filter_all_openfda_files(
    input_dir: str = "../data/openfda/original",
    output_dir: str = "../data/openfda/filtered/",
) -> Dict[str, int]:
    """Filter every ``drug-label-*.json`` file in a directory and report totals.

    Each matching file in ``input_dir`` is passed to ``filter_openfda_file`` and
    written under the same file name in ``output_dir`` (created if needed).
    Files are processed in sorted name order and a summary is printed at the end.

    Args:
        input_dir: Directory searched for ``drug-label-*.json`` files.
        output_dir: Directory the filtered files are written to.

    Returns:
        Counts summed over all files, keyed by ``total``, ``valid``,
        ``filtered``, ``failed_openfda``, ``failed_top_level`` and
        ``failed_content``. All zero when no files match.
    """
    os.makedirs(output_dir, exist_ok=True)

    json_files = sorted(glob.glob(os.path.join(input_dir, "drug-label-*.json")))

    print(f"Found {len(json_files)} JSON files to filter:")

    total_stats = {
        'total': 0,
        'valid': 0,
        'filtered': 0,
        'failed_openfda': 0,
        'failed_top_level': 0,
        'failed_content': 0
    }

    for json_file in json_files:
        filename = os.path.basename(json_file)
        output_file = os.path.join(output_dir, filename)

        file_stats = filter_openfda_file(json_file, output_file)

        # .get tolerates a per-file result that lacks some of the keys.
        for key in total_stats:
            total_stats[key] += file_stats.get(key, 0)

    def percent_of_total(count: int) -> float:
        # With no matching files (or only empty ones) the total is 0; report
        # 0.0% instead of raising ZeroDivisionError.
        total = total_stats['total']
        return count / total * 100 if total else 0.0

    print(f"Total records processed: {total_stats['total']:,}")
    print(f"Valid records kept: {total_stats['valid']:,} ({percent_of_total(total_stats['valid']):.1f}%)")
    print(f"Records filtered out: {total_stats['filtered']:,} ({percent_of_total(total_stats['filtered']):.1f}%)")
    print("\nFailure reasons:")
    print(f"  Missing OpenFDA fields: {total_stats['failed_openfda']:,}")
    print(f"  Missing top-level fields: {total_stats['failed_top_level']:,}")
    print(f"  Missing searchable content: {total_stats['failed_content']:,}")

    return total_stats

In [8]:
# Filter every file under the default input directory; `stats` holds the
# aggregate counts returned by the run.
stats = filter_all_openfda_files()

print("Filtering completed!")

Found 13 JSON files to filter:
Processing: drug-label-0001-of-0013.json
  Total: 20000, Valid: 5233, Filtered out: 14767
  Failed reasons - OpenFDA fields: 14627, Top-level: 0, Content: 140
Processing: drug-label-0002-of-0013.json
  Total: 20000, Valid: 4535, Filtered out: 15465
  Failed reasons - OpenFDA fields: 15241, Top-level: 0, Content: 224
Processing: drug-label-0003-of-0013.json
  Total: 20000, Valid: 7794, Filtered out: 12206
  Failed reasons - OpenFDA fields: 11968, Top-level: 0, Content: 238
Processing: drug-label-0004-of-0013.json
  Total: 20000, Valid: 7833, Filtered out: 12167
  Failed reasons - OpenFDA fields: 11940, Top-level: 0, Content: 227
Processing: drug-label-0005-of-0013.json
  Total: 20000, Valid: 7355, Filtered out: 12645
  Failed reasons - OpenFDA fields: 12420, Top-level: 0, Content: 225
Processing: drug-label-0006-of-0013.json
  Total: 20000, Valid: 4977, Filtered out: 15023
  Failed reasons - OpenFDA fields: 14882, Top-level: 0, Content: 141
Processing: dru