In [9]:
from pysam import VariantFile
import re

In [10]:
def filter_immune_variants(input_vcf, output_vcf):
    """
    Filter ClinVar VCF for immune-related variants
    
    Args:
        input_vcf (str): Path to input ClinVar VCF
        output_vcf (str): Path to output filtered VCF
    """
    # Open input VCF
    vcf_in = VariantFile(input_vcf)
    
    # Create new VCF with same header
    vcf_out = VariantFile(output_vcf, 'w', header=vcf_in.header)
    
    # Compile regex pattern for immune-related terms
    immune_pattern = re.compile(
        r'immun|autoimmun|inflammation|lupus|arthritis|'
        r'thyroiditis|celiac|psoriasis|diabetes type 1|'
        r'multiple sclerosis|inflammatory bowel|'
        r'rheumatoid|vasculitis|myasthenia|'
        r'sarcoidosis|complement|HLA|cytokine',
        re.IGNORECASE
    )
    
    variant_count = 0
    # Iterate through variants
    for variant in vcf_in.fetch():
        # Check CLNDN (Clinical Disease Name) in INFO field
        if 'CLNDN' in variant.info:
            disease_names = variant.info['CLNDN']
            # Convert to list if not already
            if isinstance(disease_names, str):
                disease_names = [disease_names]
                
            # Check if any disease name matches immune pattern
            if any(immune_pattern.search(disease) for disease in disease_names):
                vcf_out.write(variant)
                variant_count += 1
                
                # Print example matches (first few)
                if variant_count <= 5:
                    print(f"Found immune-related variant:")
                    print(f"Position: {variant.chrom}:{variant.pos}")
                    print(f"Disease(s): {disease_names}")
                    print("---")
    
    print(f"Total immune-related variants found: {variant_count}")
    
    vcf_in.close()
    vcf_out.close()

In [12]:
# Run the filtering
input_file = "/Users/anonmacintosh/Projects/Splice/data/clinvar.vcf.gz"
output_file = "/Users/anonmacintosh/Projects/Splice/data/immune_variants.vcf"

filter_immune_variants(input_file, output_file)

[W::hts_idx_load3] The index file is older than the data file: /Users/anonmacintosh/Projects/Splice/data/clinvar.vcf.gz.tbi


Found immune-related variant:
Position: 1:1211567
Disease(s): ('Combined_immunodeficiency_due_to_OX40_deficiency',)
---
Found immune-related variant:
Position: 1:1211573
Disease(s): ('Combined_immunodeficiency_due_to_OX40_deficiency',)
---
Found immune-related variant:
Position: 1:1211577
Disease(s): ('Combined_immunodeficiency_due_to_OX40_deficiency',)
---
Found immune-related variant:
Position: 1:1211580
Disease(s): ('Combined_immunodeficiency_due_to_OX40_deficiency',)
---
Found immune-related variant:
Position: 1:1211581
Disease(s): ('Combined_immunodeficiency_due_to_OX40_deficiency',)
---
Total immune-related variants found: 57110
