In [4]:
from pysam import VariantFile, tabix_index
import re

In [5]:
def create_vcf_header():
    """Return the fixed header text written at the top of the output VCF.

    The text is made of the VCF 4.2 meta lines (file format, PASS filter,
    GRCh38 reference, one contig line each for chr1-chr22 and chrX, and two
    bcftools provenance lines) followed by the tab-separated eight-column
    ``#CHROM`` line. Every line, including the last one, ends with a newline.
    """
    contig_names = [f"chr{number}" for number in range(1, 23)] + ["chrX"]

    meta_lines = [
        "##fileformat=VCFv4.2",
        '##FILTER=<ID=PASS,Description="All filters passed">',
        "##reference=GRCh38",
    ]
    meta_lines.extend(f"##contig=<ID={name}>" for name in contig_names)
    meta_lines.append("##bcftools_normVersion=1.12+htslib-1.12")
    meta_lines.append(
        "##bcftools_normCommand=norm -m-both -o norm1.vcf.gz plain.vcf.gz; "
        "Date=Sat Mar 12 14:42:06 2022"
    )

    column_line = "\t".join(
        ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    )
    return "\n".join(meta_lines + [column_line]) + "\n"

def convert_to_string(value):
    """Convert an INFO-style value to a single string.

    Args:
        value: ``None``, a plain scalar, or a tuple/list of scalars.

    Returns:
        ``""`` for ``None``; for a tuple or list, the string forms of its
        non-``None`` elements joined with ``","``; otherwise ``str(value)``.
    """
    if value is None:
        return ""
    if isinstance(value, (tuple, list)):
        # Join every element so no part of a multi-valued field is dropped;
        # an empty sequence (or one holding only None) yields "".
        return ",".join(str(item) for item in value if item is not None)
    return str(value)

def is_immune_related(disease_names):
    """Report whether at least one disease name mentions an immune keyword.

    Each entry of ``disease_names`` is passed through ``convert_to_string``
    and searched, case-insensitively, for any of the keyword fragments in
    the pattern below. Returns ``True`` on the first hit and ``False`` when
    no entry matches.
    """
    keyword_regex = re.compile(
        r'immun|autoimmun|inflammation|lupus|arthritis|'
        r'thyroiditis|celiac|psoriasis|diabetes type 1|'
        r'multiple sclerosis|inflammatory bowel|'
        r'rheumatoid|vasculitis|myasthenia|'
        r'sarcoidosis|complement|HLA|cytokine',
        flags=re.IGNORECASE,
    )

    for name in disease_names:
        # Unanchored search: a keyword anywhere inside the name counts.
        if keyword_regex.search(convert_to_string(name)) is not None:
            return True
    return False

def is_valid_variant(variant):
    """Tell whether a record carries every field needed to emit a VCF line.

    A record is valid only when its ``alts``, ``ref``, ``chrom`` and ``pos``
    attributes are all truthy; they are inspected in that order and the
    check stops at the first falsy one.
    """
    has_all_fields = (
        variant.alts and variant.ref and variant.chrom and variant.pos
    )
    return bool(has_all_fields)

def format_variant_line(variant):
    """Format a variant record as one tab-separated VCF data line.

    Args:
        variant: Record exposing ``chrom``, ``pos``, ``id``, ``ref`` and
            ``alts`` attributes (presumably a pysam ``VariantRecord`` --
            confirm against the caller).

    Returns:
        The eight-column line ``CHROM POS ID REF ALT . . .`` terminated by a
        newline, or ``None`` when ``is_valid_variant`` rejects the record.
        QUAL, FILTER and INFO are always written as ``.``.
    """
    if not is_valid_variant(variant):
        return None

    # Output contigs are always "chr"-prefixed; add the prefix only if missing.
    chrom = variant.chrom if variant.chrom.startswith('chr') else f"chr{variant.chrom}"
    variant_id = variant.id or '.'
    # Write every ALT allele, comma-separated, so multi-allelic records do
    # not silently lose their second and later alleles.
    alt_field = ",".join(str(allele) for allele in variant.alts)
    return f"{chrom}\t{variant.pos}\t{variant_id}\t{variant.ref}\t{alt_field}\t.\t.\t.\n"

def process_variant(variant):
    """Return the formatted VCF line for an immune-related variant.

    Args:
        variant: Record whose ``info`` mapping may contain a ``CLNDN``
            entry holding the disease name(s).

    Returns:
        The string produced by ``format_variant_line`` when any ``CLNDN``
        name is immune-related; ``None`` when the record has no ``CLNDN``
        entry, when no name matches, or when the record cannot be formatted.
    """
    if 'CLNDN' not in variant.info:
        return None

    raw_names = variant.info['CLNDN']
    if isinstance(raw_names, (tuple, list)):
        # A multi-valued field: check each name on its own instead of
        # wrapping the whole sequence as a single entry.
        disease_names = list(raw_names)
    else:
        disease_names = [raw_names]

    if not is_immune_related(disease_names):
        return None

    return format_variant_line(variant)

def prepare_vcf(vcf_path):
    """Create a tabix index for the VCF unless one already exists.

    Args:
        vcf_path: Path of the VCF file to index.

    Indexing is best-effort: any failure is reported as a printed warning
    and never raised to the caller.
    """
    import os  # local import keeps this block self-contained

    # Skip indexing when an index file already sits next to the VCF, so a
    # repeat run does not end in a "could not index" warning.
    if any(os.path.exists(f"{vcf_path}{suffix}") for suffix in ('.tbi', '.csi')):
        return

    try:
        tabix_index(vcf_path, preset='vcf')
    except Exception as e:
        # Deliberately broad: a missing index should not abort the run.
        print(f"Warning: Could not index file: {e}")

def filter_immune_variants(input_vcf, output_vcf):
    """Filter a ClinVar VCF down to its immune-related variants.

    Args:
        input_vcf: Path of the VCF to read; it is indexed first via
            ``prepare_vcf``.
        output_vcf: Path of the plain-text VCF to write. Any existing file
            is overwritten.

    The output starts with the header from ``create_vcf_header`` followed by
    one line per record for which ``process_variant`` returns a line. The
    number of lines written is printed when the scan finishes.
    """
    prepare_vcf(input_vcf)
    variant_count = 0

    # One `with` owns both handles so the input reader and the output file
    # are closed even if processing a record raises.
    with VariantFile(input_vcf) as vcf_in, open(output_vcf, 'w') as f:
        f.write(create_vcf_header())

        for variant in vcf_in:
            vcf_line = process_variant(variant)
            if vcf_line is None:
                continue

            f.write(vcf_line)
            variant_count += 1

    print(f"Total immune-related variants found: {variant_count}")

In [6]:
# Read the ClinVar VCF and write its immune-related variants to a new file
input_file = "/Users/anonmacintosh/Projects/Splice/data/clinvar.vcf.gz"
output_file = "/Users/anonmacintosh/Projects/Splice/data/immune_variants.vcf"

filter_immune_variants(input_vcf=input_file, output_vcf=output_file)

TypeError: 'NoneType' object is not subscriptable