In [None]:
import random
from bisect import bisect_left
from collections import defaultdict

import vcf
from Bio import SeqIO
from Bio.Seq import Seq


In [None]:
# 1. Parse VCF file
def parse_vcf(vcf_file):
    """Read a VCF file into a list of plain dictionaries.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        A list with one dict per record, holding the keys ``CHROM``, ``POS``,
        ``REF``, ``ALT`` and ``QUAL``. ``ALT`` is the string form of the first
        ALT allele only; further alleles of a multi-allelic record are not
        reported.
    """
    # The context manager guarantees the handle is closed, even when parsing
    # fails half-way through the file.
    with open(vcf_file, 'r') as handle:
        return [
            {
                'CHROM': record.CHROM,
                'POS': record.POS,
                'REF': record.REF,
                'ALT': str(record.ALT[0]),
                'QUAL': record.QUAL,
            }
            for record in vcf.Reader(handle)
        ]




In [None]:
# 2. Calculate allele frequencies
def calculate_allele_frequencies(vcf_file):
    """Compute the alternate-allele frequency of every record in a VCF file.

    The frequency is ``sum(gt_type) / (2 * called_samples)``. This relies on
    PyVCF's ``gt_type`` encoding (0 = hom-ref, 1 = het, 2 = hom-alt,
    ``None`` = uncalled) and therefore assumes diploid genotypes; all
    non-reference alleles of a multi-allelic site are pooled together.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        A dict mapping ``"CHROM:POS"`` to the alternate-allele frequency.
        Records in which no sample has a called genotype are omitted.
    """
    frequencies = {}
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            # Only called genotypes contribute, to the numerator AND the
            # denominator; counting uncalled samples in the denominator would
            # bias the frequency towards zero.
            called = [s.gt_type for s in record.samples if s.gt_type is not None]
            if not called:
                continue
            frequencies[f"{record.CHROM}:{record.POS}"] = sum(called) / (2 * len(called))
    return frequencies



In [None]:
# 3. Filter variants based on quality
def filter_variants_by_quality(vcf_file, quality_threshold):
    """Return the records whose QUAL is at least ``quality_threshold``.

    Args:
        vcf_file: Path of the VCF file to read.
        quality_threshold: Minimum QUAL value (inclusive) a record must have.

    Returns:
        A list of the PyVCF records that pass the filter, in file order.
        Records without a QUAL value are excluded because they cannot be
        shown to meet the threshold.
    """
    with open(vcf_file, 'r') as handle:
        return [
            record
            for record in vcf.Reader(handle)
            # QUAL is None when the column is missing ('.'); comparing None
            # with a number raises TypeError on Python 3.
            if record.QUAL is not None and record.QUAL >= quality_threshold
        ]



In [None]:
# 4. Identify unique variants
def find_unique_variants(vcf_files):
    """Find the variants that occur in exactly one of several VCF files.

    A variant is identified by ``"CHROM:POS:REF:ALT"`` where ALT is the first
    ALT allele of the record. Repeated occurrences inside the same file do
    not make a variant "shared".

    Args:
        vcf_files: Iterable of VCF file paths. The position of a path in the
            iterable is the index used in the result.

    Returns:
        A ``defaultdict(list)`` mapping each file index to the variant keys
        found only in that file, in order of first appearance. Every input
        file has an entry, possibly an empty list.
    """
    # variant key -> indices of the files containing it. A plain dict keeps
    # insertion order, so results come out in first-seen order.
    files_by_variant = {}
    unique_variants = defaultdict(list)

    for i, vcf_file in enumerate(vcf_files):
        unique_variants.setdefault(i, [])
        with open(vcf_file, 'r') as handle:
            for record in vcf.Reader(handle):
                variant_key = f"{record.CHROM}:{record.POS}:{record.REF}:{record.ALT[0]}"
                files_by_variant.setdefault(variant_key, set()).add(i)

    # Uniqueness can only be decided once every file has been read: a variant
    # first seen in file 0 may turn up again in a later file.
    for variant_key, file_indices in files_by_variant.items():
        if len(file_indices) == 1:
            (owner,) = file_indices
            unique_variants[owner].append(variant_key)

    return unique_variants



In [None]:
# 5. Annotate variants
def annotate_variants(vcf_file, gene_file):
    """Classify the coding effect of each variant that falls on a known gene.

    The record's CHROM is looked up as a sequence id in ``gene_file``; records
    whose CHROM is not present there are ignored. The codon is located with
    ``(POS - 1) // 3``, i.e. the code assumes each FASTA sequence is an
    in-frame coding sequence and POS is a 1-based offset into it
    (NOTE(review): confirm this matches how the VCF was produced).

    Args:
        vcf_file: Path of the VCF file to read.
        gene_file: Path of a FASTA file with the gene sequences.

    Returns:
        A list of dicts with the keys ``CHROM``, ``POS``, ``REF``, ``ALT``
        (first ALT allele as a string) and ``EFFECT``. ``EFFECT`` is
        ``"synonymous"``, ``"nonsense"`` or ``"missense"`` for single-base
        substitutions, and ``"unknown"`` for anything that cannot be
        classified by a single-codon comparison (indels, symbolic or missing
        ALT alleles, positions outside the sequence or in a truncated codon).
    """
    genes = SeqIO.to_dict(SeqIO.parse(gene_file, "fasta"))

    annotated_variants = []
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            if record.CHROM not in genes:
                continue

            gene_seq = str(genes[record.CHROM].seq)
            alt = record.ALT[0]
            # PyVCF wraps ALT alleles in its own classes; work on the string.
            alt_str = str(alt)

            pos = record.POS - 1  # 0-based indexing
            codon_start = (pos // 3) * 3
            ref_codon = gene_seq[codon_start:codon_start + 3] if pos >= 0 else ''

            is_snv = alt is not None and len(record.REF) == 1 and len(alt_str) == 1
            if not is_snv or len(ref_codon) < 3:
                # Swapping one base inside one codon is only meaningful for
                # SNVs that hit a complete codon.
                effect = "unknown"
            else:
                offset = pos % 3
                alt_codon = ref_codon[:offset] + alt_str + ref_codon[offset + 1:]

                ref_aa = str(Seq(ref_codon).translate())
                alt_aa = str(Seq(alt_codon).translate())

                if ref_aa == alt_aa:
                    effect = "synonymous"
                elif alt_aa == '*':
                    effect = "nonsense"
                else:
                    effect = "missense"

            annotated_variants.append({
                'CHROM': record.CHROM,
                'POS': record.POS,
                'REF': record.REF,
                'ALT': alt_str,
                'EFFECT': effect
            })

    return annotated_variants



In [None]:
# 6. Generate consensus sequence
def generate_consensus(reference_file, vcf_file):
    """Apply the first ALT allele of every VCF record to a reference sequence.

    The whole REF span of a record is replaced by the ALT allele, so SNVs,
    multi-base substitutions, insertions and deletions are all handled.
    Records are applied in file order; if two records touch the same
    position, the later one wins. Records without an ALT allele are skipped.

    NOTE(review): record.CHROM is not compared with the reference id, so the
    VCF is assumed to contain variants for this single reference only.

    Args:
        reference_file: Path of a FASTA file containing exactly one sequence.
        vcf_file: Path of the VCF file with the variants to apply.

    Returns:
        The consensus sequence as a string.

    Raises:
        IndexError: If a record's REF span lies outside the reference.
    """
    reference = SeqIO.read(reference_file, "fasta")

    # One list slot per reference base. A slot may end up holding several
    # bases (insertion) or an empty string (deleted base), which keeps the
    # reference coordinates of later records valid.
    consensus = list(str(reference.seq))
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            alt = record.ALT[0]
            if alt is None:
                # No alternate allele: nothing to change at this site.
                continue

            start = record.POS - 1  # 0-based indexing
            end = start + len(record.REF)
            if start < 0 or end > len(consensus):
                raise IndexError(
                    f"variant at {record.CHROM}:{record.POS} lies outside the "
                    f"reference (length {len(consensus)})"
                )

            consensus[start] = str(alt)
            # Blank the remaining REF bases so deletions and multi-base
            # substitutions do not leave stale reference bases behind.
            for index in range(start + 1, end):
                consensus[index] = ''

    return ''.join(consensus)



In [None]:
# 7. Calculate transition/transversion ratio
def calculate_ti_tv_ratio(vcf_file):
    """Compute the transition/transversion ratio of the SNVs in a VCF file.

    Only the first ALT allele of each record is examined. A record counts
    only when REF and that ALT are two different single bases out of
    A/C/G/T (case-insensitive); indels, missing or symbolic ALT alleles,
    ``*`` and ``N`` are ignored.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        ``transitions / transversions`` as a float, or ``float('inf')`` when
        no transversion was counted (including when the file has no SNVs).
    """
    transition_pairs = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}
    nucleotides = {'A', 'C', 'G', 'T'}

    transitions = 0
    transversions = 0
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            alt = record.ALT[0]
            if alt is None:
                # Monomorphic site: there is no substitution to classify.
                continue

            ref_base = str(record.REF).upper()
            alt_base = str(alt).upper()
            # Set membership also rejects multi-base alleles, '*' and 'N',
            # which would otherwise be miscounted as transversions.
            if ref_base not in nucleotides or alt_base not in nucleotides:
                continue
            if ref_base == alt_base:
                continue

            if (ref_base, alt_base) in transition_pairs:
                transitions += 1
            else:
                transversions += 1

    return transitions / transversions if transversions > 0 else float('inf')



In [None]:
# 8. Identify regions of high variant density
def find_high_density_regions(vcf_file, window_size=1000, threshold=10):
    """Report windows that contain at least ``threshold`` variants.

    For every variant position ``p`` on a chromosome, the half-open window
    ``[p, p + window_size)`` is examined. Windows are anchored at variant
    positions, so the reported regions may overlap.

    Args:
        vcf_file: Path of the VCF file to read.
        window_size: Width of the window, in bases.
        threshold: Minimum number of variants a window must contain.

    Returns:
        A list of ``(chrom, window_start, window_end)`` tuples, grouped by
        chromosome in order of first appearance and sorted by start position
        within each chromosome.
    """
    variant_positions = defaultdict(list)
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            variant_positions[record.CHROM].append(record.POS)

    high_density_regions = []
    for chrom, positions in variant_positions.items():
        positions.sort()
        for i, window_start in enumerate(positions):
            window_end = window_start + window_size
            # The list is sorted, so the variants inside the window are the
            # contiguous run from i up to the first position >= window_end;
            # a binary search finds that boundary without rescanning the tail.
            variants_in_window = bisect_left(positions, window_end, i) - i
            if variants_in_window >= threshold:
                high_density_regions.append((chrom, window_start, window_end))

    return high_density_regions



In [None]:
# 9. Convert genomic coordinates to gene-relative coordinates
def genomic_to_gene_coordinates(vcf_file, gene_file):
    """Express variant positions relative to the start of their gene.

    The record's CHROM is looked up as a sequence id in ``gene_file``; records
    whose CHROM is not present there are ignored. The gene start is taken from
    the first feature of the sequence record when it has one.

    Records parsed from plain FASTA carry no features, so in that case the
    gene is taken to start at offset 0 and POS is returned unchanged.
    NOTE(review): this fallback assumes the VCF coordinates are already
    relative to the gene sequence; supply an annotated source if the genes
    sit at other genomic offsets.

    Args:
        vcf_file: Path of the VCF file to read.
        gene_file: Path of a FASTA file with the gene sequences.

    Returns:
        A list of dicts with the keys ``GENE``, ``GENE_POS`` (``POS`` minus
        the gene start), ``REF`` and ``ALT`` (first ALT allele as a string).
    """
    genes = SeqIO.to_dict(SeqIO.parse(gene_file, "fasta"))

    converted_variants = []
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            if record.CHROM not in genes:
                continue

            # Indexing features[0] unconditionally raises IndexError for
            # every FASTA-derived record, because their feature list is empty.
            features = getattr(genes[record.CHROM], 'features', None) or []
            gene_start = int(features[0].location.start) if features else 0

            converted_variants.append({
                'GENE': record.CHROM,
                'GENE_POS': record.POS - gene_start,
                'REF': record.REF,
                'ALT': str(record.ALT[0])
            })

    return converted_variants



In [None]:
# 10. Simulate variant calling process
def simulate_variant_calling(reference_sequence, read_length=100, coverage=30, error_rate=0.01):
    """Simulate sequencing reads from a reference and call variants on them.

    Reads are sampled uniformly from the reference using the module-level
    ``random`` generator (seed it for reproducible results). Each base is
    replaced by a random A/C/G/T with probability ``error_rate``. Bases are
    then piled up per reference position, and a position is called when a
    non-reference base is seen more than ``coverage / 4`` times.

    Args:
        reference_sequence: The reference as a string of bases.
        read_length: Length of each simulated read; capped at the reference
            length so short references do not fail.
        coverage: Target average depth; sets the number of reads and the
            calling threshold.
        error_rate: Per-base probability of a sequencing error.

    Returns:
        A list of dicts with the keys ``POS`` (0-based reference position),
        ``REF`` and ``ALT`` (the most frequent non-reference base passing the
        threshold), sorted by ``POS``. Empty for an empty reference.
    """
    ref_len = len(reference_sequence)
    if ref_len == 0:
        return []
    # A read cannot be longer than the reference it is sampled from.
    read_length = min(read_length, ref_len)

    # Generate mock reads, remembering where each one came from.
    num_reads = int(ref_len * coverage / read_length)
    reads = []
    for _ in range(num_reads):
        start = random.randint(0, ref_len - read_length)
        read = list(reference_sequence[start:start + read_length])
        for i in range(len(read)):
            if random.random() < error_rate:
                read[i] = random.choice(['A', 'C', 'G', 'T'])
        reads.append((start, ''.join(read)))

    # Simple variant calling: pile bases up by REFERENCE position. Keying by
    # the offset inside the read would compare unrelated bases with
    # reference_sequence[offset] and invent variants.
    base_counts = defaultdict(lambda: defaultdict(int))
    for start, read in reads:
        for offset, base in enumerate(read):
            base_counts[start + offset][base] += 1

    variants = []
    for pos in sorted(base_counts):
        ref_base = reference_sequence[pos]
        alt_counts = {
            base: count
            for base, count in base_counts[pos].items()
            if base != ref_base and count > coverage / 4
        }
        if alt_counts:
            variants.append({
                'POS': pos,
                'REF': ref_base,
                'ALT': max(alt_counts, key=alt_counts.get)
            })

    return variants



In [None]:
# Example usage:
# variants = parse_vcf("example.vcf")
# allele_freqs = calculate_allele_frequencies("example.vcf")
# filtered_variants = filter_variants_by_quality("example.vcf", 30)
# unique_vars = find_unique_variants(["sample1.vcf", "sample2.vcf", "sample3.vcf"])
# annotated_vars = annotate_variants("example.vcf", "genes.fasta")
# consensus = generate_consensus("reference.fasta", "variants.vcf")
# ti_tv_ratio = calculate_ti_tv_ratio("example.vcf")
# high_density = find_high_density_regions("example.vcf")
# gene_coords = genomic_to_gene_coordinates("example.vcf", "genes.fasta")
# simulated_variants = simulate_variant_calling("ATCGATCGATCG" * 1000)