In [None]:
import vcf
from collections import defaultdict
import random
from Bio import SeqIO
from Bio.Seq import Seq


In [None]:
# 1. Parse VCF file
def parse_vcf(vcf_file):
    """Read a VCF file into a list of plain dictionaries.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        list[dict]: One dict per record with the keys ``CHROM``, ``POS``,
        ``REF``, ``ALT`` and ``QUAL``. ``ALT`` is the string form of the
        first ALT allele only; further alleles of a multi-allelic record
        are not reported.
    """
    # The context manager closes the handle even when parsing fails midway;
    # the list is fully built before the file is closed.
    with open(vcf_file, 'r') as handle:
        return [
            {
                'CHROM': record.CHROM,
                'POS': record.POS,
                'REF': record.REF,
                'ALT': str(record.ALT[0]),
                'QUAL': record.QUAL,
            }
            for record in vcf.Reader(handle)
        ]




In [None]:
# 2. Calculate allele frequencies
def calculate_allele_frequencies(vcf_file):
    """Compute the alternate-allele frequency of every record in a VCF file.

    The frequency is the sum of the per-sample ``gt_type`` values divided by
    twice the number of *called* samples. Samples whose ``gt_type`` is
    ``None`` are left out of both numerator and denominator, and a record is
    skipped only when no sample at all is called.

    Assumes ``gt_type`` counts ALT alleles of a diploid call (0, 1 or 2),
    as PyVCF documents it -- TODO confirm for multi-allelic sites.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        dict[str, float]: ``"CHROM:POS"`` -> alternate-allele frequency.
    """
    frequencies = {}
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            called = [s.gt_type for s in record.samples if s.gt_type is not None]
            if not called:
                # Nothing genotyped at this site: no frequency can be stated.
                continue
            frequencies[f"{record.CHROM}:{record.POS}"] = sum(called) / (2 * len(called))
    return frequencies



In [None]:
# 3. Filter variants based on quality
def filter_variants_by_quality(vcf_file, quality_threshold):
    """Return the VCF records whose QUAL is at least ``quality_threshold``.

    Records with no quality value (``QUAL`` is ``None``) are dropped instead
    of raising ``TypeError`` in the comparison.

    Args:
        vcf_file: Path of the VCF file to read.
        quality_threshold: Minimum QUAL (inclusive) a record must have.

    Returns:
        list: The record objects that pass the filter, in file order.
    """
    # The comprehension is fully evaluated before the handle is closed.
    with open(vcf_file, 'r') as handle:
        return [
            record
            for record in vcf.Reader(handle)
            if record.QUAL is not None and record.QUAL >= quality_threshold
        ]



In [None]:
# 4. Identify unique variants
def find_unique_variants(vcf_files):
    """Find the variants that occur in exactly one of several VCF files.

    A variant is identified by ``"CHROM:POS:REF:ALT"`` using the first ALT
    allele. A variant repeated inside a single file still counts as unique
    to that file; a variant present in two or more files is unique to none.

    Args:
        vcf_files: Sequence of VCF file paths.

    Returns:
        defaultdict(list): file index (position in ``vcf_files``) -> list of
        variant keys found only in that file, in order of first appearance.
        Every file index is present, possibly with an empty list.
    """
    files_by_variant = defaultdict(set)  # variant key -> indices of files containing it
    keys_per_file = []                   # per file: keys in order of first appearance

    for i, vcf_file in enumerate(vcf_files):
        ordered_keys = []
        with open(vcf_file, 'r') as handle:
            for record in vcf.Reader(handle):
                variant_key = f"{record.CHROM}:{record.POS}:{record.REF}:{record.ALT[0]}"
                if i not in files_by_variant[variant_key]:
                    ordered_keys.append(variant_key)
                    files_by_variant[variant_key].add(i)
        keys_per_file.append(ordered_keys)

    # Uniqueness can only be decided once every file has been read.
    unique_variants = defaultdict(list)
    for i, ordered_keys in enumerate(keys_per_file):
        unique_variants[i] = [key for key in ordered_keys if len(files_by_variant[key]) == 1]

    return unique_variants



In [None]:
# 5. Annotate variants
def annotate_variants(vcf_file, gene_file):
    """Classify single-nucleotide variants by their effect on the codon.

    ``record.CHROM`` is looked up among the FASTA record ids and
    ``record.POS`` is used as a 1-based position inside that sequence, with
    the reading frame starting at the first base. Only the first ALT allele
    is considered.

    Records are skipped when the CHROM has no sequence in ``gene_file``,
    when the first ALT allele is missing, when REF or ALT is not exactly one
    base (indels, symbolic alleles), or when the position does not fall
    inside a complete codon.

    Args:
        vcf_file: Path of the VCF file to read.
        gene_file: Path of a FASTA file with the gene sequences.

    Returns:
        list[dict]: One dict per annotated variant with the keys ``CHROM``,
        ``POS``, ``REF``, ``ALT`` and ``EFFECT``; ``EFFECT`` is
        ``"synonymous"``, ``"nonsense"`` or ``"missense"``.
    """
    genes = SeqIO.to_dict(SeqIO.parse(gene_file, "fasta"))

    annotated_variants = []
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            if record.CHROM not in genes:
                continue

            alt = record.ALT[0]
            if alt is None:
                continue
            # The ALT entry is an object, not a str; convert before slicing/concatenating.
            alt_base = str(alt)
            if len(record.REF) != 1 or len(alt_base) != 1:
                continue  # a codon substitution is only defined for SNVs

            gene_seq = str(genes[record.CHROM].seq)
            pos = record.POS - 1  # 0-based indexing
            codon_start = (pos // 3) * 3
            ref_codon = gene_seq[codon_start:codon_start + 3]
            if pos < 0 or len(ref_codon) != 3:
                continue  # outside the sequence or in a truncated final codon

            offset = pos % 3
            alt_codon = ref_codon[:offset] + alt_base + ref_codon[offset + 1:]

            ref_aa = str(Seq(ref_codon).translate())
            alt_aa = str(Seq(alt_codon).translate())

            if ref_aa == alt_aa:
                effect = "synonymous"
            elif alt_aa == '*':
                effect = "nonsense"
            else:
                effect = "missense"

            annotated_variants.append({
                'CHROM': record.CHROM,
                'POS': record.POS,
                'REF': record.REF,
                'ALT': alt_base,
                'EFFECT': effect
            })

    return annotated_variants



In [None]:
# 6. Generate consensus sequence
def generate_consensus(reference_file, vcf_file):
    """Apply the first ALT allele of every VCF record to a reference sequence.

    The whole REF span of a record is replaced by the ALT sequence, so
    substitutions, insertions and deletions are all applied without
    shifting the coordinates of later records. Records whose first ALT is
    missing or is not a plain nucleotide string (symbolic alleles such as
    ``<DEL>``, ``*``) are skipped. When records overlap, the later record
    overrides the earlier one at the positions it touches. ``record.CHROM``
    is not checked against the reference.

    Args:
        reference_file: Path of a FASTA file holding exactly one sequence.
        vcf_file: Path of the VCF file with the variants to apply.

    Returns:
        str: The consensus sequence.

    Raises:
        IndexError: If a record's REF span lies outside the reference.
    """
    reference = SeqIO.read(reference_file, "fasta")

    # One slot per reference base; a slot may hold several bases (insertion)
    # or an empty string (deleted base).
    consensus = list(str(reference.seq))
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            alt = record.ALT[0]
            if alt is None:
                continue
            alt_seq = str(alt)
            if not alt_seq or any(base not in "ACGTN" for base in alt_seq.upper()):
                continue  # symbolic or breakend allele: no literal sequence to insert

            start = record.POS - 1  # 0-based indexing
            end = start + len(record.REF)
            if start < 0 or end > len(consensus):
                raise IndexError(
                    f"variant at position {record.POS} lies outside the reference "
                    f"(length {len(consensus)})"
                )

            consensus[start] = alt_seq
            for i in range(start + 1, end):
                consensus[i] = ''  # remaining REF bases are consumed by the ALT allele

    return ''.join(consensus)



In [None]:
# 7. Calculate transition/transversion ratio
def calculate_ti_tv_ratio(vcf_file):
    """Compute the transition/transversion ratio of the SNVs in a VCF file.

    Only records whose REF and first ALT allele are single, different bases
    from A/C/G/T (compared case-insensitively) are counted. Records with a
    missing ALT, indels, symbolic alleles and ``N`` bases are ignored.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        float: ``transitions / transversions``, or ``float('inf')`` when no
        transversion was counted.
    """
    transition_pairs = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}
    transitions = 0
    transversions = 0

    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            alt = record.ALT[0]
            if alt is None:
                continue  # no alternate allele recorded
            ref_base = str(record.REF).upper()
            alt_base = str(alt).upper()
            if len(ref_base) != 1 or len(alt_base) != 1:
                continue
            if ref_base not in 'ACGT' or alt_base not in 'ACGT' or ref_base == alt_base:
                continue  # ambiguous or symbolic one-character alleles are not SNVs

            if (ref_base, alt_base) in transition_pairs:
                transitions += 1
            else:
                transversions += 1

    return transitions / transversions if transversions > 0 else float('inf')



In [None]:
# 8. Identify regions of high variant density
def find_high_density_regions(vcf_file, window_size=1000, threshold=10):
    """Report windows that contain at least ``threshold`` variants.

    For every variant position ``p`` on a chromosome, the half-open window
    ``[p, p + window_size)`` is inspected. Windows are anchored at each
    variant, so neighbouring reported windows can overlap.

    Args:
        vcf_file: Path of the VCF file to read.
        window_size: Window length in bases.
        threshold: Minimum number of variants a window must contain.

    Returns:
        list[tuple]: ``(chrom, window_start, window_end)`` for each window
        that reaches the threshold, grouped by chromosome in order of first
        appearance and sorted by start within a chromosome.
    """
    from bisect import bisect_left

    variant_positions = defaultdict(list)
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            variant_positions[record.CHROM].append(record.POS)

    high_density_regions = []
    for chrom, positions in variant_positions.items():
        positions.sort()
        for i, window_start in enumerate(positions):
            window_end = window_start + window_size
            # Positions are sorted, so a binary search finds the first position
            # at or beyond the window end; everything from i up to there is inside.
            variants_in_window = bisect_left(positions, window_end, i) - i
            if variants_in_window >= threshold:
                high_density_regions.append((chrom, window_start, window_end))

    return high_density_regions



In [None]:
# 9. Convert genomic coordinates to gene-relative coordinates
def genomic_to_gene_coordinates(vcf_file, gene_file):
    """Convert variant positions to positions relative to the gene start.

    ``record.CHROM`` is looked up among the FASTA record ids. The gene start
    is the ``location.start`` of the record's first feature. Sequence
    records without any feature are treated as starting at offset 0, so the
    variant position is taken to be gene-relative already.

    NOTE(review): records parsed from plain FASTA appear to carry no
    features, so the offset-0 path is presumably the one normally taken --
    confirm the intended source of gene start coordinates.

    Args:
        vcf_file: Path of the VCF file to read.
        gene_file: Path of a FASTA file with the gene sequences.

    Returns:
        list[dict]: One dict per variant on a known gene with the keys
        ``GENE``, ``GENE_POS``, ``REF`` and ``ALT`` (first ALT allele as
        string).
    """
    genes = SeqIO.to_dict(SeqIO.parse(gene_file, "fasta"))

    converted_variants = []
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            if record.CHROM not in genes:
                continue

            features = getattr(genes[record.CHROM], 'features', None) or []
            # Without a feature table there is no start offset to subtract.
            gene_start = int(features[0].location.start) if features else 0

            converted_variants.append({
                'GENE': record.CHROM,
                'GENE_POS': record.POS - gene_start,
                'REF': record.REF,
                'ALT': str(record.ALT[0])
            })

    return converted_variants



In [None]:
# 10. Simulate variant calling process
def simulate_variant_calling(reference_sequence, read_length=100, coverage=30, error_rate=0.01):
    """Simulate sequencing reads from a reference and call variants naively.

    Reads are sampled uniformly from the reference, each base is replaced by
    a random A/C/G/T with probability ``error_rate``, and the reads are
    piled up at their reference positions. A position is called as a variant
    when a non-reference base is seen more than ``coverage / 4`` times.

    Args:
        reference_sequence: Reference sequence as a string.
        read_length: Length of each simulated read; capped at the reference
            length so short references can be simulated.
        coverage: Target average depth; also sets the calling threshold.
        error_rate: Per-base probability of a simulated sequencing error.

    Returns:
        list[dict]: One dict per called position, sorted by position, with
        the keys ``POS`` (0-based reference index), ``REF`` and ``ALT``
        (the first qualifying alternate base).
    """
    ref_len = len(reference_sequence)
    if ref_len == 0:
        return []
    read_length = min(read_length, ref_len)

    # Generate mock reads, remembering where each one came from
    num_reads = int(ref_len * coverage / read_length)
    reads = []
    for _ in range(num_reads):
        start = random.randint(0, ref_len - read_length)
        read = list(reference_sequence[start:start + read_length])
        for i in range(len(read)):
            if random.random() < error_rate:
                read[i] = random.choice(['A', 'C', 'G', 'T'])
        reads.append((start, ''.join(read)))

    # Simple variant calling: count bases per *reference* position, i.e. the
    # read's start plus the offset within the read.
    base_counts = defaultdict(lambda: defaultdict(int))
    for start, read in reads:
        for offset, base in enumerate(read):
            base_counts[start + offset][base] += 1

    variants = []
    for pos in sorted(base_counts):
        ref_base = reference_sequence[pos]
        alt_bases = [base for base, count in base_counts[pos].items()
                     if base != ref_base and count > coverage / 4]
        if alt_bases:
            variants.append({
                'POS': pos,
                'REF': ref_base,
                'ALT': alt_bases[0]
            })

    return variants



In [None]:
# 11. Implement a function to detect potential structural variants using a read-pair approach. The function should identify read pairs with abnormal insert sizes or orientations.
import pysam
import numpy as np
from collections import defaultdict

def detect_structural_variants(bam_file, mean_insert_size, std_insert_size, min_support=3):
    """Flag candidate deletions/insertions from abnormal read-pair insert sizes.

    A pair whose insert size exceeds ``mean + 3 * std`` supports a
    "Deletion"; one below ``mean - 3 * std`` supports an "Insertion". Each
    pair is evaluated once, through the mate that reports a positive
    ``template_length``. The mate with the negative value is skipped, so it
    is neither mistaken for a short insert nor counted a second time.
    Pair orientation is not analysed.

    NOTE(review): only reads flagged ``is_proper_pair`` are considered, as
    in the original code. Aligners may clear that flag for strongly
    discordant pairs -- confirm this filter matches the intended input.

    Args:
        bam_file: Path of the BAM file (opened with mode ``"rb"``).
        mean_insert_size: Expected insert size of the library.
        std_insert_size: Standard deviation of the insert size.
        min_support: Minimum number of supporting pairs per
            (chromosome, SV type) group.

    Returns:
        dict: ``(reference_name, sv_type)`` -> array
        ``[start, end, sv_size]`` holding the column-wise median over the
        supporting pairs of that group.
    """
    upper_limit = mean_insert_size + 3 * std_insert_size
    lower_limit = mean_insert_size - 3 * std_insert_size
    structural_variants = defaultdict(list)

    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam.fetch():
            if not read.is_proper_pair or read.is_secondary or read.is_supplementary:
                continue

            insert_size = read.template_length
            if insert_size <= 0:
                # Zero means no insert size is recorded; negative values belong
                # to the mate of a pair that is handled via its positive side.
                continue

            if insert_size > upper_limit:
                sv_type = "Deletion"
                sv_size = insert_size - mean_insert_size
            elif insert_size < lower_limit:
                sv_type = "Insertion"
                sv_size = mean_insert_size - insert_size
            else:
                continue

            start = min(read.reference_start, read.next_reference_start)
            end = start + insert_size

            structural_variants[(read.reference_name, sv_type)].append((start, end, sv_size))

    # Filter by minimum support
    filtered_svs = {}
    for (chrom, sv_type), svs in structural_variants.items():
        if len(svs) >= min_support:
            filtered_svs[(chrom, sv_type)] = np.median(svs, axis=0)

    return filtered_svs

# Example usage
# Runs when the cell executes: the hard-coded relative path "example.bam"
# must exist for this cell to succeed.
bam_file = "example.bam"
# Insert-size parameters passed to the detector (mean and standard deviation).
mean_insert_size = 350
std_insert_size = 50

structural_variants = detect_structural_variants(bam_file, mean_insert_size, std_insert_size)

# One tab-separated line per (chromosome, SV type) group; the three values are
# printed with no decimals.
for (chrom, sv_type), (start, end, size) in structural_variants.items():
    print(f"{chrom}\t{sv_type}\t{start:.0f}\t{end:.0f}\t{size:.0f}")

In [None]:
# 12. Create a simple machine learning model (e.g., using scikit-learn) to classify variants as benign or pathogenic based on features like conservation scores and allele frequencies.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

def train_variant_classifier(data_file):
    """Train and evaluate a random-forest variant classifier from a CSV file.

    The CSV must contain a ``variant_id`` column, a ``class`` column (the
    target) and feature columns; every column other than those two is used
    as a feature. The data is split 80/20 *before* scaling and the scaler
    is fitted on the training portion only, so no statistics of the held-out
    rows leak into training. A classification report and confusion matrix
    for the held-out rows are printed.

    Args:
        data_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        tuple: ``(clf, scaler)`` -- the fitted classifier and the scaler
        fitted on the training features. New data must be passed through
        ``scaler.transform`` before ``clf.predict``.
    """
    # Load data
    data = pd.read_csv(data_file)

    # Prepare features and target
    X = data.drop(['variant_id', 'class'], axis=1)
    y = data['class']

    # Split first so the test rows stay unseen by every fitted step
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features: fit on the training rows, reuse the fit for the test rows
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train_scaled, y_train)

    # Evaluate model
    y_pred = clf.predict(X_test_scaled)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    return clf, scaler

# Example usage
# Runs when the cell executes: the hard-coded relative path "variant_data.csv"
# must exist for this cell to succeed.
data_file = "variant_data.csv"
model, scaler = train_variant_classifier(data_file)

# Classify new variants
# NOTE(review): these three columns are presumably the feature columns of
# variant_data.csv (everything except 'variant_id' and 'class') -- confirm
# they match, since the fitted scaler is applied to them directly.
new_variants = pd.DataFrame({
    'conservation_score': [0.8, 0.3, 0.9],
    'allele_frequency': [0.001, 0.1, 0.0001],
    'missense_score': [0.7, 0.2, 0.95]
})

# Apply the scaler returned by training before predicting.
new_variants_scaled = scaler.transform(new_variants)
predictions = model.predict(new_variants_scaled)

print("Predictions for new variants:")
print(predictions)

In [None]:
# 13. Develop a script to simulate the process of returning genetic results to participants, considering factors like variant pathogenicity, clinical actionability, and participant consent preferences.
import random

class GeneticResultsManager:
    """Keeps participants' variant results and releases them according to consent.

    Each participant has a consent level (``'all'``, ``'actionable'`` or
    ``'none'``) that determines which variant classifications may appear in
    their report.
    """

    def __init__(self):
        # participant_id -> {'consent_level': str, 'results': list of result dicts}
        self.participants = {}
        # consent level -> classifications that may be returned at that level
        self.consent_levels = {
            'all': ['pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign', 'benign'],
            'actionable': ['pathogenic', 'likely_pathogenic'],
            'none': []
        }

    def add_participant(self, participant_id, consent_level):
        """Register a participant, replacing any existing entry with that id.

        Args:
            participant_id: Identifier of the participant.
            consent_level: One of the keys of ``self.consent_levels``.

        Raises:
            ValueError: If ``consent_level`` is not a known level. Rejecting
                it here avoids a ``KeyError`` later in ``generate_report``.
        """
        if consent_level not in self.consent_levels:
            raise ValueError(
                f"Unknown consent level {consent_level!r}; "
                f"expected one of {sorted(self.consent_levels)}"
            )
        self.participants[participant_id] = {
            'consent_level': consent_level,
            'results': []
        }

    def add_result(self, participant_id, variant, classification):
        """Store a classified variant for a registered participant.

        Args:
            participant_id: Identifier of the participant.
            variant: Variant description.
            classification: Classification label of the variant.

        Returns:
            bool: ``True`` if the result was stored, ``False`` if the
            participant is unknown (the result is then discarded).
        """
        participant = self.participants.get(participant_id)
        if participant is None:
            return False
        participant['results'].append({
            'variant': variant,
            'classification': classification
        })
        return True

    def generate_report(self, participant_id):
        """Build the text report a participant has consented to receive.

        Args:
            participant_id: Identifier of the participant.

        Returns:
            str: The report, listing only results whose classification is
            allowed by the participant's consent level, or a "not found"
            message for an unknown participant.
        """
        if participant_id not in self.participants:
            return f"Participant {participant_id} not found."

        participant = self.participants[participant_id]
        consent_level = participant['consent_level']
        allowed_classifications = self.consent_levels[consent_level]

        parts = [
            f"Genetic Results Report for Participant {participant_id}\n",
            f"Consent Level: {consent_level}\n\n",
        ]
        for result in participant['results']:
            if result['classification'] in allowed_classifications:
                parts.append(f"Variant: {result['variant']}\n")
                parts.append(f"Classification: {result['classification']}\n\n")

        return ''.join(parts)

# Example usage
# Seed the RNG so the simulated results, and therefore the printed reports,
# are identical on every run of the notebook.
random.seed(42)

manager = GeneticResultsManager()

# Add participants with different consent levels
manager.add_participant('P001', 'all')
manager.add_participant('P002', 'actionable')
manager.add_participant('P003', 'none')

# Simulate adding results
variants = ['BRCA1:c.181T>G', 'TP53:c.215C>G', 'PTEN:c.388C>T']
classifications = ['pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign', 'benign']

# Three random (variant, classification) pairs per participant.
for participant in manager.participants:
    for _ in range(3):
        variant = random.choice(variants)
        classification = random.choice(classifications)
        manager.add_result(participant, variant, classification)

# Generate and print reports
for participant in manager.participants:
    print(manager.generate_report(participant))
    print('-' * 50)

In [None]:
# 14. Implement a basic population stratification analysis to identify potential biases in a variant dataset, considering different ethnic groups or populations.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def population_stratification_analysis(genotype_data, population_labels,
                                       output_file='population_stratification.png'):
    """Plot a 2-component PCA of the genotypes and estimate a simplified Fst.

    The scatter plot of PC1 vs PC2, coloured by population, is written to
    ``output_file``. The Fst estimate is, per variant,
    ``(total variance - mean within-population variance) / total variance``,
    averaged over the variants that show any variation. Monomorphic variants
    (zero total variance) are excluded because the ratio is undefined there.

    Args:
        genotype_data: 2-D array-like, one row per sample and one column per
            variant.
        population_labels: 1-D array-like with one label per sample; lists
            are accepted as well as numpy arrays.
        output_file: Path of the image file to write.

    Returns:
        float: Mean per-variant Fst, or ``nan`` when no variant varies.
    """
    # Arrays are required for the boolean-mask row selection further down.
    genotype_data = np.asarray(genotype_data, dtype=float)
    population_labels = np.asarray(population_labels)

    # Perform PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(genotype_data)

    # Create a dataframe with PCA results and population labels
    result_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
    result_df['Population'] = population_labels

    # Plot results
    plt.figure(figsize=(10, 8))
    populations = result_df['Population'].unique()
    colors = plt.cm.rainbow(np.linspace(0, 1, len(populations)))

    for population, color in zip(populations, colors):
        mask = result_df['Population'] == population
        plt.scatter(result_df.loc[mask, 'PC1'], result_df.loc[mask, 'PC2'],
                    c=[color], label=population, alpha=0.7)

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend()
    plt.title('Population Stratification Analysis')
    plt.savefig(output_file)
    plt.close()

    print(f"Population stratification plot saved as '{output_file}'")

    # Calculate Fst (simplified version)
    total_variance = np.var(genotype_data, axis=0)
    within_population_variance = np.mean(
        [np.var(genotype_data[population_labels == pop], axis=0) for pop in populations],
        axis=0,
    )
    polymorphic = total_variance > 0
    if not polymorphic.any():
        return float('nan')
    fst = (total_variance[polymorphic] - within_population_variance[polymorphic]) / total_variance[polymorphic]

    return np.mean(fst)

# Example usage
# Fixed seed so the simulated genotypes are reproducible.
np.random.seed(42)
n_samples = 1000
n_variants = 100

# Simulate genotype data for three populations
# Each genotype is a draw from Binomial(2, p) with p = 0.3, 0.5 and 0.7.
# n_samples // 3 is 333, so each population has 333 rows and the stacked
# matrix has 999 rows rather than n_samples.
pop1 = np.random.binomial(2, 0.3, size=(n_samples // 3, n_variants))
pop2 = np.random.binomial(2, 0.5, size=(n_samples // 3, n_variants))
pop3 = np.random.binomial(2, 0.7, size=(n_samples // 3, n_variants))

genotype_data = np.vstack((pop1, pop2, pop3))
# Labels are repeated in the same order as the populations are stacked.
population_labels = np.repeat(['Pop1', 'Pop2', 'Pop3'], n_samples // 3)

fst = population_stratification_analysis(genotype_data, population_labels)
print(f"Average Fst: {fst:.4f}")

In [None]:
# 15. Create a function to anonymize genetic data by replacing personally identifiable information with hash codes while preserving the ability to link related records.
import pandas as pd
import hashlib

def anonymize_genetic_data(data, identifiers, salt=""):
    """Replace identifier columns with salted SHA-256 digests.

    Equal identifier values always produce equal digests, so records that
    belong together stay linkable after anonymisation. Missing identifiers
    (NaN/None) stay missing: hashing them would give every such record the
    same digest and link records that are unrelated.

    With the default empty ``salt`` the digest is the plain SHA-256 of the
    value's string form.

    Args:
        data: DataFrame to anonymise; it is not modified.
        identifiers: Column names to hash; names not present in ``data`` are
            ignored.
        salt: String appended to each value before hashing.

    Returns:
        tuple: ``(anonymized_data, id_mapping)`` where ``id_mapping`` maps
        each hashed column name to a ``{original value: digest}`` dict.
        The mapping contains the original identifiers.
    """
    anonymized_data = data.copy()
    id_mapping = {}

    for identifier in identifiers:
        if identifier not in anonymized_data.columns:
            continue

        column = data[identifier]
        # Hash each distinct value once instead of once per row.
        digests = {
            value: hashlib.sha256((str(value) + salt).encode()).hexdigest()
            for value in column.dropna().unique()
        }
        # Values without a digest (the missing ones) map to NaN.
        anonymized_data[identifier] = column.map(digests)

        # Create a mapping to preserve links between related records
        id_mapping[identifier] = digests

    return anonymized_data, id_mapping

def link_related_records(anonymized_data, id_mapping, link_column):
    """Attach the original identifiers of one hashed column to a copy of the data.

    Args:
        anonymized_data: DataFrame whose ``link_column`` holds hashed values;
            it is not modified.
        id_mapping: Dict of ``{column name: {original value: hashed value}}``.
        link_column: Name of the hashed column to resolve.

    Returns:
        DataFrame: A copy of ``anonymized_data``. When ``link_column`` has an
        entry in ``id_mapping``, the copy gains a column named
        ``'original_' + link_column`` with the looked-up original values;
        otherwise the copy is returned unchanged.
    """
    result = anonymized_data.copy()

    if link_column not in id_mapping:
        return result

    # Invert original -> hash into hash -> original for the lookup.
    forward = id_mapping[link_column]
    hash_to_original = dict(zip(forward.values(), forward.keys()))
    result['original_' + link_column] = result[link_column].map(hash_to_original)

    return result

# Example usage
# Demo table: patient ids P001 and P002 each occur twice, every sample id
# occurs once.
data = pd.DataFrame({
    'patient_id': ['P001', 'P002', 'P003', 'P001', 'P002'],
    'sample_id': ['S001', 'S002', 'S003', 'S004', 'S005'],
    'age': [35, 42, 28, 35, 42],
    'gender': ['M', 'F', 'M', 'M', 'F'],
    'genotype': ['AA', 'AT', 'TT', 'AG', 'GG']
})

identifiers = ['patient_id', 'sample_id']
# No salt argument is passed, so the function's default salt ("") is used.
anonymized_data, id_mapping = anonymize_genetic_data(data, identifiers)

print("Anonymized data:")
print(anonymized_data)

# Adds an 'original_patient_id' column, i.e. the output below shows the
# original patient ids next to their hashes.
linked_data = link_related_records(anonymized_data, id_mapping, 'patient_id')

print("\nLinked data:")
print(linked_data)

In [None]:
# Example usage:
# variants = parse_vcf("example.vcf")
# allele_freqs = calculate_allele_frequencies("example.vcf")
# filtered_variants = filter_variants_by_quality("example.vcf", 30)
# unique_vars = find_unique_variants(["sample1.vcf", "sample2.vcf", "sample3.vcf"])
# annotated_vars = annotate_variants("example.vcf", "genes.fasta")
# consensus = generate_consensus("reference.fasta", "variants.vcf")
# ti_tv_ratio = calculate_ti_tv_ratio("example.vcf")
# high_density = find_high_density_regions("example.vcf")
# gene_coords = genomic_to_gene_coordinates("example.vcf", "genes.fasta")
# simulated_variants = simulate_variant_calling("ATCGATCGATCG" * 1000)