In [None]:
from Bio.Seq import Seq
from Bio import SeqIO
import random
import re

In [None]:
# 1. Predict amino acid change
def predict_aa_change(reference_seq, position, alt_base):
    """Predict the protein-level change caused by a single-base substitution.

    Args:
        reference_seq: Coding sequence, read in frame starting at its first base.
        position: 0-based index of the substituted base within ``reference_seq``.
        alt_base: The single replacement base.

    Returns:
        A string ``<ref_aa><codon_number><alt_aa>`` where the codon number is
        1-based, e.g. ``"A2P"``.

    Raises:
        ValueError: If ``position`` is outside the sequence, ``alt_base`` is not
            exactly one base, or the affected codon is truncated at the end of
            the sequence.
    """
    if not 0 <= position < len(reference_seq):
        raise ValueError(
            f"position {position} is outside the reference sequence "
            f"(length {len(reference_seq)})"
        )
    if len(alt_base) != 1:
        raise ValueError("alt_base must be exactly one base")

    offset = position % 3  # index of the changed base inside its codon
    codon_start = position - offset
    ref_codon = reference_seq[codon_start:codon_start + 3]
    if len(ref_codon) != 3:
        # A trailing partial codon cannot be translated meaningfully.
        raise ValueError(
            f"position {position} falls in an incomplete trailing codon"
        )

    alt_codon = ref_codon[:offset] + alt_base + ref_codon[offset + 1:]
    ref_aa = Seq(ref_codon).translate()
    alt_aa = Seq(alt_codon).translate()
    return f"{ref_aa}{position//3 + 1}{alt_aa}"

In [None]:
# 2. Classify variants as coding or non-coding
def classify_variant(variant_position, gene_starts, gene_ends):
    """Label a position as coding or non-coding.

    ``gene_starts`` and ``gene_ends`` are paired element-wise; each pair is an
    inclusive interval. Returns ``"Coding"`` when the position lies inside at
    least one interval, otherwise ``"Non-coding"``.
    """
    inside_any_gene = any(
        lower <= variant_position <= upper
        for lower, upper in zip(gene_starts, gene_ends)
    )
    return "Coding" if inside_any_gene else "Non-coding"


In [None]:
# 3. Identify variants in regulatory regions
def identify_regulatory_variants(variants, regulatory_regions):
    """Return the variants that fall inside a regulatory region.

    Each variant is a dict with a ``'position'`` key; each region is a
    ``(start, end, type)`` triple with inclusive bounds. A matching variant is
    copied with an added ``'regulatory_type'`` key taken from the first region
    that contains it. Variants matching no region are omitted.
    """
    not_found = object()  # sentinel so that any region type, even None, is kept
    hits = []
    for record in variants:
        first_type = next(
            (
                kind
                for lower, upper, kind in regulatory_regions
                if lower <= record['position'] <= upper
            ),
            not_found,
        )
        if first_type is not not_found:
            hits.append({**record, 'regulatory_type': first_type})
    return hits



In [None]:
# 4. Calculate SIFT score (simulated)
def calculate_sift_score(reference_aa, alternate_aa):
    """Return a simulated SIFT-like score for an amino-acid substitution.

    This is a placeholder, not a real SIFT computation:
    identical residues score 1.0, a change between two residues found in the
    20-letter alphabet gets a random value drawn from ``random.uniform(0, 1)``,
    and anything else scores 0.0.
    """
    standard_residues = 'ACDEFGHIKLMNPQRSTVWY'

    if reference_aa == alternate_aa:
        return 1.0

    both_standard = (
        reference_aa in standard_residues and alternate_aa in standard_residues
    )
    if not both_standard:
        return 0.0

    # Simulated score: no biological model behind this number.
    return random.uniform(0, 1)



In [None]:
# 5. Determine if variant causes frameshift
def is_frameshift(ref_allele, alt_allele):
    """Return True when the allele length difference is not a multiple of three."""
    length_delta = len(ref_allele) - len(alt_allele)
    # The sign of the difference does not matter for divisibility by 3.
    return length_delta % 3 != 0



In [None]:
# 6. Annotate variants with gene names
def annotate_with_gene_names(variants, gene_coords):
    """Attach a ``'gene'`` label to every variant.

    ``gene_coords`` maps a gene name to an inclusive ``(start, end)`` pair.
    Each variant dict is copied with ``'gene'`` set to the first gene (in the
    mapping's iteration order) whose interval contains ``variant['position']``,
    or to ``'Intergenic'`` when no interval matches.
    """
    def first_gene_for(record):
        return next(
            (
                name
                for name, (lower, upper) in gene_coords.items()
                if lower <= record['position'] <= upper
            ),
            'Intergenic',
        )

    return [{**record, 'gene': first_gene_for(record)} for record in variants]



In [None]:
# 7. Predict impact of splice site variants
def predict_splice_impact(variant, exon_starts, exon_ends):
    """Flag a variant that lies next to an exon boundary.

    For each paired exon start/end, the variant is flagged when its position is
    in ``start-2 .. start+1`` or in ``end-1 .. end+2`` (both inclusive).
    Returns one of two fixed message strings.
    """
    disrupted = "Potential splice site disruption"
    unaffected = "No predicted splice site impact"

    for first_base, last_base in zip(exon_starts, exon_ends):
        # Window around the exon start: start-2, start-1, start, start+1.
        if variant['position'] in range(first_base - 2, first_base + 2):
            return disrupted
        # Window around the exon end: end-1, end, end+1, end+2.
        if variant['position'] in range(last_base - 1, last_base + 3):
            return disrupted
    return unaffected



In [None]:
# 8. Identify variants affecting transcription factor binding sites
def identify_tfbs_variants(variants, tfbs_motifs):
    """Find variants that destroy or create a transcription-factor motif.

    Args:
        variants: Iterable of dicts with keys ``'ref_sequence'`` (the local
            reference sequence), ``'position'`` (0-based index into
            ``ref_sequence``) and ``'alt_allele'`` (the replacement for the
            single base at that index).
        tfbs_motifs: Mapping of motif name to the exact motif sequence.

    Returns:
        A list with one entry per (variant, motif) pair whose motif presence
        differs between reference and alternate sequence. Each entry is a copy
        of the variant with either ``'disrupted_tfbs'`` or ``'created_tfbs'``
        set to the motif name.

    Note:
        If ``position`` is beyond the end of ``ref_sequence`` the slicing below
        appends the alt allele instead of substituting a base.
    """
    tfbs_variants = []
    for variant in variants:
        # The alternate sequence depends only on the variant, so build it once
        # per variant rather than once per motif.
        ref_sequence = variant['ref_sequence']
        position = variant['position']
        alt_sequence = (
            ref_sequence[:position] + variant['alt_allele'] + ref_sequence[position + 1:]
        )

        for motif, sequence in tfbs_motifs.items():
            in_ref = sequence in ref_sequence
            in_alt = sequence in alt_sequence
            if in_ref and not in_alt:
                tfbs_variants.append({**variant, 'disrupted_tfbs': motif})
            elif in_alt and not in_ref:
                tfbs_variants.append({**variant, 'created_tfbs': motif})
    return tfbs_variants



In [None]:
# 9. Calculate conservation score (simulated)
def calculate_conservation_score(position, species_sequences):
    """Return the fraction of sequences sharing the most common base at ``position``.

    This is a simplified stand-in for a real conservation score: it takes the
    character at ``position`` from every sequence, counts the most frequent
    one, and divides by the number of sequences.
    """
    column = [sequence[position] for sequence in species_sequences]
    top_count = max(column.count(base) for base in set(column))
    return top_count / len(species_sequences)



In [None]:
# 10. Integrate annotations from multiple sources
def integrate_annotations(variants, gene_impacts, conservation_scores, regulatory_regions):
    """Merge per-position annotations into one record per variant.

    ``gene_impacts`` and ``conservation_scores`` are looked up by the variant's
    position and default to ``'Unknown'``. ``regulatory_regions`` holds
    ``(start, end, type)`` triples with inclusive bounds; the first region
    containing the position supplies ``'regulatory_region'``, which otherwise
    stays the string ``'None'``.
    """
    merged = []
    for record in variants:
        entry = dict(
            position=record['position'],
            ref_allele=record['ref_allele'],
            alt_allele=record['alt_allele'],
            gene_impact=gene_impacts.get(record['position'], 'Unknown'),
            conservation_score=conservation_scores.get(record['position'], 'Unknown'),
            regulatory_region='None',
        )
        # First matching region wins; keep the 'None' placeholder otherwise.
        entry['regulatory_region'] = next(
            (
                kind
                for lower, upper, kind in regulatory_regions
                if lower <= record['position'] <= upper
            ),
            'None',
        )
        merged.append(entry)
    return merged


In [None]:
# 11. Implement a machine learning model (e.g., Random Forest) to predict variant pathogenicity using features like conservation scores, allele frequencies, and protein impact scores. Use scikit-learn and evaluate the model's performance.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

def train_pathogenicity_predictor(data_file):
    """Train and evaluate a random-forest pathogenicity classifier.

    Args:
        data_file: Path to a CSV containing the feature columns
            ``conservation_score``, ``allele_frequency`` and
            ``protein_impact_score`` plus the target column ``pathogenicity``.

    Returns:
        ``(clf, scaler)``: the fitted ``RandomForestClassifier`` and the
        ``StandardScaler`` that must be applied to new data before predicting.

    Side effects:
        Prints the classification report and confusion matrix for the
        held-out 20% test split.
    """
    # Load data
    data = pd.read_csv(data_file)

    # Prepare features and target
    X = data[['conservation_score', 'allele_frequency', 'protein_impact_score']]
    y = data['pathogenicity']

    # Split BEFORE scaling so the test rows never influence the scaler's
    # mean/variance (fitting on the full dataset leaks test information).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train_scaled, y_train)

    # Evaluate model on the held-out split
    y_pred = clf.predict(X_test_scaled)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    return clf, scaler

# Example usage
# NOTE: this cell reads "variant_data.csv" relative to the working directory and
# trains a model as soon as it runs.
data_file = "variant_data.csv"
model, scaler = train_pathogenicity_predictor(data_file)

# Predict pathogenicity for new variants
# The three columns below are the feature columns the training function selects.
new_variants = pd.DataFrame({
    'conservation_score': [0.8, 0.3, 0.9],
    'allele_frequency': [0.001, 0.1, 0.0001],
    'protein_impact_score': [0.7, 0.2, 0.95]
})

# Apply the scaler returned by training before calling the model.
new_variants_scaled = scaler.transform(new_variants)
predictions = model.predict(new_variants_scaled)

print("Predictions for new variants:")
print(predictions)

In [None]:
# 12. Create a function that integrates data from multiple omics layers (e.g., genomics, transcriptomics, epigenomics) to provide a comprehensive annotation for a given variant. Use pandas for data manipulation and numpy for numerical operations.
import pandas as pd

def integrate_omics_data(variant, genomic_data, transcriptomic_data, epigenomic_data):
    """Collect genomic, transcriptomic and epigenomic annotations for a variant.

    Args:
        variant: Variant key of the form ``"<chrom>:<pos>"``, e.g. ``"chr1:100"``.
        genomic_data: Mapping of variant key to a record with
            ``'allele_frequency'``, ``'conservation_score'`` and ``'gene'``.
        transcriptomic_data: Mapping of gene name to a record with
            ``'expression_level'`` and ``'splicing_impact'``.
        epigenomic_data: Iterable of region dicts with ``'chrom'``, ``'start'``,
            ``'end'`` (inclusive bounds), ``'chromatin_state'`` and
            ``'methylation_level'``.

    Returns:
        A dict with the keys ``'variant'``, ``'genomic_data'``,
        ``'transcriptomic_data'`` and ``'epigenomic_data'``. A layer with no
        matching data is left as an empty dict.

    Raises:
        ValueError: If ``variant`` is not ``"<chrom>:<pos>"`` with an integer
            position.
    """
    integrated_annotation = {
        'variant': variant,
        'genomic_data': {},
        'transcriptomic_data': {},
        'epigenomic_data': {}
    }

    # Genomic data integration
    gene = None
    if variant in genomic_data:
        genomic_record = genomic_data[variant]
        integrated_annotation['genomic_data'] = {
            'allele_frequency': genomic_record['allele_frequency'],
            'conservation_score': genomic_record['conservation_score']
        }
        # The gene is only known through the genomic record; looking it up
        # unconditionally raised KeyError for variants without genomic data.
        gene = genomic_record.get('gene')

    # Transcriptomic data integration (skipped when no gene is known)
    if gene is not None and gene in transcriptomic_data:
        integrated_annotation['transcriptomic_data'] = {
            'expression_level': transcriptomic_data[gene]['expression_level'],
            'splicing_impact': transcriptomic_data[gene]['splicing_impact']
        }

    # Epigenomic data integration: first region containing the position wins
    chrom, pos = variant.split(':')
    pos = int(pos)
    for region in epigenomic_data:
        if region['chrom'] == chrom and region['start'] <= pos <= region['end']:
            integrated_annotation['epigenomic_data'] = {
                'chromatin_state': region['chromatin_state'],
                'methylation_level': region['methylation_level']
            }
            break

    return integrated_annotation

# Example usage
# Genomic layer: keyed by "<chrom>:<pos>" variant strings.
genomic_data = {
    'chr1:100': {'allele_frequency': 0.01, 'conservation_score': 0.9, 'gene': 'GENE1'},
    'chr2:200': {'allele_frequency': 0.05, 'conservation_score': 0.5, 'gene': 'GENE2'}
}

# Transcriptomic layer: keyed by the gene names used in genomic_data.
transcriptomic_data = {
    'GENE1': {'expression_level': 100, 'splicing_impact': 'high'},
    'GENE2': {'expression_level': 50, 'splicing_impact': 'low'}
}

# Epigenomic layer: list of regions matched by chromosome and inclusive start/end.
epigenomic_data = [
    {'chrom': 'chr1', 'start': 1, 'end': 1000, 'chromatin_state': 'active', 'methylation_level': 0.2},
    {'chrom': 'chr2', 'start': 1, 'end': 1000, 'chromatin_state': 'repressed', 'methylation_level': 0.8}
]

variant = 'chr1:100'
integrated_annotation = integrate_omics_data(variant, genomic_data, transcriptomic_data, epigenomic_data)
# json_normalize flattens the nested annotation dict into a one-row DataFrame.
print(pd.json_normalize(integrated_annotation))

In [None]:
# 13. Develop a visualization function that creates a lollipop plot showing the distribution and impact scores of variants along a gene sequence. Use matplotlib for plotting.
import matplotlib.pyplot as plt

def plot_variant_lollipop(gene_length, variants, impact_scores,
                          output_file='variant_lollipop_plot.png'):
    """Save a lollipop plot of variant impact scores along a gene.

    Args:
        gene_length: Length of the gene; sets the x-axis range ``0..gene_length``.
        variants: Variant positions (x values), one per lollipop.
        impact_scores: Impact score (y value) for each entry in ``variants``.
        output_file: Path the figure is written to. Defaults to
            ``'variant_lollipop_plot.png'`` in the working directory.

    Side effects:
        Writes the image file and prints a confirmation message. The figure is
        closed afterwards, so nothing is displayed inline.
    """
    fig = plt.figure(figsize=(12, 6))
    try:
        # Plot gene body as a horizontal baseline
        plt.plot([0, gene_length], [0, 0], color='gray', linewidth=2)

        # Plot variants as lollipops (stems with circular heads, no baseline)
        plt.stem(variants, impact_scores, linefmt='C0-', markerfmt='C0o', basefmt=' ')

        plt.title('Variant Distribution and Impact Scores Along Gene Sequence')
        plt.xlabel('Position in Gene Sequence')
        plt.ylabel('Variant Impact Score')
        plt.ylim(bottom=0)
        plt.xlim(0, gene_length)

        plt.tight_layout()
        plt.savefig(output_file)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    print(f"Variant lollipop plot saved as '{output_file}'")

# Example usage
# variants and impact_scores are parallel lists: one score per position.
gene_length = 1000
variants = [100, 250, 400, 600, 750, 900]
impact_scores = [0.2, 0.8, 0.5, 0.9, 0.3, 0.7]

# Writes 'variant_lollipop_plot.png' to the working directory.
plot_variant_lollipop(gene_length, variants, impact_scores)

In [None]:
# 14. Implement a deep learning model using keras or pytorch to predict variant effects directly from DNA sequences. Include both convolutional and recurrent layers in your architecture.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

def encode_sequence(seq):
    """One-hot encode a DNA sequence.

    Args:
        seq: Sequence of the bases A, C, G, T. Lowercase input is accepted.

    Returns:
        An integer array of shape ``(len(seq), 4)`` with columns ordered
        A, C, G, T. An empty sequence yields shape ``(0, 4)``.

    Raises:
        KeyError: If ``seq`` contains a character other than A, C, G or T
            (for example ``N``).
    """
    encoding = {'A': [1,0,0,0], 'C': [0,1,0,0], 'G': [0,0,1,0], 'T': [0,0,0,1]}
    rows = [encoding[base] for base in seq.upper()]
    # The explicit dtype and reshape keep the (n, 4) integer layout for n == 0.
    return np.array(rows, dtype=int).reshape(-1, 4)

def create_dataset(sequences, labels):
    """Build model inputs and targets from raw sequences and integer labels.

    Every sequence is passed through ``encode_sequence`` and the results are
    stacked into one array; ``labels`` is converted with ``to_categorical``.
    Returns the pair ``(X, y)``.
    """
    encoded = [encode_sequence(sequence) for sequence in sequences]
    features = np.array(encoded)
    targets = to_categorical(labels)
    return features, targets

def build_model(seq_length):
    """Build and compile a convolutional + recurrent sequence classifier.

    Two Conv1D/MaxPooling1D stages extract local motifs; an LSTM then
    summarises the pooled feature sequence before the dense classifier head.

    Args:
        seq_length: Number of bases per input sequence. Inputs are expected as
            one-hot arrays of shape ``(seq_length, 4)``. Pass ``None`` for
            variable-length input.

    Returns:
        A compiled Keras ``Sequential`` model with a 3-way softmax output.

    Raises:
        ValueError: If ``seq_length`` is too short to survive the two
            convolution/pooling stages (fewer than 10 bases).
    """
    if seq_length is not None:
        # Each Conv1D (kernel 3, no padding) removes 2 steps; each pool halves.
        pooled_length = ((seq_length - 2) // 2 - 2) // 2
        if pooled_length < 1:
            raise ValueError(
                f"seq_length={seq_length} is too short for this architecture; "
                "at least 10 bases are required"
            )

    # Local import: LSTM is not part of this cell's module-level imports.
    from tensorflow.keras.layers import LSTM

    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=(seq_length, 4)),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        LSTM(64),  # recurrent layer over the pooled feature sequence
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')  # 3 classes: benign, pathogenic, VUS
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Example usage
# Toy data for illustration only; a real run needs many more labelled sequences.
sequences = [
    'ATGCATGCATGC',
    'GCTAGCTAGCTA',
    'CGATCGATCGAT',
    # ... more sequences ...
]

# Exactly one integer label per sequence. 0: benign, 1: pathogenic, 2: VUS
# (A literal `...` inside the list is an Ellipsis object and breaks to_categorical.)
labels = [0, 1, 2]

X, y = create_dataset(sequences, labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take the sequence length from the encoded data so it cannot drift from the inputs.
model = build_model(seq_length=X.shape[1])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy:.4f}")

# Predict effect of new variants
new_sequences = [
    'ATGCATGCATGC',
    'GCTAGCTAGCTA',
]
new_X = np.array([encode_sequence(seq) for seq in new_sequences])
predictions = model.predict(new_X)
print("Predictions for new sequences:")
print(predictions)

In [None]:
# 15. Create a function that performs dimensionality reduction (e.g., PCA or t-SNE) on multi-omics data and visualizes the results, coloring points by variant pathogenicity. Use scikit-learn for dimensionality reduction and seaborn for visualization.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def visualize_multi_omics_data(multi_omics_data, pathogenicity, method='pca'):
    """Project multi-omics features to 2-D and save a scatter plot by class.

    Args:
        multi_omics_data: Mapping of layer name to a 2-D array. All arrays must
            have the same number of rows; they are concatenated column-wise in
            the mapping's iteration order.
        pathogenicity: One label per row, used to colour the points.
        method: ``'pca'`` or ``'tsne'``.

    Raises:
        ValueError: If ``method`` is neither ``'pca'`` nor ``'tsne'``.

    Side effects:
        Writes ``multi_omics_visualization_<method>.png`` to the working
        directory and prints a confirmation message.
    """
    # Reject unknown methods before doing any numerical work.
    if method not in ('pca', 'tsne'):
        raise ValueError("Unsupported method. Choose 'pca' or 'tsne'.")

    # Combine multi-omics data
    combined_data = np.hstack(list(multi_omics_data.values()))

    # Perform dimensionality reduction
    if method == 'pca':
        reducer = PCA(n_components=2)
    else:
        # t-SNE requires perplexity < n_samples; the default of 30 fails on
        # small datasets, so cap it while leaving larger datasets unchanged.
        n_samples = combined_data.shape[0]
        perplexity = min(30.0, max(n_samples - 1, 1))
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)

    reduced_data = reducer.fit_transform(combined_data)

    # Create a dataframe for plotting
    plot_data = pd.DataFrame({
        'x': reduced_data[:, 0],
        'y': reduced_data[:, 1],
        'pathogenicity': pathogenicity
    })

    # Plot the results, one scatter series per pathogenicity class
    fig = plt.figure(figsize=(10, 8))
    try:
        for category in plot_data['pathogenicity'].unique():
            subset = plot_data[plot_data['pathogenicity'] == category]
            plt.scatter(subset['x'], subset['y'], label=category, alpha=0.7)

        plt.title(f'Multi-omics Data Visualization using {method.upper()}')
        plt.xlabel('Component 1')
        plt.ylabel('Component 2')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'multi_omics_visualization_{method}.png')
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    print(f"Multi-omics visualization saved as 'multi_omics_visualization_{method}.png'")

# Example usage
# Seed NumPy's global RNG so the simulated data below is the same on every run.
np.random.seed(42)

# Simulate multi-omics data
# Each layer is an (n_samples, n_features) array of uniform random values.
n_samples = 1000
multi_omics_data = {
    'genomic': np.random.rand(n_samples, 10),
    'transcriptomic': np.random.rand(n_samples, 15),
    'epigenomic': np.random.rand(n_samples, 8)
}

# Simulate pathogenicity labels
# Labels are drawn at random, independently of the features.
pathogenicity = np.random.choice(['benign', 'pathogenic', 'VUS'], n_samples)

# Visualize using PCA
visualize_multi_omics_data(multi_omics_data, pathogenicity, method='pca')

# Visualize using t-SNE
visualize_multi_omics_data(multi_omics_data, pathogenicity, method='tsne')

In [None]:
# Example usage:
# aa_change = predict_aa_change("ATGGCCTGA", 3, "C")
# variant_type = classify_variant(1000, [500, 2000], [1500, 2500])
# reg_variants = identify_regulatory_variants([{'position': 100}, {'position': 200}], [(50, 150, 'promoter'), (180, 220, 'enhancer')])
# sift_score = calculate_sift_score('A', 'V')
# frameshift = is_frameshift('AT', 'A')
# annotated_vars = annotate_with_gene_names([{'position': 1000}], {'Gene1': (500, 1500), 'Gene2': (2000, 2500)})
# splice_impact = predict_splice_impact({'position': 1001}, [1000, 2000], [1500, 2500])
# tfbs_vars = identify_tfbs_variants([{'position': 4, 'ref_sequence': 'ATCGATCG', 'alt_allele': 'G'}], {'TF1': 'GATC'})
# cons_score = calculate_conservation_score(5, ['ATCGATCG', 'ATCTATCG', 'ATCGATCG'])
# integrated_annot = integrate_annotations([{'position': 100, 'ref_allele': 'A', 'alt_allele': 'G'}], {100: 'Missense'}, {100: 0.9}, [(50, 150, 'promoter')])
