In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from goatools import obo_parser, go_enrichment
from scipy.stats import pearsonr
import networkx as nx
from scipy.spatial.distance import pdist, squareform


In [None]:
# 1. Normalize RNA-seq data using TPM
def calculate_tpm(counts, lengths):
    """
    Convert raw counts to transcripts per million (TPM).

    Parameters:
    counts: Read counts per feature
    lengths: Feature lengths, matching ``counts`` element-wise

    Returns:
    Length-normalised rates divided by their ``np.sum`` total and scaled by 1e6
    """
    reads_per_base = counts / lengths
    rate_total = np.sum(reads_per_base)
    return reads_per_base / rate_total * 1e6


In [None]:
# 2. Identify differentially expressed genes
def identify_de_genes(condition1, condition2, alpha=0.05):
    """
    Run a two-sample t-test for every gene and keep those with p < alpha.

    Parameters:
    condition1 (pd.DataFrame): Values for the first condition, one row per gene
    condition2 (pd.DataFrame): Values for the second condition, looked up by the same gene labels
    alpha (float): Threshold the raw (uncorrected) p-value must fall below

    Returns:
    pd.DataFrame: Columns 'Gene', 't_statistic' and 'p_value' for the retained genes
    """
    def _test_gene(gene_id):
        # One independent-samples t-test comparing the two conditions for this gene.
        t_value, p_val = stats.ttest_ind(condition1.loc[gene_id], condition2.loc[gene_id])
        return gene_id, t_value, p_val

    all_tests = (_test_gene(gene_id) for gene_id in condition1.index)
    significant = [record for record in all_tests if record[2] < alpha]
    return pd.DataFrame(significant, columns=['Gene', 't_statistic', 'p_value'])


In [None]:
# 3. Perform GO enrichment analysis
def go_enrichment_analysis(gene_list, go_annotations, go_obo):
    """
    Run a GO enrichment study for a gene list with goatools.

    Parameters:
    gene_list (iterable): Study genes to test for enrichment
    go_annotations (dict): Maps each gene to an iterable of GO term IDs; its keys form the background population
    go_obo (str): Path to the GO ontology file in OBO format

    Returns:
    The result records returned by ``GOEnrichmentStudy.run_study``
    """
    go_dag = obo_parser.GODag(go_obo)
    study = set(gene_list)

    # goatools works with a set of GO IDs per gene and, with
    # propagate_counts=True, adds ancestor terms to those sets in place.
    # Building fresh sets lets callers pass lists (as in the example usage)
    # and keeps their dictionary from being modified.
    associations = {gene: set(go_ids) for gene, go_ids in go_annotations.items()}
    population = set(associations)

    enrichment = go_enrichment.GOEnrichmentStudy(
        population, associations, go_dag, propagate_counts=True, alpha=0.05
    )
    return enrichment.run_study(study)


In [None]:
# 4. Cluster gene expression data using k-means
def cluster_gene_expression(expression_data, n_clusters=5):
    """
    Assign every row of the expression table to a k-means cluster.

    Parameters:
    expression_data (pd.DataFrame): Table whose rows are clustered
    n_clusters (int): Number of clusters requested from KMeans

    Returns:
    pd.Series: Cluster label per row, named 'Cluster' and indexed like the input
    """
    # random_state is fixed so repeated runs give the same assignment.
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(expression_data)
    return pd.Series(labels, index=expression_data.index, name='Cluster')


In [None]:
# 5. Visualize gene expression as heatmap
def plot_expression_heatmap(expression_data):
    """
    Draw the expression table as a heatmap and display it.

    Parameters:
    expression_data (pd.DataFrame): Values to plot; rows are labelled 'Genes' and columns 'Samples'
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(expression_data, cmap='viridis', center=0, ax=ax)
    ax.set_title('Gene Expression Heatmap')
    ax.set_xlabel('Samples')
    ax.set_ylabel('Genes')
    plt.show()


In [None]:
# 6. Perform PCA on gene expression data
def perform_pca(expression_data, n_components=2):
    """
    Project samples onto principal components of the expression table.

    Parameters:
    expression_data (pd.DataFrame): Expression table; it is transposed so that its columns become the PCA observations
    n_components: Passed straight to ``sklearn.decomposition.PCA``

    Returns:
    pd.DataFrame: One row per input column, with columns 'PC1', 'PC2', ...
    """
    pca = PCA(n_components=n_components)
    scores = pca.fit_transform(expression_data.T)
    # Label columns from the actual result width: n_components is not always
    # an int (PCA also accepts None, a variance fraction or 'mle'), in which
    # case range(n_components) would fail.
    component_labels = [f'PC{i + 1}' for i in range(scores.shape[1])]
    return pd.DataFrame(scores, columns=component_labels, index=expression_data.columns)


In [None]:
# 7. Calculate correlation between gene expression and phenotype
def gene_phenotype_correlation(expression_data, phenotype):
    """
    Correlate each gene's expression profile with a phenotype vector.

    Parameters:
    expression_data (pd.DataFrame): One row per gene
    phenotype: Sequence of phenotype values, passed positionally to ``pearsonr`` alongside each row

    Returns:
    pd.DataFrame: Indexed by gene, with columns 'correlation' and 'p_value'
    """
    def _correlate(gene_id):
        # Pearson r and its p-value for a single gene.
        r_value, p_val = pearsonr(expression_data.loc[gene_id], phenotype)
        return {'correlation': r_value, 'p_value': p_val}

    per_gene = {gene_id: _correlate(gene_id) for gene_id in expression_data.index}
    return pd.DataFrame.from_dict(per_gene, orient='index')


In [None]:
# 8. Identify co-expressed gene modules (simplified WGCNA)
def identify_coexpression_modules(expression_data, threshold=0.8):
    """
    Group genes into modules of strongly correlated expression.

    Parameters:
    expression_data (pd.DataFrame): One row per gene; rows are correlated against each other
    threshold (float): Absolute correlation must exceed this for two genes to be linked

    Returns:
    list: Sets of gene labels, one per connected component of the thresholded correlation graph
    """
    gene_correlations = expression_data.T.corr()
    strong_links = gene_correlations.abs() > threshold
    graph = nx.from_pandas_adjacency(strong_links.astype(int))
    return [component for component in nx.connected_components(graph)]


In [None]:
# 9. Perform pathway analysis
def pathway_analysis(de_genes, pathway_database):
    """
    Score pathways by the fraction of their genes that are differentially expressed.

    Parameters:
    de_genes: Iterable of gene identifiers, or the table returned by identify_de_genes (its 'Gene' column is used)
    pathway_database (dict): Maps each pathway name to an iterable of member genes

    Returns:
    pd.DataFrame: Columns 'Pathway', 'Overlapping_Genes' (in pathway order) and
    'Enrichment_Score' (overlap size / number of distinct pathway genes);
    pathways without any overlap are omitted
    """
    # Iterating a DataFrame yields its column names, so pull out the gene
    # column when handed the identify_de_genes result directly.
    if isinstance(de_genes, pd.DataFrame) and 'Gene' in de_genes.columns:
        de_genes = de_genes['Gene']
    de_gene_set = set(de_genes)  # built once rather than once per pathway

    enriched_pathways = []
    for pathway, genes in pathway_database.items():
        # Drop repeated members (keeping first-seen order) so duplicates do
        # not deflate the score and the overlap list is deterministic.
        members = list(dict.fromkeys(genes))
        overlap = [gene for gene in members if gene in de_gene_set]
        if overlap:
            enrichment_score = len(overlap) / len(members)
            enriched_pathways.append((pathway, overlap, enrichment_score))
    return pd.DataFrame(enriched_pathways, columns=['Pathway', 'Overlapping_Genes', 'Enrichment_Score'])


In [None]:
# 10. Simulate RNA-seq count data
def simulate_rnaseq_data(n_genes=1000, n_samples=10, n_de_genes=100, fold_change=2, seed=None):
    """
    Simulate a count matrix with a subset of differentially expressed genes.

    Parameters:
    n_genes (int): Number of genes (rows)
    n_samples (int): Number of samples (columns)
    n_de_genes (int): Number of genes whose counts are scaled in the second half of the samples
    fold_change (float): Multiplier applied to those genes; scaled counts are rounded to integers
    seed (int or None): Seed for a private random generator; None uses NumPy's global random state

    Returns:
    pd.DataFrame: Integer counts with columns 'Sample_1' ... 'Sample_<n_samples>'
    """
    # With no seed the draws come from the global NumPy state, so code that
    # calls np.random.seed() beforehand keeps getting the same data.
    rng = np.random if seed is None else np.random.RandomState(seed)

    counts = rng.negative_binomial(10, 0.5, size=(n_genes, n_samples))
    de_genes = rng.choice(n_genes, n_de_genes, replace=False)

    second_group = slice(n_samples // 2, None)
    # An in-place `*=` on the integer matrix raises for a fractional
    # fold_change, so scale out of place, then round back to the count dtype.
    scaled = counts[de_genes, second_group] * fold_change
    counts[de_genes, second_group] = np.round(scaled).astype(counts.dtype)

    return pd.DataFrame(counts, columns=[f'Sample_{i+1}' for i in range(n_samples)])


In [None]:
# 11. Implement a function to perform basic quality control and preprocessing of scRNA-seq data, including cell and gene filtering based on expression thresholds.
import numpy as np
import pandas as pd
import scanpy as sc

def preprocess_scrna_seq(data, min_genes=200, min_cells=3, max_percent_mt=5, mt_prefix='MT-'):
    """
    Perform basic quality control and preprocessing of scRNA-seq data.

    Parameters:
    data (AnnData): The input scRNA-seq data (left unmodified; a copy is processed)
    min_genes (int): Minimum number of genes expressed required for a cell to pass filtering (inclusive)
    min_cells (int): Minimum number of cells in which a gene must be expressed to pass filtering
    max_percent_mt (float): Cells are kept only when their mitochondrial count percentage is strictly below this value
    mt_prefix (str): Gene-name prefix that marks mitochondrial genes (e.g. 'MT-' for human, 'mt-' for mouse)

    Returns:
    AnnData: Preprocessed scRNA-seq data (filtered, normalised to 1e4 counts per cell, log1p-transformed)
    """

    # Work on a copy so the caller's object is untouched
    adata = data.copy()

    # Calculate quality metrics; this adds obs['n_genes_by_counts'].
    # (calculate_qc_metrics has no `n_genes_by_counts` argument, so passing
    # one raises a TypeError.)
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

    # Filter cells. `>=` makes min_genes a true minimum, as documented.
    # Copy the slice so later in-place steps act on real data, not a view.
    adata = adata[adata.obs['n_genes_by_counts'] >= min_genes, :].copy()

    # Filter genes
    sc.pp.filter_genes(adata, min_cells=min_cells)

    # Calculate percentage of mitochondrial counts on the remaining genes
    adata.var['mt'] = adata.var_names.str.startswith(mt_prefix)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # Filter cells based on mitochondrial percentage
    adata = adata[adata.obs['pct_counts_mt'] < max_percent_mt, :].copy()

    # Normalize data
    sc.pp.normalize_total(adata, target_sum=1e4)

    # Logarithmize the data
    sc.pp.log1p(adata)

    return adata

# Example usage
# Expects an AnnData object of raw counts named 'raw_data' from an earlier
# cell. No visible cell defines it, so the example is skipped when it is
# missing rather than stopping "Run All" with a NameError.
if 'raw_data' in globals():
    preprocessed_data = preprocess_scrna_seq(raw_data)
    print(f"Original shape: {raw_data.shape}")
    print(f"Preprocessed shape: {preprocessed_data.shape}")
else:
    print("Skipping preprocessing example: define an AnnData object named 'raw_data' first.")

In [None]:
# 12. Create a script to identify cell types in scRNA-seq data using a combination of clustering and marker gene expression analysis.
import scanpy as sc
import matplotlib.pyplot as plt

def identify_cell_types(adata, n_pcs=50, resolution=0.5, marker_genes=None):
    """
    Identify cell types in scRNA-seq data using clustering and marker gene expression.

    Parameters:
    adata (AnnData): The preprocessed scRNA-seq data (annotated in place)
    n_pcs (int): Number of principal components to use
    resolution (float): Resolution parameter for Leiden clustering
    marker_genes (dict): Dictionary of known marker genes for each cell type;
        markers absent from ``adata.var_names`` are ignored

    Returns:
    AnnData: The same object, with Leiden clusters and, when marker genes
    were usable, an obs['cell_type'] column
    """

    # Perform PCA
    sc.tl.pca(adata, n_comps=n_pcs)

    # Compute neighborhood graph
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=n_pcs)

    # Perform Leiden clustering
    sc.tl.leiden(adata, resolution=resolution)

    # Perform UMAP for visualization
    sc.tl.umap(adata)

    # Find marker genes
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

    # Annotate cell types based on marker genes
    if marker_genes is not None:
        available_genes = set(adata.var_names)
        cell_type_scores = {}
        for cell_type, markers in marker_genes.items():
            # Indexing with a gene that was filtered out raises KeyError,
            # so score each cell type on the markers that are present.
            present = [gene for gene in markers if gene in available_genes]
            if not present:
                continue
            mean_expression = adata[:, present].X.mean(axis=1)
            # A sparse X yields an (n_cells, 1) matrix; flatten to 1-D.
            cell_type_scores[cell_type] = np.asarray(mean_expression).ravel()

        if cell_type_scores:
            # Index by cell name: with a default integer index the
            # assignment below would align on nothing and store only NaN.
            score_table = pd.DataFrame(cell_type_scores, index=adata.obs_names)
            adata.obs['cell_type'] = score_table.idxmax(axis=1)

    # Visualize results; colour by cell type only when that column exists
    umap_colors = ['leiden']
    if 'cell_type' in adata.obs:
        umap_colors.append('cell_type')
    sc.pl.umap(adata, color=umap_colors, save='_cell_types.pdf')

    return adata

# Example usage
marker_genes = {
    'T cells': ['CD3D', 'CD3E', 'CD3G'],
    'B cells': ['CD19', 'MS4A1', 'CD79A'],
    'Monocytes': ['CD14', 'LYZ', 'CST3'],
    'NK cells': ['NCAM1', 'NKG7', 'KLRD1']
}

# 'preprocessed_data' is produced by the preprocessing example; skip this
# example when it is not available instead of raising a NameError.
if 'preprocessed_data' in globals():
    annotated_data = identify_cell_types(preprocessed_data, marker_genes=marker_genes)
    print(annotated_data.obs['cell_type'].value_counts())
else:
    print("Skipping cell type example: 'preprocessed_data' is not defined.")

In [None]:
# 13. Develop a program to detect and quantify alternative splicing events from RNA-seq data, focusing on exon skipping events.
import pysam
import numpy as np
from collections import defaultdict

def _load_gtf_exons(gtf_file):
    """Collect each gene's distinct exon intervals from a GTF file, sorted by position.

    Returns {chrom: {gene_id: [(start, end), ...]}} in GTF coordinates
    (1-based, end inclusive).
    """
    exon_sets = defaultdict(lambda: defaultdict(set))
    with open(gtf_file, 'r') as gtf:
        for line in gtf:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Skip blank/short lines and non-exon features
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if 'gene_id "' not in attributes:
                continue
            gene_id = attributes.split('gene_id "')[1].split('"')[0]
            # A set removes exons repeated across transcripts of one gene
            exon_sets[fields[0]][gene_id].add((int(fields[3]), int(fields[4])))

    return {
        chrom: {gene: sorted(intervals) for gene, intervals in genes.items()}
        for chrom, genes in exon_sets.items()
    }


def _count_splice_junctions(bam, chrom, start, end):
    """Count splice junctions in reads overlapping [start, end) (0-based, half-open).

    Returns {(intron_start, intron_end): n_reads}, with 0-based half-open
    intron coordinates derived from 'N' operations in each read's CIGAR.
    """
    junctions = defaultdict(int)
    for read in bam.fetch(chrom, start, end):
        if read.is_unmapped or read.cigartuples is None:
            continue
        position = read.reference_start
        for operation, length in read.cigartuples:
            if operation == 3:  # N: reference skip, i.e. a spliced-out intron
                junctions[(position, position + length)] += 1
                position += length
            elif operation in (0, 2, 7, 8):  # M, D, =, X also consume reference
                position += length
    return junctions


def detect_exon_skipping(bam_file, gtf_file):
    """
    Detect and quantify exon skipping events from RNA-seq data.

    Every run of three consecutive, non-adjacent exons of a gene is treated
    as a candidate event for the middle exon. PSI is computed from
    splice-junction reads: inclusion is the mean support of the two
    junctions flanking the middle exon, exclusion is the support of the
    junction joining the outer exons directly.

    Parameters:
    bam_file (str): Path to the indexed BAM file containing aligned RNA-seq reads
    gtf_file (str): Path to the GTF file containing gene annotations

    Returns:
    dict: Maps "<gene>_<chrom>:<start>-<end>" of the middle exon (GTF
    coordinates) to its PSI value; events without junction reads are omitted
    """

    # Load gene annotations
    exons = _load_gtf_exons(gtf_file)

    # Detect exon skipping events
    skipping_events = {}
    # The context manager closes the BAM file even if counting fails
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        bam_contigs = set(bam.references)

        for chrom, genes in exons.items():
            # Annotated contigs missing from the BAM header cannot be queried
            if chrom not in bam_contigs:
                continue
            for gene, gene_exons in genes.items():
                for i in range(1, len(gene_exons) - 1):
                    exon1, skipped_exon, exon2 = gene_exons[i - 1:i + 2]

                    # Need a real intron on each side of the middle exon
                    if not (exon1[1] < skipped_exon[0] - 1 and skipped_exon[1] < exon2[0] - 1):
                        continue

                    # Convert GTF (1-based, inclusive) exon bounds into 0-based
                    # half-open intron intervals: an intron starts at the
                    # upstream exon's end and stops at the next exon's start - 1.
                    upstream = (exon1[1], skipped_exon[0] - 1)
                    downstream = (skipped_exon[1], exon2[0] - 1)
                    skip = (exon1[1], exon2[0] - 1)

                    junctions = _count_splice_junctions(bam, chrom, skip[0], skip[1])
                    # Counting all reads over the exon1-exon2 span would also
                    # count reads inside the middle exon as "exclusion", so
                    # only junction-spanning reads are used.
                    inclusion_count = (junctions.get(upstream, 0) + junctions.get(downstream, 0)) / 2
                    exclusion_count = junctions.get(skip, 0)

                    if inclusion_count + exclusion_count > 0:
                        psi = inclusion_count / (inclusion_count + exclusion_count)
                        skipping_events[f"{gene}_{chrom}:{skipped_exon[0]}-{skipped_exon[1]}"] = psi

    return skipping_events

# Example usage
import os

bam_file = "aligned_reads.bam"
gtf_file = "annotations.gtf"

# The paths above are placeholders; only run the analysis when both files
# exist so the cell does not fail on a machine without them.
exon_skipping_events = {}
if os.path.exists(bam_file) and os.path.exists(gtf_file):
    exon_skipping_events = detect_exon_skipping(bam_file, gtf_file)

    for event, psi in exon_skipping_events.items():
        print(f"Exon skipping event: {event}, PSI: {psi:.2f}")
else:
    print("Skipping exon skipping example: BAM and/or GTF file not found.")

In [None]:
# 14. Write a function to calculate and visualize the Percent Spliced In (PSI) values for a set of alternative splicing events across multiple samples.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_visualize_psi(psi_data, output_file):
    """
    Tabulate PSI values, order events by mean PSI, and save them as a heatmap.

    Parameters:
    psi_data (dict): Maps each sample to a dictionary of event -> PSI value
    output_file (str): Path the heatmap image is written to

    Returns:
    pd.DataFrame: Events (rows) by samples (columns), sorted by descending mean PSI
    """

    psi_table = pd.DataFrame(psi_data)

    # Rank events by their mean PSI across samples; the helper column is
    # only needed for sorting and is dropped again.
    ranked_events = (
        psi_table
        .assign(mean_psi=psi_table.mean(axis=1))
        .sort_values('mean_psi', ascending=False)
        .drop(columns='mean_psi')
    )

    # Draw and save the heatmap, then release the figure
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(ranked_events, cmap='YlOrRd', vmin=0, vmax=1, center=0.5, ax=ax)
    ax.set_title('PSI Values Across Samples')
    ax.set_ylabel('Splicing Events')
    ax.set_xlabel('Samples')
    fig.tight_layout()
    fig.savefig(output_file)
    plt.close(fig)

    return ranked_events

# Example usage: three samples, each with PSI values for the same three events
psi_data = dict(
    Sample1=dict(Event1=0.8, Event2=0.2, Event3=0.5),
    Sample2=dict(Event1=0.7, Event2=0.3, Event3=0.6),
    Sample3=dict(Event1=0.9, Event2=0.1, Event3=0.4),
)

psi_df = calculate_visualize_psi(psi_data, 'psi_heatmap.png')
print(psi_df)

In [None]:
# 15. Implement a basic multi-omics integration workflow that correlates gene expression data with DNA methylation data and visualizes the results.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def integrate_expression_methylation(expression_data, methylation_data, output_file):
    """
    Integrate gene expression and DNA methylation data, calculate correlations, and visualize results.

    CpG sites are assigned to a gene when the gene name occurs in the CpG
    label as a whole token (not embedded in a longer alphanumeric name).

    Parameters:
    expression_data (pd.DataFrame): Gene expression data (genes x samples)
    methylation_data (pd.DataFrame): DNA methylation data (CpG sites x samples)
    output_file (str): Path to save the output scatter plot

    Returns:
    pd.DataFrame: Correlation results indexed by gene, with columns
    'correlation' and 'p_value' (genes without any CpG site are omitted)

    Raises:
    ValueError: If the two tables share fewer than two samples
    """
    import re

    # Ensure both datasets have the same samples, in the expression table's
    # column order (a set intersection would give an arbitrary order)
    methylation_samples = set(methylation_data.columns)
    common_samples = [sample for sample in expression_data.columns if sample in methylation_samples]
    if len(common_samples) < 2:
        raise ValueError("At least two samples shared by both datasets are required")
    expression_data = expression_data[common_samples]
    methylation_data = methylation_data[common_samples]

    cpg_labels = [str(cpg) for cpg in methylation_data.index]

    # Calculate correlation between expression and mean methylation per gene
    correlations = []
    for gene in expression_data.index:
        # Whole-token match: plain substring matching would also assign the
        # CpGs of 'Gene10' to 'Gene1'.
        token = re.compile(r'(?<![A-Za-z0-9])' + re.escape(str(gene)) + r'(?![A-Za-z0-9])')
        cpg_mask = [token.search(label) is not None for label in cpg_labels]
        if not any(cpg_mask):
            continue
        mean_methylation = methylation_data.loc[cpg_mask].mean()
        corr, p_value = stats.pearsonr(expression_data.loc[gene], mean_methylation)
        correlations.append({'gene': gene, 'correlation': corr, 'p_value': p_value})

    # Explicit columns keep set_index working when no gene had CpG sites
    correlation_results = pd.DataFrame(
        correlations, columns=['gene', 'correlation', 'p_value']
    ).set_index('gene')

    # Visualize results. The y-axis column is derived here, on a plotting
    # copy, so the returned table keeps only the documented columns.
    plot_data = correlation_results.copy()
    smallest_positive = np.finfo(float).tiny  # avoids -log10(0) = inf
    plot_data['-log10(p_value)'] = -np.log10(
        plot_data['p_value'].astype(float).clip(lower=smallest_positive)
    )

    plt.figure(figsize=(10, 8))
    # One legend entry per gene would swamp the figure, so the legend is off
    sns.scatterplot(data=plot_data, x='correlation', y='-log10(p_value)',
                    hue=plot_data.index, legend=False)
    plt.title('Gene Expression vs DNA Methylation Correlation')
    plt.xlabel('Pearson Correlation Coefficient')
    plt.ylabel('-log10(p-value)')
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()

    return correlation_results

# Example usage
# A seeded generator makes the synthetic example reproducible across runs.
example_rng = np.random.RandomState(42)

expression_data = pd.DataFrame(example_rng.rand(100, 20),
                               index=[f'Gene{i}' for i in range(100)],
                               columns=[f'Sample{i}' for i in range(20)])

# Five CpG sites per gene, labelled 'CpG_Gene<k>_<j>'
methylation_data = pd.DataFrame(example_rng.rand(500, 20),
                                index=[f'CpG_Gene{i//5}_{i%5}' for i in range(500)],
                                columns=[f'Sample{i}' for i in range(20)])

correlation_results = integrate_expression_methylation(expression_data, methylation_data, 'multi_omics_integration.png')
print(correlation_results.sort_values('correlation', ascending=False).head())

In [None]:
# Example usage:
# gene_lengths = pd.Series(np.random.randint(500, 5000, 1000))
# raw_counts = pd.DataFrame(np.random.randint(0, 1000, size=(1000, 10)))
# tpm_normalized = raw_counts.apply(lambda x: calculate_tpm(x, gene_lengths))

# condition1 = pd.DataFrame(np.random.normal(10, 2, size=(1000, 5)))
# condition2 = pd.DataFrame(np.random.normal(10, 2, size=(1000, 5)))
# de_genes = identify_de_genes(condition1, condition2)

# gene_list = ['GENE1', 'GENE2', 'GENE3']
# go_annotations = {'GENE1': ['GO:0006915'], 'GENE2': ['GO:0008283'], 'GENE3': ['GO:0007049']}
# go_obo = 'path/to/go.obo'
# enrichment_results = go_enrichment_analysis(gene_list, go_annotations, go_obo)

# expression_data = pd.DataFrame(np.random.normal(10, 2, size=(1000, 20)))
# clusters = cluster_gene_expression(expression_data)

# plot_expression_heatmap(expression_data.iloc[:50])

# pca_results = perform_pca(expression_data)

# phenotype = pd.Series(np.random.normal(50, 10, 20))
# correlations = gene_phenotype_correlation(expression_data, phenotype)

# coexpression_modules = identify_coexpression_modules(expression_data)

# de_genes = ['GENE1', 'GENE2', 'GENE3']
# pathway_db = {'Pathway1': ['GENE1', 'GENE4'], 'Pathway2': ['GENE2', 'GENE3', 'GENE5']}
# pathway_results = pathway_analysis(de_genes, pathway_db)

# simulated_data = simulate_rnaseq_data()