In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from goatools import obo_parser, go_enrichment
from scipy.stats import pearsonr
import networkx as nx
from scipy.spatial.distance import pdist, squareform


In [None]:
# 1. Normalize RNA-seq data using TPM
def calculate_tpm(counts, lengths):
    """Length-normalise read counts and rescale them to a per-million basis.

    Every count is divided by its matching length; the resulting rates are
    then divided by ``np.sum`` of those rates and multiplied by one million.

    Parameters
    ----------
    counts : array-like
        Raw counts. Anything that supports element-wise division by
        ``lengths`` works (the example usage passes one pandas column).
    lengths : array-like
        Lengths matching ``counts`` element-wise.

    Returns
    -------
    The same kind of object that ``counts / lengths`` produces, rescaled.
    """
    length_normalised = counts / lengths
    rate_total = np.sum(length_normalised)
    return length_normalised / rate_total * 1e6


In [None]:
# 2. Identify differentially expressed genes
def identify_de_genes(condition1, condition2, alpha=0.05):
    """Find genes whose expression differs between two conditions.

    An independent two-sample t-test (``scipy.stats.ttest_ind`` with its
    default settings) is run for every gene, and genes whose raw p-value is
    below ``alpha`` are reported. No multiple-testing correction is applied,
    so with many genes a share of the reported hits will be false positives.

    The test is evaluated for all genes in a single vectorised call rather
    than once per gene, which avoids a Python-level loop over the index.

    Parameters
    ----------
    condition1, condition2 : pandas.DataFrame
        Expression tables with one row per gene and one column per sample.
        Rows of ``condition2`` are matched to ``condition1`` by index label.
        The two tables may have different numbers of columns.
    alpha : float, default 0.05
        Significance threshold applied to the raw p-values.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene``, ``t_statistic`` and ``p_value``, one row per
        significant gene, in the order the genes appear in ``condition1``.

    Raises
    ------
    KeyError
        If a gene label of ``condition1`` is absent from ``condition2``.
    """
    result_columns = ['Gene', 't_statistic', 'p_value']
    genes = condition1.index
    if len(genes) == 0:
        # Nothing to test; keep the output schema stable for callers.
        return pd.DataFrame([], columns=result_columns)

    # Label-based lookup keeps the per-gene pairing of the two tables even if
    # their rows are ordered differently.
    matched = condition2.loc[genes]

    # axis=1 -> one test per row (gene) across that row's samples.
    t_stats, p_values = stats.ttest_ind(
        condition1.to_numpy(), matched.to_numpy(), axis=1
    )

    # NaN p-values compare as False and are therefore never reported.
    significant = p_values < alpha
    return pd.DataFrame({
        'Gene': genes[significant].to_numpy(),
        't_statistic': t_stats[significant],
        'p_value': p_values[significant],
    }, columns=result_columns)


In [None]:
# 3. Perform GO enrichment analysis
def go_enrichment_analysis(gene_list, go_annotations, go_obo, alpha=0.05):
    """Run a GO term enrichment study with goatools.

    Parameters
    ----------
    gene_list : iterable
        Study genes (e.g. differentially expressed genes). Duplicates are
        collapsed.
    go_annotations : dict
        Mapping of gene identifier to an iterable of GO term ids. Its keys
        define the background population. The mapping is copied, so the
        caller's object is left untouched.
    go_obo : str
        Path to the GO ontology file that is handed to ``GODag``.
    alpha : float, default 0.05
        Significance level forwarded to ``GOEnrichmentStudy``.

    Returns
    -------
    Whatever ``GOEnrichmentStudy.run_study`` returns for the study genes.
    """
    go_dag = obo_parser.GODag(go_obo)

    # Hand goatools a private gene -> set-of-GO-ids mapping.
    # NOTE(review): with propagate_counts=True goatools appears to add parent
    # terms to the association sets in place (and to expect set values, not
    # lists) -- confirm against the installed goatools version. Copying into
    # sets is harmless either way and keeps the caller's dict unmodified.
    associations = {gene: set(terms) for gene, terms in go_annotations.items()}

    study_genes = set(gene_list)
    population_genes = set(associations)
    enrichment = go_enrichment.GOEnrichmentStudy(
        population_genes,
        associations,
        go_dag,
        propagate_counts=True,
        alpha=alpha,
    )
    return enrichment.run_study(study_genes)


In [None]:
# 4. Cluster gene expression data using k-means
def cluster_gene_expression(expression_data, n_clusters=5):
    """Assign every row of ``expression_data`` to a k-means cluster.

    Parameters
    ----------
    expression_data : pandas.DataFrame
        Table whose rows are the items to cluster (genes in the example
        usage) and whose columns are the features.
    n_clusters : int, default 5
        Number of clusters requested from ``KMeans``.

    Returns
    -------
    pandas.Series
        Cluster label per row, named ``'Cluster'`` and sharing the index of
        ``expression_data``. ``random_state`` is fixed to 42.
    """
    model = KMeans(random_state=42, n_clusters=n_clusters)
    labels = model.fit_predict(expression_data)
    cluster_series = pd.Series(labels, name='Cluster', index=expression_data.index)
    return cluster_series


In [None]:
# 5. Visualize gene expression as heatmap
def plot_expression_heatmap(expression_data):
    """Render ``expression_data`` as a seaborn heatmap and display it.

    A new 12x8 inch figure is created, the table is drawn with the
    ``viridis`` colormap centred on 0, the title and axis labels are set,
    and the figure is shown. Nothing is returned.

    Parameters
    ----------
    expression_data : pandas.DataFrame or 2-D array-like
        Values to plot; passed straight to ``seaborn.heatmap``.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(expression_data, cmap='viridis', center=0, ax=ax)
    ax.set_title('Gene Expression Heatmap')
    ax.set_xlabel('Samples')
    ax.set_ylabel('Genes')
    plt.show()


In [None]:
# 6. Perform PCA on gene expression data
def perform_pca(expression_data, n_components=2):
    """Project the columns of ``expression_data`` onto principal components.

    The table is transposed before fitting, so each column of the input
    (a sample in the example usage) becomes one observation.

    Parameters
    ----------
    expression_data : pandas.DataFrame
        Table to decompose; its column labels become the result's index.
    n_components : int, float, str or None, default 2
        Forwarded unchanged to ``sklearn.decomposition.PCA``. Because the
        output columns are named from the fitted result rather than from
        this argument, non-integer settings accepted by PCA (a variance
        fraction, ``'mle'`` or ``None``) work as well.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with columns ``PC1``, ``PC2``, ...
    """
    pca = PCA(n_components=n_components)
    sample_scores = pca.fit_transform(expression_data.T)
    # Derive the labels from the actual number of components produced;
    # range(n_components) would fail for float / 'mle' / None.
    component_names = [f'PC{i + 1}' for i in range(sample_scores.shape[1])]
    return pd.DataFrame(
        sample_scores,
        columns=component_names,
        index=expression_data.columns,
    )


In [None]:
# 7. Calculate correlation between gene expression and phenotype
def gene_phenotype_correlation(expression_data, phenotype):
    """Correlate each gene's expression profile with a phenotype vector.

    ``scipy.stats.pearsonr`` is evaluated once per row of
    ``expression_data`` against ``phenotype``.

    Parameters
    ----------
    expression_data : pandas.DataFrame
        One row per gene, one column per sample.
    phenotype : sequence of numbers
        One value per sample.
        NOTE(review): the row and ``phenotype`` are handed to ``pearsonr``
        as-is, which presumably pairs them by position rather than by
        label -- make sure the sample order matches the columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by gene with columns ``correlation`` and ``p_value``. The
        two columns are present even when ``expression_data`` has no rows,
        so downstream column access does not fail on empty input.
    """
    result_columns = ['correlation', 'p_value']
    correlations = {}
    for gene in expression_data.index:
        corr, p_value = pearsonr(expression_data.loc[gene], phenotype)
        correlations[gene] = {'correlation': corr, 'p_value': p_value}
    # Passing ``columns`` pins the schema; without it an empty dict yields a
    # frame with no columns at all.
    return pd.DataFrame.from_dict(correlations, orient='index', columns=result_columns)


In [None]:
# 8. Identify co-expressed gene modules (simplified WGCNA)
def identify_coexpression_modules(expression_data, threshold=0.8):
    """Group genes into modules of strongly correlated expression.

    The row-by-row correlation matrix is computed, thresholded on absolute
    value into a 0/1 adjacency matrix, turned into a networkx graph, and the
    connected components of that graph are returned as modules.

    Parameters
    ----------
    expression_data : pandas.DataFrame
        One row per gene, one column per sample.
    threshold : float, default 0.8
        Two genes are linked when the absolute value of their correlation
        is strictly greater than this value.

    Returns
    -------
    list of set
        One set of row labels per connected component.
    """
    gene_by_gene_corr = expression_data.T.corr()
    strong_links = gene_by_gene_corr.abs() > threshold
    graph = nx.from_pandas_adjacency(strong_links.astype(int))
    return list(nx.connected_components(graph))


In [None]:
# 9. Perform pathway analysis
def pathway_analysis(de_genes, pathway_database):
    """Score pathways by the fraction of their genes found in ``de_genes``.

    Parameters
    ----------
    de_genes : iterable
        Identifiers of the genes of interest. Duplicates are ignored.
    pathway_database : dict
        Mapping of pathway name to an iterable of member gene identifiers.

    Returns
    -------
    pandas.DataFrame
        One row per pathway that shares at least one gene with
        ``de_genes``, with columns ``Pathway``, ``Overlapping_Genes`` (a
        list, in the order the genes are listed in the pathway) and
        ``Enrichment_Score`` (overlap size divided by the number of
        distinct genes in the pathway).
    """
    # Built once: the study set does not change between pathways.
    de_gene_set = set(de_genes)

    enriched_pathways = []
    for pathway, genes in pathway_database.items():
        # De-duplicate while keeping the listed order, so repeated entries
        # neither inflate the denominator nor make the output order depend
        # on set iteration order.
        members = list(dict.fromkeys(genes))
        overlap = [gene for gene in members if gene in de_gene_set]
        if overlap:
            enrichment_score = len(overlap) / len(members)
            enriched_pathways.append((pathway, overlap, enrichment_score))
    return pd.DataFrame(
        enriched_pathways,
        columns=['Pathway', 'Overlapping_Genes', 'Enrichment_Score'],
    )


In [None]:
# 10. Simulate RNA-seq count data
def simulate_rnaseq_data(n_genes=1000, n_samples=10, n_de_genes=100, fold_change=2, seed=None):
    """Simulate an RNA-seq count table with a subset of up-shifted genes.

    Counts are drawn from ``negative_binomial(10, 0.5)``. A random subset of
    ``n_de_genes`` rows is then multiplied by ``fold_change`` in the second
    half of the samples (columns ``n_samples // 2`` onwards).

    Parameters
    ----------
    n_genes : int, default 1000
        Number of rows.
    n_samples : int, default 10
        Number of columns.
    n_de_genes : int, default 100
        How many rows receive the fold change; chosen without replacement.
    fold_change : int or float, default 2
        Multiplier for the selected rows. Non-integer values are supported;
        the scaled counts are rounded to the nearest integer so the table
        keeps its integer dtype.
    seed : int or None, default None
        When given, a private ``numpy.random.RandomState(seed)`` is used so
        the result is reproducible. When ``None``, numpy's global random
        state is used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Integer counts with columns ``Sample_1`` ... ``Sample_<n_samples>``
        and a default integer index.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    counts = rng.negative_binomial(10, 0.5, size=(n_genes, n_samples))
    de_genes = rng.choice(n_genes, n_de_genes, replace=False)

    second_half = slice(n_samples // 2, None)
    # An in-place ``*=`` on the integer array raises for a float multiplier,
    # so scale out of place and cast back after rounding.
    scaled = counts[de_genes, second_half] * fold_change
    counts[de_genes, second_half] = np.rint(scaled).astype(counts.dtype)

    sample_names = [f'Sample_{i+1}' for i in range(n_samples)]
    return pd.DataFrame(counts, columns=sample_names)


In [None]:
# Example usage:
# gene_lengths = pd.Series(np.random.randint(500, 5000, 1000))
# raw_counts = pd.DataFrame(np.random.randint(0, 1000, size=(1000, 10)))
# tpm_normalized = raw_counts.apply(lambda x: calculate_tpm(x, gene_lengths))

# condition1 = pd.DataFrame(np.random.normal(10, 2, size=(1000, 5)))
# condition2 = pd.DataFrame(np.random.normal(10, 2, size=(1000, 5)))
# de_genes = identify_de_genes(condition1, condition2)

# gene_list = ['GENE1', 'GENE2', 'GENE3']
# go_annotations = {'GENE1': ['GO:0006915'], 'GENE2': ['GO:0008283'], 'GENE3': ['GO:0007049']}
# go_obo = 'path/to/go.obo'
# enrichment_results = go_enrichment_analysis(gene_list, go_annotations, go_obo)

# expression_data = pd.DataFrame(np.random.normal(10, 2, size=(1000, 20)))
# clusters = cluster_gene_expression(expression_data)

# plot_expression_heatmap(expression_data.iloc[:50])

# pca_results = perform_pca(expression_data)

# phenotype = pd.Series(np.random.normal(50, 10, 20))
# correlations = gene_phenotype_correlation(expression_data, phenotype)

# coexpression_modules = identify_coexpression_modules(expression_data)

# de_genes = ['GENE1', 'GENE2', 'GENE3']
# pathway_db = {'Pathway1': ['GENE1', 'GENE4'], 'Pathway2': ['GENE2', 'GENE3', 'GENE5']}
# pathway_results = pathway_analysis(de_genes, pathway_db)

# simulated_data = simulate_rnaseq_data()