## Data Loading

In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
from typing import List, Dict, Tuple
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is passed, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries imported in this cell — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Import custom modules
from new_script.data_reader import GTExDataReader
from new_script.preprocessing import DataPreprocessor
from new_script.network_analysis import NetworkAnalysis
from new_script.mdc_analysis import MDCAnalyzer
from new_script.visualization import NetworkVisualizer

In [2]:
# Create the reader and parse the GCT file. `expression_df` is the table whose
# columns are treated as sample IDs in later cells; `gene_metadata` is the
# second value returned by the reader.
reader = GTExDataReader()
expression_df, gene_metadata = reader.read_gct_file(
    "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct"
)

Found data directory: /Users/edeneldar/CoExpression_reProduction/data
Reading GCT file: data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct
GCT version: #1.2
Dimensions: 56200 genes, 17382 samples
Preparing to read 17382 samples
Processed 1000/56200 genes (1.8%)...
Processed 2000/56200 genes (3.6%)...
Processed 3000/56200 genes (5.3%)...
Processed 4000/56200 genes (7.1%)...
Processed 5000/56200 genes (8.9%)...
Processed 6000/56200 genes (10.7%)...
Processed 7000/56200 genes (12.5%)...
Processed 8000/56200 genes (14.2%)...
Processed 9000/56200 genes (16.0%)...
Processed 10000/56200 genes (17.8%)...
Processed 11000/56200 genes (19.6%)...
Processed 12000/56200 genes (21.4%)...
Processed 13000/56200 genes (23.1%)...
Processed 14000/56200 genes (24.9%)...
Processed 15000/56200 genes (26.7%)...
Processed 16000/56200 genes (28.5%)...
Processed 17000/56200 genes (30.2%)...
Processed 18000/56200 genes (32.0%)...
Processed 19000/56200 genes (33.8%)...
Processed 20000/56200 genes (35.6%).

In [3]:
# Load the three annotation inputs in one step (calls run left to right):
# sample attributes, subject phenotypes and the protein-coding gene set.
sample_attrs, subject_attrs, protein_coding = (
    reader.read_sample_attributes(),
    reader.read_subject_phenotypes(),
    reader.read_protein_coding_genes(),
)



Reading sample attributes: data/GTEx_Analysis_v8_Annotations_SampleAttributesDS.tsv
Loaded sample attributes: (22951, 62)
Reading subject phenotypes: data/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.tsv
Loaded subject phenotypes: (980, 3)
Reading HGNC data: data/hgnc_complete_set.tsv
Found 19293 protein coding genes


In [4]:
# Minimum number of loaded samples a tissue must have to be kept.
MIN_SAMPLES_PER_TISSUE = 10

available_tissues = reader.get_available_tissues(sample_attrs)
print(f"Available tissues: {available_tissues}")

# Sample IDs present in the expression table; a set keeps the membership
# test inside the loop cheap.
loaded_samples = set(expression_df.columns)

tissues_with_samples = []   # tissues that reach the threshold
tissue_sample_counts = {}   # tissue -> number of loaded samples

for tissue in available_tissues:
    all_tissue_samples = reader.get_tissue_samples(sample_attrs, tissue)
    # Keep only the annotated samples that were actually loaded.
    loaded_tissue_samples = [s for s in all_tissue_samples if s in loaded_samples]
    if len(loaded_tissue_samples) >= MIN_SAMPLES_PER_TISSUE:
        tissues_with_samples.append(tissue)
        tissue_sample_counts[tissue] = len(loaded_tissue_samples)

# Largest tissues first.
tissues_with_samples.sort(key=lambda x: tissue_sample_counts[x], reverse=True)

if not tissues_with_samples:
    print("No tissues have sufficient samples in the loaded data.")

Available tissues: ['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)', 'Adrenal Gland', 'Artery - Aorta', 'Artery - Coronary', 'Artery - Tibial', 'Bladder', 'Brain - Amygdala', 'Brain - Anterior cingulate cortex (BA24)', 'Brain - Caudate (basal ganglia)', 'Brain - Cerebellar Hemisphere', 'Brain - Cerebellum', 'Brain - Cortex', 'Brain - Frontal Cortex (BA9)', 'Brain - Hippocampus', 'Brain - Hypothalamus', 'Brain - Nucleus accumbens (basal ganglia)', 'Brain - Putamen (basal ganglia)', 'Brain - Spinal cord (cervical c-1)', 'Brain - Substantia nigra', 'Breast - Mammary Tissue', 'Cells - Cultured fibroblasts', 'Cells - EBV-transformed lymphocytes', 'Cells - Leukemia cell line (CML)', 'Cervix - Ectocervix', 'Cervix - Endocervix', 'Colon - Sigmoid', 'Colon - Transverse', 'Esophagus - Gastroesophageal Junction', 'Esophagus - Mucosa', 'Esophagus - Muscularis', 'Fallopian Tube', 'Heart - Atrial Appendage', 'Heart - Left Ventricle', 'Kidney - Cortex', 'Kidney - Medulla', 'Liver', 'Lung', '

In [26]:
# Tissues iterated by the per-tissue sample filtering loop.
tissues = [
    'Muscle - Skeletal',
    'Whole Blood',
    'Skin - Sun Exposed (Lower leg)',
    'Skin - Not Sun Exposed (Suprapubic)',
    'Adipose - Subcutaneous',
    'Thyroid',
    'Artery - Tibial',
    'Nerve - Tibial',
    'Lung',
    'Brain - Cerebellum',
    'Heart - Atrial Appendage',
    'Brain - Cortex',
    'Adipose - Visceral (Omentum)',
]

In [28]:
# Build tissue_samples[tissue][group] by asking the reader for the samples of
# every (tissue, age group) pair, using a minimum RIN of 5.6.
groups = ['young', 'old']
tissue_samples = {}

for tissue in tissues:
    tissue_samples[tissue] = {
        group: reader.filter_samples_by_metadata(
            sample_attrs,
            subject_attrs,
            tissue=tissue,
            age_group=group,
            min_rin=5.6,
        )
        for group in groups
    }


After RIN >= 5.6: 19828 samples
After tissue filter (Muscle - Skeletal): 1019 samples
   Extracted 1019 subject IDs from 1019 samples
   Samples with valid subject IDs: 1019
   Samples with phenotype data: 1019
After age group filter (young): 653 samples
After RIN >= 5.6: 19828 samples
After tissue filter (Muscle - Skeletal): 1019 samples
   Extracted 1019 subject IDs from 1019 samples
   Samples with valid subject IDs: 1019
   Samples with phenotype data: 1019
After age group filter (old): 366 samples
After RIN >= 5.6: 19828 samples
After tissue filter (Whole Blood): 929 samples
   Extracted 929 subject IDs from 929 samples
   Samples with valid subject IDs: 929
   Samples with phenotype data: 929
After age group filter (young): 612 samples
After RIN >= 5.6: 19828 samples
After tissue filter (Whole Blood): 929 samples
   Extracted 929 subject IDs from 929 samples
   Samples with valid subject IDs: 929
   Samples with phenotype data: 929
After age group filter (old): 317 samples
After 

In [30]:
# Save the filtered samples for later use: one CSV per (tissue, age group),
# written under filtered_samples/ with spaces in the tissue name replaced
# by underscores.
#
# The inner mapping is named `group_samples` rather than `groups` so this
# loop does not overwrite the notebook-level `groups` list of age groups.
for tissue, group_samples in tissue_samples.items():
    for group, samples in group_samples.items():
        path = Path(f"filtered_samples/{tissue.replace(' ', '_')}_{group}.csv")
        # Create the output directory on demand; a no-op when it exists.
        path.parent.mkdir(parents=True, exist_ok=True)
        pd.Series(samples).to_csv(path, index=False)


In [19]:
# Check what samples we actually have in the expression data
available_samples = set(expression_df.columns)
print(f"Total samples in expression data: {len(available_samples)}")

# Take the muscle samples from the per-tissue filtering result so this cell
# does not rely on variables left over from a cell that is no longer in the
# notebook.
# NOTE(review): assumes these are the same sample collections the removed
# cell produced — the counts in the saved output (653 young / 366 old) match
# the filtering log for 'Muscle - Skeletal'; confirm.
muscle_samples_by_group = tissue_samples['Muscle - Skeletal']
young_muscle_samples_filtered = muscle_samples_by_group['young']
old_muscle_samples_filtered = muscle_samples_by_group['old']

# Check how many of the muscle samples are actually in our data
young_muscle_available = [s for s in young_muscle_samples_filtered if s in available_samples]
old_muscle_available = [s for s in old_muscle_samples_filtered if s in available_samples]

print(f"Young muscle samples found: {len(young_muscle_samples_filtered)}")
print(f"Young muscle samples available in data: {len(young_muscle_available)}")
print(f"Old muscle samples found: {len(old_muscle_samples_filtered)}")
print(f"Old muscle samples available in data: {len(old_muscle_available)}")

# Use only the available samples
all_muscle_samples = young_muscle_available + old_muscle_available
print(f"Total muscle samples we can use: {len(all_muscle_samples)}")

Total samples in expression data: 17382
Young muscle samples found: 653
Young muscle samples available in data: 510
Old muscle samples found: 366
Old muscle samples available in data: 292
Total muscle samples we can use: 802


In [20]:
# Use only samples that are actually available in our expression data
available_samples = set(expression_df.columns)

# Read the muscle samples from the per-tissue filtering result instead of
# depending on variables that no cell in the notebook defines.
# NOTE(review): assumes tissue_samples['Muscle - Skeletal'] holds the same
# young/old sample collections previously referenced here — confirm.
young_muscle_samples_filtered = tissue_samples['Muscle - Skeletal']['young']
old_muscle_samples_filtered = tissue_samples['Muscle - Skeletal']['old']

young_muscle_available = [s for s in young_muscle_samples_filtered if s in available_samples]
old_muscle_available = [s for s in old_muscle_samples_filtered if s in available_samples]
# Young samples first, then old, in their filtered order.
all_muscle_samples = young_muscle_available + old_muscle_available

print(f"Using {len(all_muscle_samples)} muscle samples ({len(young_muscle_available)} young, {len(old_muscle_available)} old)")

preprocessor = DataPreprocessor()

Using 802 muscle samples (510 young, 292 old)


In [18]:
# Display the expression column of the first selected muscle sample.
first_muscle_sample = all_muscle_samples[0]
expression_df[first_muscle_sample]

ENSG00000223972.5        0.000
ENSG00000227232.5        1.036
ENSG00000278267.1        0.000
ENSG00000243485.5        0.000
ENSG00000237613.2        0.000
                       ...    
ENSG00000198695.2    17820.000
ENSG00000210194.1       59.830
ENSG00000198727.2    33600.000
ENSG00000210195.2        1.042
ENSG00000210196.2        2.024
Name: GTEX-111CU-2026-SM-5GZZC, Length: 56200, dtype: float64

In [25]:
# Restrict the expression table to the selected muscle sample columns.
filtered_muscle_expression_df = expression_df[all_muscle_samples]

# Switches and thresholds passed to the preprocessing pipeline, gathered in
# one place so they are easy to review and tweak.
preprocess_options = dict(
    apply_log=True,
    filter_low_expr=True,
    filter_low_var=True,
    detect_outliers=True,
    min_expression=0.1,
    min_variance=0.1,
    outlier_contamination=0.05,
)

processed_df, processed_info = preprocessor.preprocess_expression_data(
    filtered_muscle_expression_df,
    protein_coding,
    **preprocess_options,
)

Starting expression data preprocessing pipeline...
Filtered to 19124 protein coding genes (from 56200)
Applying log2(TPM + 1) transformation...
Filtered to 13638 genes (removed 5486 genes with low expression in > 20.0% of samples)
Filtered to 19124 protein coding genes (from 56200)
Applying log2(TPM + 1) transformation...
Filtered to 13638 genes (removed 5486 genes with low expression in > 20.0% of samples)
Filtered to 12325 genes with variance >= 0.1
Detecting outliers using TruncatedSVD with 20 components...
Filtered to 12325 genes with variance >= 0.1
Detecting outliers using TruncatedSVD with 20 components...
Detected 4 outlier samples
SVD explained variance ratio: [0.92396903 0.0163448  0.00956844 0.0042763  0.00315717]
Removed 4 outlier samples
Remaining samples: 798
Applying quantile normalization...
Detected 4 outlier samples
SVD explained variance ratio: [0.92396903 0.0163448  0.00956844 0.0042763  0.00315717]
Removed 4 outlier samples
Remaining samples: 798
Applying quantile 

In [24]:
# Show the first five rows of the preprocessed table.
processed_df.head(5)

Unnamed: 0,GTEX-111CU-2026-SM-5GZZC,GTEX-113JC-2726-SM-5EGIS,GTEX-117YW-2426-SM-5Q5AE,GTEX-117YX-2526-SM-5EQ4Q,GTEX-1192X-0426-SM-5GIEE,GTEX-11DXW-0726-SM-5H12J,GTEX-11DXZ-2426-SM-5N9DT,GTEX-11DZ1-0926-SM-5EQ5R,GTEX-11EM3-2126-SM-5H11M,GTEX-11EQ9-2126-SM-5PNVW,...,GTEX-ZF28-0526-SM-4WKGW,GTEX-ZF29-2426-SM-DO92G,GTEX-ZLV1-2126-SM-4WWD2,GTEX-ZPCL-2026-SM-57WFD,GTEX-ZQG8-1226-SM-51MRX,GTEX-ZVT3-0526-SM-5GIE9,GTEX-ZXG5-0326-SM-5GICH,GTEX-ZYFG-2426-SM-5GIE8,GTEX-ZYW4-0526-SM-5GZZ5,GTEX-ZYY3-0526-SM-5E45G
ENSG00000187634.11,0.330743,0.627497,0.538273,0.168728,0.267884,0.791886,0.625735,0.453782,0.586393,0.197103,...,0.48168,0.18156,0.297362,1.11328,0.642248,0.273588,0.608981,0.373668,0.342296,0.166149
ENSG00000188976.10,5.92293,5.679323,5.755679,6.224135,6.032826,5.611052,5.771384,6.161743,6.013712,6.333352,...,6.17017,5.771384,6.531977,5.754625,6.27522,5.666519,6.209774,6.360078,5.881591,6.336509
ENSG00000187961.13,1.995472,1.775191,1.347724,1.941918,1.731807,1.506002,1.633968,1.918709,1.883142,1.897592,...,1.6783,1.494642,2.057592,2.113478,2.082115,2.026758,1.363784,2.03528,1.24512,1.999909
ENSG00000187583.10,0.294629,0.992736,1.53459,0.799456,0.49138,0.484509,0.72515,0.215263,0.652858,0.144389,...,0.203059,0.513514,0.415634,0.39981,0.471369,0.646954,0.396115,0.379115,0.510835,0.269754
ENSG00000187642.9,6.209774,8.516954,8.426669,7.130217,6.079198,7.12183,7.443118,3.591562,7.78268,7.547938,...,4.368726,7.226472,6.985076,6.633469,7.07337,7.205314,5.637189,5.926524,6.428333,4.486818
