In [1]:
# cell 1

import pandas as pd
import numpy as np
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# 1. Load count matrix using TPM
data_path = '/Users/esrataner/Documents/DATA1030/count_matrix_120'
file_list = [f for f in os.listdir(data_path) if f.endswith('.tsv')]

def read_tpm_file(filename):
    df = pd.read_csv(os.path.join(data_path, filename), sep='\t',
                     usecols=['gene_id', 'TPM'])
    df = df.set_index('gene_id')
    sample_id = filename.replace('.tsv', '')
    df.columns = [sample_id]
    return df

dfs = [read_tpm_file(f) for f in file_list]
merged = pd.concat(dfs, axis=1)  # genes × samples
print("Raw merged shape:", merged.shape)

# 2. remove all spike-in genes 
# these are technical control genes - not needed for ML 
spikein_mask = merged.index.str.lower().str.contains(
    "spikein|ercc"
)

print("Spike-in genes detected:", spikein_mask.sum())

merged = merged.loc[~spikein_mask]
print("After removing spike-ins:", merged.shape)

# 3. Remove all-zero genes
zero_across_all = (merged == 0).all(axis=1)
print("Genes zero everywhere:", zero_across_all.sum())

expr = merged[~zero_across_all]
print("After removing all-zero genes:", expr.shape)

# 4. log1p(TPM) transform — biological normalization
expr_log = np.log1p(expr)

# NOTE: try graphing log1p and log 
# depends on if data is close to zero or not 

Raw merged shape: (59526, 120)
Spike-in genes detected: 97
After removing spike-ins: (59429, 120)
Genes zero everywhere: 14289
After removing all-zero genes: (45140, 120)


In [2]:
# cell 2

meta = pd.read_csv(
    '/Users/esrataner/Documents/DATA1030/tsv/experiment_report_2025_120.tsv',
    sep='\t', skiprows=1
)

# extract sex
meta['Sex'] = meta['Biosample summary'].str.extract(r'(?i)\b(female|male)\b')
meta['Sex'] = meta['Sex'].str.lower()

# parse numeric age
def parse_age(a):
    if pd.isna(a):
        return None
    nums = re.findall(r'\d+', str(a))
    return int(nums[0]) if nums else None

meta['Age'] = meta['Biosample age'].apply(parse_age)

# Age bins
def age_bin(x):
    if pd.isna(x): return None
    if x < 65: return "60-64"
    if x < 70: return "65-69"
    if x < 75: return "70-74"
    if x < 80: return "75-79"
    if x < 85: return "80-84"
    if x < 90: return "85-89"
    return "90+"

meta['Age_Ordinal'] = meta['Age'].apply(age_bin)

age_order = ["60-64","65-69","70-74","75-79","80-84","85-89","90+"]
meta['Age_Ordinal'] = pd.Categorical(
    meta['Age_Ordinal'], categories=age_order, ordered=True
)

# FIX!!  Extract *all* ENCFF file IDs (not just first)
# Find ALL ENCFF IDs
meta['File_IDs'] = meta['Files'].str.findall(r'ENCFF\w+')

# Expand metadata so each row contains ONE ENCFF ID
meta_expanded = meta.explode('File_IDs')

# Keep only relevant fields
meta_expanded = meta_expanded[
    ['Accession', 'Sex', 'Age_Ordinal', 'Biosample accession', 'File_IDs']
]

# Drop rows missing file IDs
meta_expanded = meta_expanded.dropna(subset=['File_IDs'])

print("Expanded metadata shape:", meta_expanded.shape)
# Expanded metadata shape: (1220, 5)

# exp 
# in the raw ENCODE metadata, each biological sample (ENCSR) has multiple ENCFF files 
# (technical sequencing files: BAMs, fastqs, quantifications, etc)

# After expanding, u get 1 row per ENCFF file
# so if each ENCSR has ~10 ENCFF files ->  120 biological samples become ~1220 rows.
# so 1220 is NOT 120 samples - it is technical files, NOT samples

# MOST IMPORTANTLY - do NOT need GroupShuffleSplit
# 1. each ENCSR is represented only once
# 2. no ENCFF technical replicates remain#
# 3. df_full has 1 row per biological sample





Expanded metadata shape: (1220, 5)


In [3]:
# cell 3 

# List of ENCFF IDs found in expression matrix
expr_samples = expr_log.columns.tolist()

# Match metadata rows where File_IDs map to expression samples
meta_matched = meta_expanded[meta_expanded['File_IDs'].isin(expr_samples)].copy()
print("Matched metadata rows:", meta_matched.shape)

# Step 1: Relabel expression matrix columns (ENCFF → ENCSR)
rename_dict = dict(zip(meta_matched['File_IDs'], meta_matched['Accession']))
expr_labeled = expr_log.rename(columns=rename_dict)

# Step 2: Identify replicate sample IDs (appear twice)
replicate_accessions = expr_labeled.columns[
    expr_labeled.columns.duplicated()
].unique()

print("Replicate sample accessions:", replicate_accessions.tolist())

# Step 3: DROP the replicate sample completely
expr_no_reps = expr_labeled.drop(columns=list(replicate_accessions))
meta_no_reps = meta_matched[
    ~meta_matched['Accession'].isin(replicate_accessions)
].copy()

print("Expression shape after removing replicates:", expr_no_reps.shape)
print("Metadata shape after removing replicates:", meta_no_reps.shape)

# Step 4: Align expression with metadata
expr_T = expr_no_reps.T
expr_T.index.name = 'Accession'
expr_T = expr_T.reset_index()

# Deduplicate final metadata
meta_unique = meta_no_reps.drop_duplicates(subset=['Accession'])

# Step 5: Merge into final ML-ready dataset
df_full = expr_T.merge(
    meta_unique.drop(columns=['File_IDs']),
    on='Accession',
    how='inner'
).set_index('Accession')

print("Final merged dataset shape:", df_full.shape)
print(df_full.head())
# cell 4 

# TRAIN / VAL / TEST SPLIT
from sklearn.model_selection import train_test_split

# (1) Define features + target
X = df_full.drop(columns=["Sex", "Biosample accession"]) #string carried over 
y = df_full["Sex"].map({"female": 0, "male": 1}).astype(int)

y = y.astype(int)

# (2) First split: Train vs Temp (Val+Test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# (3) Split Temp into Validation + Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Print shapes
print("Dataset Splits:")
print("Train:", X_train.shape, "Target:", y_train.shape)
print("Val:  ", X_val.shape,   "Target:", y_val.shape)
print("Test: ", X_test.shape,  "Target:", y_test.shape)


Matched metadata rows: (120, 5)
Replicate sample accessions: []
Expression shape after removing replicates: (45140, 120)
Metadata shape after removing replicates: (120, 5)
Final merged dataset shape: (120, 45143)
             13023     26893  30031  30958     30964  ENSG00000000003.14  \
Accession                                                                  
ENCSR800PJQ    0.0  0.506818    0.0    0.0  0.000000            1.348073   
ENCSR133PLR    0.0  0.000000    0.0    0.0  2.324347            1.340250   
ENCSR418WMG    0.0  0.000000    0.0    0.0  0.000000            1.181727   
ENCSR013HWB    0.0  0.438255    0.0    0.0  0.000000            1.313724   
ENCSR693KOP    0.0  0.000000    0.0    0.0  0.000000            1.081805   

             ENSG00000000005.5  ENSG00000000419.12  ENSG00000000457.13  \
Accession                                                                
ENCSR800PJQ           0.000000            2.157559            1.444563   
ENCSR133PLR           0.122218  

In [4]:
# cell 4 

# TRAIN / VAL / TEST SPLIT
from sklearn.model_selection import train_test_split

# (1) Define features + target
X = df_full.drop(columns=["Sex", "Biosample accession"]) #string carried over 
y = df_full["Sex"].map({"female": 0, "male": 1}).astype(int)

y = y.astype(int)

# (2) First split: Train vs Temp (Val+Test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# (3) Split Temp into Validation + Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Print shapes
print("Dataset Splits:")
print("Train:", X_train.shape, "Target:", y_train.shape)
print("Val:  ", X_val.shape,   "Target:", y_val.shape)
print("Test: ", X_test.shape,  "Target:", y_test.shape)


Dataset Splits:
Train: (84, 45141) Target: (84,)
Val:   (18, 45141) Target: (18,)
Test:  (18, 45141) Target: (18,)


 ## Preprocessor + SVM Pipeline Builder

In [None]:


from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

gene_cols = [c for c in X.columns if c != "Age_Ordinal"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), gene_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["Age_Ordinal"])
])

# class weights for imbalance
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
class_weight = {0: 1.0, 1: neg / pos}
print("Class weights:", class_weight)

def make_svm_pipeline(random_state=42):
    return Pipeline([
        ("preprocess", preprocessor),
        ("model", LinearSVC(
            class_weight=class_weight,
            random_state=random_state,
            max_iter=5000
        ))
    ])


Class weights: {0: 1.0, 1: np.float64(3.0)}


In [None]:
# RandomizedSearchCV for SVM

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from sklearn.metrics import average_precision_score, classification_report

param_dist_svm = {
    "model__C": loguniform(1e-3, 1e3)
}

search_svm = RandomizedSearchCV(
    estimator=make_svm_pipeline(),
    param_distributions=param_dist_svm,
    n_iter=40,
    scoring="average_precision",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

search_svm.fit(X_train, y_train)

best_svm = search_svm.best_estimator_

print("\nBest CV PR-AUC:", search_svm.best_score_)
print("Best Hyperparameters:", search_svm.best_params_)

val_pred_margin = best_svm.decision_function(X_val)
val_ap = average_precision_score(y_val, val_pred_margin)

print("\nVALIDATION PR-AUC:", val_ap)
print("\nClassification Report:\n",
      classification_report(y_val, (val_pred_margin > 0).astype(int)))


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END ........................model__C=0.1767016940294795; total time=   0.9s
[CV] END ........................model__C=0.1767016940294795; total time=   0.9s
[CV] END ........................model__C=0.1767016940294795; total time=   0.9s
[CV] END ........................model__C=0.1767016940294795; total time=   0.9s
[CV] END ........................model__C=0.1767016940294795; total time=   0.9s
[CV] END .........................model__C=506.1576888752306; total time=   0.9s
[CV] END .........................model__C=506.1576888752306; total time=   0.9s
[CV] END .........................model__C=506.1576888752306; total time=   0.9s
[CV] END .........................model__C=506.1576888752306; total time=   1.0s
[CV] END .........................model__C=506.1576888752306; total time=   1.0s
[CV] END ........................model__C=24.658329458549105; total time=   0.9s
[CV] END ........................model__C=24.65

In [None]:
# Train Full Model + Test Evaluation

import pandas as pd

# Combine train + validation sets
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

final_svm = make_svm_pipeline()
final_svm.set_params(**search_svm.best_params_)
final_svm.fit(X_train_full, y_train_full)

# test evaluation !!!
test_margin = final_svm.decision_function(X_test)
test_ap = average_precision_score(y_test, test_margin)

print("\nFINAL TEST PR-AUC (SVM):", test_ap)
print("\nClassification Report (Test):\n",
      classification_report(y_test, (test_margin > 0).astype(int)))



FINAL TEST PR-AUC (SVM): 0.8269230769230769

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.86      0.43      0.57        14
           1       0.27      0.75      0.40         4

    accuracy                           0.50        18
   macro avg       0.56      0.59      0.49        18
weighted avg       0.73      0.50      0.53        18



In [None]:
# SVM coefficient interpretation 

import numpy as np

svm_model = final_svm.named_steps["model"]
pre = final_svm.named_steps["preprocess"]

feature_names = pre.get_feature_names_out()
coef = svm_model.coef_.flatten()

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coef,
    "abs_coef": np.abs(coef)
}).sort_values("abs_coef", ascending=False)

print("Top 20 |coef| features:")
display(coef_df.head(20))

# extract gene features only
coef_genes = coef_df[coef_df["feature"].str.startswith("num__")].copy()
coef_genes["gene"] = coef_genes["feature"].str.replace("^num__", "", regex=True)

# Masculinizing vs Feminizing
coef_genes["direction"] = np.where(coef_genes["coef"] > 0, "male", "female")

top_masc = coef_genes[coef_genes["coef"] > 0].sort_values("coef", ascending=False).head(30)
top_fem  = coef_genes[coef_genes["coef"] < 0].sort_values("coef", ascending=True).head(30)

print("\nTop Masculinizing Genes:")
display(top_masc)

print("\nTop Feminizing Genes:")
display(top_fem)


Top 20 |coef| features:


Unnamed: 0,feature,coef,abs_coef
45144,cat__Age_Ordinal_90+,-0.004167,0.004167
35846,num__ENSG00000260197.1,0.002956,0.002956
24532,num__ENSG00000230904.1,0.00295,0.00295
28139,num__ENSG00000238067.1,0.00295,0.00295
22845,num__ENSG00000227494.2,0.002947,0.002947
13895,num__ENSG00000176728.7,0.002935,0.002935
26035,num__ENSG00000233864.7,0.002934,0.002934
4256,num__ENSG00000114374.12,0.002928,0.002928
6127,num__ENSG00000129824.15,0.002925,0.002925
15254,num__ENSG00000183878.15,0.002922,0.002922



Top Masculinizing Genes:


Unnamed: 0,feature,coef,abs_coef,gene,direction
35846,num__ENSG00000260197.1,0.002956,0.002956,ENSG00000260197.1,male
24532,num__ENSG00000230904.1,0.00295,0.00295,ENSG00000230904.1,male
28139,num__ENSG00000238067.1,0.00295,0.00295,ENSG00000238067.1,male
22845,num__ENSG00000227494.2,0.002947,0.002947,ENSG00000227494.2,male
13895,num__ENSG00000176728.7,0.002935,0.002935,ENSG00000176728.7,male
26035,num__ENSG00000233864.7,0.002934,0.002934,ENSG00000233864.7,male
4256,num__ENSG00000114374.12,0.002928,0.002928,ENSG00000114374.12,male
6127,num__ENSG00000129824.15,0.002925,0.002925,ENSG00000129824.15,male
15254,num__ENSG00000183878.15,0.002922,0.002922,ENSG00000183878.15,male
11422,num__ENSG00000165246.14,0.002912,0.002912,ENSG00000165246.14,male



Top Feminizing Genes:


Unnamed: 0,feature,coef,abs_coef,gene,direction
23958,num__ENSG00000229807.11,-0.002866,0.002866,ENSG00000229807.11,female
115,num__ENSG00000005889.15,-0.002168,0.002168,ENSG00000005889.15,female
39419,num__ENSG00000270641.1,-0.001963,0.001963,ENSG00000270641.1,female
5737,num__ENSG00000126012.11,-0.001906,0.001906,ENSG00000126012.11,female
21818,num__ENSG00000225470.7,-0.001612,0.001612,ENSG00000225470.7,female
24988,num__ENSG00000231793.5,-0.001591,0.001591,ENSG00000231793.5,female
26583,num__ENSG00000234969.1,-0.001523,0.001523,ENSG00000234969.1,female
19907,num__ENSG00000215301.10,-0.001519,0.001519,ENSG00000215301.10,female
27635,num__ENSG00000237027.1,-0.001483,0.001483,ENSG00000237027.1,female
30556,num__ENSG00000248757.2,-0.001469,0.001469,ENSG00000248757.2,female


In [None]:

# SVM Uncertainty Across Random Seeds

results = []
random_states = [0, 1, 2, 3, 4]

for seed in random_states:
    print(f"\n===== RANDOM SEED {seed} =====")


    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, random_state=seed, stratify=y
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.50, random_state=seed, stratify=y_temp
    )

    # RSCV for this seed
    search_seed = RandomizedSearchCV(
        estimator=make_svm_pipeline(random_state=seed),
        param_distributions=param_dist_svm,
        n_iter=40,
        scoring="average_precision",
        cv=5,
        random_state=seed,
        n_jobs=-1,
        verbose=0
    )

    search_seed.fit(X_train, y_train)

    # validation PR-AUC
    val_margin = search_seed.best_estimator_.decision_function(X_val)
    val_ap = average_precision_score(y_val, val_margin)

    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])

    final_seed_svm = make_svm_pipeline(random_state=seed)
    final_seed_svm.set_params(**search_seed.best_params_)
    final_seed_svm.fit(X_train_full, y_train_full)

    test_margin = final_seed_svm.decision_function(X_test)
    test_ap = average_precision_score(y_test, test_margin)

    results.append({
        "seed": seed,
        "cv_ap": search_seed.best_score_,
        "val_ap": val_ap,
        "test_ap": test_ap
    })


results_df = pd.DataFrame(results)
display(results_df)

print("\nMean Test PR-AUC:", results_df["test_ap"].mean())
print("Std Dev Test PR-AUC:", results_df["test_ap"].std())
print("Min Test PR-AUC:", results_df["test_ap"].min())
print("Max Test PR-AUC:", results_df["test_ap"].max())



===== RANDOM SEED 0 =====

===== RANDOM SEED 1 =====

===== RANDOM SEED 2 =====

===== RANDOM SEED 3 =====

===== RANDOM SEED 4 =====


Unnamed: 0,seed,cv_ap,val_ap,test_ap
0,0,0.475159,0.605397,0.65
1,1,0.65256,0.327679,0.42359
2,2,0.34458,0.513919,0.85
3,3,0.596099,0.731551,0.486742
4,4,0.551978,0.571429,0.747024



Mean Test PR-AUC: 0.6314711954711955
Std Dev Test PR-AUC: 0.17720868832973305
Min Test PR-AUC: 0.4235897435897436
Max Test PR-AUC: 0.85
