In [3]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

# 1) Load the peptide-distance table.
# The input location can be overridden with the PEPDIST_CSV environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original absolute path is used unchanged.
DEFAULT_PEPDIST_CSV = "/Users/ozgetimur/Desktop/halfpepstab/hlapepstab/pepdist_with_halflife.csv"
pepdist_csv = os.environ.get("PEPDIST_CSV", DEFAULT_PEPDIST_CSV)
df = pd.read_csv(pepdist_csv)

# Identify every distance feature column (name prefixed with "dist_") up front;
# the filtering steps below operate on this list only.
dist_cols = [c for c in df.columns if c.startswith("dist_")]

# ——————————————————————————————————————————————
# Step 1: Drop “far” distances (median above cutoff, e.g. 8 Å)
# ——————————————————————————————————————————————
# Cutoff compared against each distance column's median value.
median_cutoff = 8.0

# Per-column median, computed over the distance features only.
medians = df.loc[:, dist_cols].median()

# Columns whose median is strictly greater than the cutoff are the "far" ones.
is_far = medians.gt(median_cutoff)
far_cols = list(medians.index[is_far])

# Build a new frame without the far columns; `df` itself is left untouched.
df_step1 = df.drop(far_cols, axis="columns")
print(f"Step 1: dropped {len(far_cols)} far-distance features (median > {median_cutoff})")

# Refresh the distance-column list so it names only the surviving columns.
dist_cols = [name for name in df_step1.columns if name.startswith("dist_")]

# ——————————————————————————————————————————————
# Step 2: Drop low-variance distances (variance < threshold, e.g. 0.01)
# ——————————————————————————————————————————————
# Variance threshold handed to the selector.
var_thresh = 0.01

# Fit the selector on the distance columns that survived step 1.
# fit() returns the estimator itself, so construction and fitting are chained.
vt = VarianceThreshold(threshold=var_thresh).fit(df_step1[dist_cols])

# get_support() yields one boolean per input column (True = retained);
# gather the names of the columns the selector rejected.
support_mask = vt.get_support()
low_var = [col for col, retained in zip(dist_cols, support_mask) if not retained]

# New frame without the rejected columns; `df_step1` is left untouched.
df_step2 = df_step1.drop(low_var, axis="columns")
print(f"Step 2: dropped {len(low_var)} low-variance features (var < {var_thresh})")

# Refresh the distance-column list after the drop.
dist_cols = [name for name in df_step2.columns if name.startswith("dist_")]

# ——————————————————————————————————————————————
# Step 3: Model-based selection via RF importances (> threshold)
# ——————————————————————————————————————————————
# Feature matrix: the remaining distance columns. Target: the "half_life" column.
# NOTE(review): the captured output of this cell shows half_life values such as
# 45658.0, 45689.0 and 45839.0 next to 102.0 and 848.0. These look like
# spreadsheet date serial numbers; confirm the target column was not mangled
# upstream before trusting the importances.
X = df_step2.loc[:, dist_cols]
y = df_step2.loc[:, "half_life"]

# Forest hyper-parameters gathered in one place; random_state is fixed.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)
rf.fit(X, y)

# Label each importance with its column name, then keep the columns whose
# importance is strictly greater than the threshold.
importances = pd.Series(rf.feature_importances_, index=dist_cols)
importance_threshold = 1e-3
is_important = importances > importance_threshold
keep = list(importances.index[is_important])

# Final table: the two identifying/target columns followed by the kept features.
df_final = df_step2[["peptide_seq", "half_life", *keep]]
print(f"Step 3: kept {len(keep)} features with importance > {importance_threshold}")

# Final check: report how many distance features survived all three steps,
# show the first rows of the result, then write the table to disk.
n_remaining = len(keep)
print("Total distance features remaining:", n_remaining)
print("Final feature set:")

preview = df_final.head()
print(preview)

# Written relative to the current working directory, without the row index.
output_csv = "pepdist_final.csv"
df_final.to_csv(output_csv, index=False)


Step 1: dropped 25771 far-distance features (median > 8.0)
Step 2: dropped 5 low-variance features (var < 0.01)
Step 3: kept 279 features with importance > 0.001
Total distance features remaining: 279
Final feature set:
  peptide_seq  half_life  dist_867  dist_869  dist_903  dist_905  dist_8860  \
0   FDAVLYYHM    45658.0  8.059381  7.808241  8.222604  7.487797   7.554459   
1   YIVGYYSAL      102.0  8.029833  7.882887  8.135388  7.524385   7.508428   
2   MMLVPLITV      848.0  7.899705  7.796799  8.086010  7.490925   7.619325   
3   VMNHKNKFM    45689.0  7.684510  7.684874  7.853494  7.370010   7.574330   
4   RTLAAMPEE    45839.0  7.842597  7.832381  7.982258  7.485770   7.618934   

   dist_8863  dist_8896  dist_8898  ...  dist_23583  dist_23619  dist_23832  \
0   7.687625   7.392616   7.729680  ...    6.245349    7.548094    7.890672   
1   7.404747   7.323658   7.604412  ...    6.314113    7.603737    8.147997   
2   7.876839   7.445069   7.902825  ...    6.152515    7.442309    7