In [1]:
# Change the kernel's working directory to ~/REVIVAL2.
%cd ~/REVIVAL2
# autoreload mode 2: re-import all modules before each cell executes, so edits
# to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the blackcellmagic extension (presumably for Black-formatting cells;
# no cell in this section invokes it).
%load_ext blackcellmagic

/disk2/fli/REVIVAL2


In [16]:
import pandas as pd
import numpy as np
from glob import glob
from scipy.stats import spearmanr
from REVIVAL.util import get_file_name

# Spearman correlation between every averaged AF3 score column and the measured
# fitness / selectivity of each library. One result row is produced per
# (library, score column); `results_df` is the table displayed by this cell.

# Directory holding one AF3 score table per library, named "<library>.csv".
af3_score_dir = "/disk2/fli/REVIVAL2/zs/af3/score_joint"

# Score columns whose name contains any of these substrings are skipped.
excluded_tags = ("disordered", "chain_pae_min_AA", "chain_pae_min_BB")

results = []

# To scan every library instead of the three listed below, iterate over
# sorted(glob("/disk2/fli/REVIVAL2/data/meta/not_scaled/*.csv")).
for lib in [
    "/disk2/fli/REVIVAL2/data/meta/not_scaled/ParLQ.csv",
    "/disk2/fli/REVIVAL2/data/meta/not_scaled/Rma-CB.csv",
    "/disk2/fli/REVIVAL2/data/meta/not_scaled/Rma-CSi.csv",
]:
    lib_name = get_file_name(lib)
    lib_df = pd.read_csv(lib)
    af3_df = pd.read_csv(f"{af3_score_dir}/{lib_name}.csv")

    # Report (but keep) AF3 rows with missing values; incomplete rows are
    # dropped per score column right before each correlation is computed.
    df_nan = af3_df[af3_df.isna().any(axis=1)]
    if len(df_nan) > 0:
        print(f"NaN values found in {lib_name} for af3_df")
        print(len(df_nan))

    kept_c = [c for c in af3_df.columns if not any(tag in c for tag in excluded_tags)]
    avg_c = [c for c in kept_c if "avg" in c]
    std_c = [c for c in kept_c if "std" in c]
    # Not used below; kept in case a later cell still references it.
    agg_c = [c for c in kept_c if "agg" in c]

    # "selectivity" is optional: it is only merged, and only correlated, when
    # the library table actually provides it.
    target_cols = ["fitness"]
    if "selectivity" in lib_df.columns:
        target_cols.append("selectivity")
    merge_cols = ["var"] + target_cols

    # The outer merge keeps variants that appear in only one of the two tables;
    # the resulting NaNs are handled by dropna() and the NaN-skipping mean below.
    merged_df = pd.merge(
        af3_df[["var"] + avg_c + std_c], lib_df[merge_cols], on="var", how="outer"
    )

    for c in avg_c:
        # Keep only rows where the score and every available target are
        # present, so both correlations use the same set of variants.
        valid_rows = merged_df[target_cols + [c]].dropna()
        correlation, p_value = spearmanr(
            valid_rows["fitness"].values, valid_rows[c].values
        )
        if "selectivity" in target_cols:
            correlation_select, p_value_select = spearmanr(
                valid_rows["selectivity"].values, valid_rows[c].values
            )
        else:
            correlation_select, p_value_select = np.nan, np.nan

        # Variability is the mean of the matching "std" column. Series.mean()
        # skips NaN, so variants left unmatched by the outer merge do not turn
        # the whole value into NaN. Columns tagged "agg" get no variability.
        if "agg" not in c:
            variability = merged_df[c.replace("avg", "std")].mean()
        else:
            variability = np.nan

        # Every row uses the same keys so results_df has one consistent schema.
        results.append(
            {
                "Library": lib_name,
                "Score_Type": c,
                "variability": variability,
                "Spearman_fitness": correlation,
                "Spearman_selectivity": correlation_select,
                "P_Value_fitness": p_value,
                "P_Value_selectivity": p_value_select,
            }
        )

# Convert results into a DataFrame and display it as a table
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
0,ParLQ,ranking_score_avg,0.016154,0.345636,0.361723,3.395007e-15,1.359243e-16
1,ParLQ,ptm_avg,0.005519,0.257161,0.293227,7.677797e-09,3.583369e-11
2,ParLQ,iptm_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
3,ParLQ,chain_ptm_A_avg,0.00196,0.143946,0.186299,0.001398825,3.328245e-05
4,ParLQ,chain_iptm_A_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
5,ParLQ,chain_pae_min_AB_avg,0.017209,-0.308884,-0.3194,2.716203e-12,4.397361e-13
6,ParLQ,chain_iptm_AB_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
7,ParLQ,chain_ptm_B_avg,0.009098,0.313839,0.334696,1.162148e-12,2.730959e-14
8,ParLQ,chain_iptm_B_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
9,ParLQ,chain_pae_min_BA_avg,0.085382,-0.378982,-0.398379,3.495376e-18,4.3642279999999995e-20


In [17]:
# Distinct Score_Type values present in results_df, in order of first appearance.
results_df.loc[:, "Score_Type"].unique()

array(['ranking_score_avg', 'ptm_avg', 'iptm_avg', 'chain_ptm_A_avg',
       'chain_iptm_A_avg', 'chain_pae_min_AB_avg', 'chain_iptm_AB_avg',
       'chain_ptm_B_avg', 'chain_iptm_B_avg', 'chain_pae_min_BA_avg',
       'chain_iptm_BA_avg', 'mean_site_score_avg'], dtype=object)

In [19]:
# Rows of results_df whose Score_Type is "iptm_avg".
results_df.loc[results_df["Score_Type"].eq("iptm_avg")]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
2,ParLQ,iptm_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
14,Rma-CB,iptm_avg,0.007233,0.245978,0.255609,0.002412106,0.001594128
26,Rma-CSi,iptm_avg,0.003464,0.3024,0.282517,0.0001691773,0.0004604007


In [24]:
# Rows of results_df whose Score_Type is "chain_iptm_A_avg".
results_df.loc[results_df["Score_Type"].eq("chain_iptm_A_avg")]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
4,ParLQ,chain_iptm_A_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
16,Rma-CB,chain_iptm_A_avg,0.007233,0.245978,0.255609,0.002412106,0.001594128
28,Rma-CSi,chain_iptm_A_avg,0.003464,0.3024,0.282517,0.0001691773,0.0004604007


In [22]:
# Rows of results_df whose Score_Type is "chain_iptm_B_avg".
results_df.loc[results_df["Score_Type"].eq("chain_iptm_B_avg")]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
8,ParLQ,chain_iptm_B_avg,0.018891,0.346173,0.361397,3.058197e-15,1.453569e-16
20,Rma-CB,chain_iptm_B_avg,0.007233,0.245978,0.255609,0.002412106,0.001594128
32,Rma-CSi,chain_iptm_B_avg,0.003464,0.3024,0.282517,0.0001691773,0.0004604007


In [21]:
# Rows of results_df whose Score_Type is "mean_site_score_avg".
results_df.loc[results_df["Score_Type"].eq("mean_site_score_avg")]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
11,ParLQ,mean_site_score_avg,0.088921,0.045262,0.015269,0.3173768,0.736009
23,Rma-CB,mean_site_score_avg,0.184314,0.280152,0.101593,0.0005161212,0.216068
35,Rma-CSi,mean_site_score_avg,0.134788,0.399499,0.319551,4.096605e-07,6.7e-05


In [25]:
import pandas as pd
import numpy as np
from glob import glob
from scipy.stats import spearmanr
from REVIVAL.util import get_file_name

# Spearman correlation between every averaged AF3 score column and the measured
# fitness / selectivity of each library. One result row is produced per
# (library, score column); `results_df` is the table displayed by this cell.

# Directory holding one AF3 score table per library, named "<library>.csv".
af3_score_dir = "/disk2/fli/REVIVAL2/zs/af3/score_seperate"

# Score columns whose name contains any of these substrings are skipped.
excluded_tags = ("disordered", "chain_pae_min_AA", "chain_pae_min_BB")

results = []

# To scan every library instead of the three listed below, iterate over
# sorted(glob("/disk2/fli/REVIVAL2/data/meta/not_scaled/*.csv")).
for lib in [
    "/disk2/fli/REVIVAL2/data/meta/not_scaled/ParLQ.csv",
    "/disk2/fli/REVIVAL2/data/meta/not_scaled/Rma-CB.csv",
    "/disk2/fli/REVIVAL2/data/meta/not_scaled/Rma-CSi.csv",
]:
    lib_name = get_file_name(lib)
    lib_df = pd.read_csv(lib)
    af3_df = pd.read_csv(f"{af3_score_dir}/{lib_name}.csv")

    # Report (but keep) AF3 rows with missing values; incomplete rows are
    # dropped per score column right before each correlation is computed.
    df_nan = af3_df[af3_df.isna().any(axis=1)]
    if len(df_nan) > 0:
        print(f"NaN values found in {lib_name} for af3_df")
        print(len(df_nan))

    kept_c = [c for c in af3_df.columns if not any(tag in c for tag in excluded_tags)]
    avg_c = [c for c in kept_c if "avg" in c]
    std_c = [c for c in kept_c if "std" in c]
    # Not used below; kept in case a later cell still references it.
    agg_c = [c for c in kept_c if "agg" in c]

    # "selectivity" is optional: it is only merged, and only correlated, when
    # the library table actually provides it.
    target_cols = ["fitness"]
    if "selectivity" in lib_df.columns:
        target_cols.append("selectivity")
    merge_cols = ["var"] + target_cols

    # The outer merge keeps variants that appear in only one of the two tables;
    # the resulting NaNs are handled by dropna() and the NaN-skipping mean below.
    merged_df = pd.merge(
        af3_df[["var"] + avg_c + std_c], lib_df[merge_cols], on="var", how="outer"
    )

    for c in avg_c:
        # Keep only rows where the score and every available target are
        # present, so both correlations use the same set of variants.
        # A constant score column yields NaN correlations (SciPy warns).
        valid_rows = merged_df[target_cols + [c]].dropna()
        correlation, p_value = spearmanr(
            valid_rows["fitness"].values, valid_rows[c].values
        )
        if "selectivity" in target_cols:
            correlation_select, p_value_select = spearmanr(
                valid_rows["selectivity"].values, valid_rows[c].values
            )
        else:
            correlation_select, p_value_select = np.nan, np.nan

        # Variability is the mean of the matching "std" column. Series.mean()
        # skips NaN, so variants left unmatched by the outer merge do not turn
        # the whole value into NaN. Columns tagged "agg" get no variability.
        if "agg" not in c:
            variability = merged_df[c.replace("avg", "std")].mean()
        else:
            variability = np.nan

        # Every row uses the same keys so results_df has one consistent schema.
        results.append(
            {
                "Library": lib_name,
                "Score_Type": c,
                "variability": variability,
                "Spearman_fitness": correlation,
                "Spearman_selectivity": correlation_select,
                "P_Value_fitness": p_value,
                "P_Value_selectivity": p_value_select,
            }
        )

# Convert results into a DataFrame and display it as a table
results_df = pd.DataFrame(results)
results_df

  correlation, p_value = spearmanr(valid_rows["fitness"].values, valid_rows[c].values)
  correlation_select, p_value_select = spearmanr(valid_rows["selectivity"].values, valid_rows[c].values)
  correlation, p_value = spearmanr(valid_rows["fitness"].values, valid_rows[c].values)
  correlation_select, p_value_select = spearmanr(valid_rows["selectivity"].values, valid_rows[c].values)
  correlation, p_value = spearmanr(valid_rows["fitness"].values, valid_rows[c].values)
  correlation_select, p_value_select = spearmanr(valid_rows["selectivity"].values, valid_rows[c].values)
  correlation, p_value = spearmanr(valid_rows["fitness"].values, valid_rows[c].values)
  correlation_select, p_value_select = spearmanr(valid_rows["selectivity"].values, valid_rows[c].values)
  correlation, p_value = spearmanr(valid_rows["fitness"].values, valid_rows[c].values)
  correlation_select, p_value_select = spearmanr(valid_rows["selectivity"].values, valid_rows[c].values)


Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
0,ParLQ,ranking_score_avg,5.889163e-03,0.338275,0.328890,1.393332e-14,7.987235e-14
1,ParLQ,ptm_avg,2.800962e-03,0.262727,0.271946,3.528431e-09,9.350645e-10
2,ParLQ,iptm_avg,6.809198e-03,0.338146,0.323402,1.427638e-14,2.157609e-13
3,ParLQ,chain_ptm_A_avg,1.829667e-03,0.162490,0.186185,3.041771e-04,3.366068e-05
4,ParLQ,chain_iptm_A_avg,1.135162e-02,0.188348,0.185988,2.714092e-05,3.432251e-05
...,...,...,...,...,...,...,...
64,Rma-CSi,chain_iptm_CA_avg,2.438294e-03,0.420939,0.236975,8.156569e-08,3.502937e-03
65,Rma-CSi,chain_pae_min_CB_avg,6.014754e-01,-0.132082,-0.183355,1.071361e-01,2.470908e-02
66,Rma-CSi,chain_iptm_CB_avg,4.776319e-02,0.184371,0.260465,2.390908e-02,1.285940e-03
67,Rma-CSi,chain_pae_min_CC_avg,1.110223e-16,,,,


In [26]:
# Rows of results_df whose Score_Type is "iptm_avg".
results_df.loc[results_df["Score_Type"].eq("iptm_avg")]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
2,ParLQ,iptm_avg,0.006809,0.338146,0.323402,1.427638e-14,2.157609e-13
25,Rma-CB,iptm_avg,0.004968,0.342,0.316318,1.839962e-05,8.032832e-05
48,Rma-CSi,iptm_avg,0.006988,0.32163,0.268775,5.986033e-05,0.0008819993


In [28]:
# Rows of results_df whose Score_Type is "chain_iptm_AC_avg".
results_df.loc[results_df["Score_Type"].eq("chain_iptm_AC_avg")]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
8,ParLQ,chain_iptm_AC_avg,0.004996,0.478716,0.46223,1.9611780000000001e-29,2.623426e-27
31,Rma-CB,chain_iptm_AC_avg,0.001909,0.409886,0.365156,1.900914e-07,4.338116e-06
54,Rma-CSi,chain_iptm_AC_avg,0.002438,0.420939,0.236975,8.156569e-08,0.003502937


In [27]:
# For each Library, the row with the largest Spearman_fitness.
best_idx = results_df.groupby("Library")["Spearman_fitness"].idxmax()
results_df.loc[best_idx]

Unnamed: 0,Library,Score_Type,variability,Spearman_fitness,Spearman_selectivity,P_Value_fitness,P_Value_selectivity
8,ParLQ,chain_iptm_AC_avg,0.004996,0.478716,0.46223,1.9611780000000001e-29,2.623426e-27
24,Rma-CB,ptm_avg,0.004378,0.422657,0.347233,7.131455e-08,1.340531e-05
54,Rma-CSi,chain_iptm_AC_avg,0.002438,0.420939,0.236975,8.156569e-08,0.003502937
