In [2]:
%cd ~/REVIVAL2

/disk2/fli/REVIVAL2


In [3]:
%load_ext autoreload
%autoreload 2
%load_ext blackcellmagic

In [4]:
import os
from glob import glob

import numpy as np
import pandas as pd

from REVIVAL.util import get_file_name, get_dir_name, checkNgen_folder



In [None]:

def parse_chai_scores(mut_structure_dir: str, output_dir: str = "zs/chai/output"):

    """
    Collect the chai scores of one library into a single csv file.

    Every entry directly under `mut_structure_dir` is treated as one variant
    and every `*.npz` file inside it as one replicate of that variant.
    One row per replicate is written to `{output_dir}/{lib_name}.csv`,
    where `lib_name` is the last path component of `mut_structure_dir`.

    The csv has the following columns:
        - var: The mutation, ie I165A:I183A:Y301V (the subfolder name)
        - rep: The replicate number, taken from the text after the
            last underscore of the npz file name
        - aggregate_score
        - ptm
        - iptm
        - chain_ptm_A
        - chain_ptm_B
        - chain_iptm_AB
        - chain_iptm_BA
        - has_inter_chain_clashes

    Only the first entry along the leading axis of each npz array is read,
    and chains A and B are taken to be index 0 and 1 of the per-chain arrays
    (assumes a two-chain complex -- TODO confirm for other libraries).

    Args:
    - mut_structure_dir, str: The path to the folder containing the chai score
        ie zs/chai/mut_structure/PfTrpB-4bromo
    - output_dir, str: The path to the folder to save the dataframe to
        ie zs/chai/output
    """

    columns = [
        "var",
        "rep",
        "aggregate_score",
        "ptm",
        "iptm",
        "chain_ptm_A",
        "chain_ptm_B",
        "chain_iptm_AB",
        "chain_iptm_BA",
        "has_inter_chain_clashes",
    ]

    output_dir = checkNgen_folder(output_dir)

    # normpath strips a trailing slash, which would otherwise make basename
    # return an empty string and the output file be named ".csv"
    lib_name = os.path.basename(os.path.normpath(mut_structure_dir))

    # gather plain dicts and build the dataframe once at the end instead of
    # copying the whole frame for every appended row
    records = []

    # glob gives no ordering guarantee, so sort to keep the row order reproducible
    for subfolder in sorted(glob(f"{mut_structure_dir}/*")):
        var = os.path.basename(subfolder)

        for rep_npz in sorted(glob(f"{subfolder}/*.npz")):

            # the npz archive holds an open file handle until it is closed
            with np.load(rep_npz) as npz:
                per_chain_ptm = npz["per_chain_ptm"][0]
                per_chain_pair_iptm = npz["per_chain_pair_iptm"][0]

                records.append(
                    {
                        "var": var,
                        "rep": get_file_name(rep_npz).split("_")[-1],
                        "aggregate_score": npz["aggregate_score"][0],
                        "ptm": npz["ptm"][0],
                        "iptm": npz["iptm"][0],
                        "chain_ptm_A": per_chain_ptm[0],
                        "chain_ptm_B": per_chain_ptm[1],
                        "chain_iptm_AB": per_chain_pair_iptm[0, 1],
                        "chain_iptm_BA": per_chain_pair_iptm[1, 0],
                        "has_inter_chain_clashes": npz["has_inter_chain_clashes"][0],
                    }
                )

    # passing the columns keeps the header even when no npz file was found
    df = pd.DataFrame(records, columns=columns)

    df.to_csv(f"{output_dir}/{lib_name}.csv", index=False)
    print(f"Saved chai scores for {lib_name} to {output_dir}/{lib_name}.csv")

In [5]:
# Load the score archive of a single replicate to inspect what it contains.
npz = np.load(
    "/disk2/fli/REVIVAL2/zs/chai/mut_structure/PfTrpB-4bromo/I165A:I183A:Y301V/I165A:I183A:Y301V_0.npz"
)

In [15]:
# First entry, first chain of the per-chain pTM array.
npz["per_chain_ptm"][0, 0]

0.9515228

In [18]:
# First entry of the chain-pair ipTM matrix: element (0, 0) and element (0, 1).
npz["per_chain_pair_iptm"][0, 0, 0], npz["per_chain_pair_iptm"][0, 0, 1]

(0.9515228, 0.51923597)

In [7]:
# Show every array stored in the archive as one "name: value" entry.
for key in npz.files:
    stored_array = npz[key]
    print(f"{key}: {stored_array}")


aggregate_score: [0.85367167]
ptm: [0.9502157]
iptm: [0.82953566]
per_chain_ptm: [[0.9515228  0.61292917]]
per_chain_pair_iptm: [[[0.9515228  0.51923597]
  [0.82953566 0.61292917]]]
has_inter_chain_clashes: [False]
chain_intra_clashes: [[0 0]]
chain_chain_inter_clashes: [[[0 0]
  [0 0]]]


In [5]:
# Read back the per-replicate score table saved for the PfTrpB-4bromo library.
df = pd.read_csv(
    "/disk2/fli/REVIVAL2/zs/chai/output/PfTrpB-4bromo.csv"
)
df

Unnamed: 0,var,rep,aggregate_score,ptm,iptm,chain_ptm_A,chain_ptm_B,chain_iptm_AB,chain_iptm_BA,has_inter_chain_clashes
0,I165A:I183A:Y301V,0,0.853672,0.950216,0.829536,0.951523,0.612929,0.519236,0.829536,False
1,I165A:I183A:Y301V,1,0.854554,0.950160,0.830652,0.951699,0.609123,0.505708,0.830652,False
2,I165A:I183A:Y301V,2,0.853499,0.950731,0.829191,0.952028,0.611788,0.513984,0.829191,False
3,I165A:I183A:Y301V,3,0.853769,0.950556,0.829572,0.951779,0.613500,0.519742,0.829572,False
4,I165A:I183A:Y301V,4,0.852863,0.950462,0.828463,0.951828,0.610994,0.511768,0.828463,False
...,...,...,...,...,...,...,...,...,...,...
1200,Y301V,0,0.849901,0.949901,0.824901,0.950898,0.606845,0.508144,0.824901,False
1201,Y301V,1,0.851191,0.950126,0.826457,0.950941,0.612418,0.512239,0.826457,False
1202,Y301V,2,0.850100,0.949950,0.825137,0.951009,0.609175,0.509415,0.825137,False
1203,Y301V,3,0.850774,0.949971,0.825975,0.950927,0.610762,0.510504,0.825975,False


In [7]:
# Keep only the replicates without inter-chain clashes ...
filtered_df = df[df["has_inter_chain_clashes"].eq(False)]

# ... then average the numeric scores of the remaining replicates per variant.
# The mean of the replicate number carries no meaning, so it is dropped.
mean_df = (
    filtered_df.groupby("var")
    .mean(numeric_only=True)
    .drop(columns=["rep"])
    .reset_index()
)
mean_df

Unnamed: 0,var,aggregate_score,ptm,iptm,chain_ptm_A,chain_ptm_B,chain_iptm_AB,chain_iptm_BA,has_inter_chain_clashes
0,I165A:I183A:Y301V,0.853671,0.950425,0.829483,0.951771,0.611667,0.514088,0.829483,0.0
1,I165A:I183D:Y301L,0.841553,0.945489,0.815569,0.946526,0.614877,0.507755,0.815569,0.0
2,I165A:I183G:Y301C,0.850305,0.951961,0.824890,0.953809,0.608488,0.502199,0.824890,0.0
3,I165A:I183G:Y301L,0.846176,0.947347,0.820884,0.947895,0.622362,0.519105,0.820884,0.0
4,I165A:I183M:Y301N,0.851084,0.950643,0.826194,0.952148,0.609538,0.502532,0.826194,0.0
...,...,...,...,...,...,...,...,...,...
236,Y301L,0.839990,0.946637,0.813328,0.947722,0.607897,0.504092,0.813328,0.0
237,Y301M,0.841571,0.948266,0.814897,0.949378,0.607432,0.503898,0.814897,0.0
238,Y301Q,0.842753,0.948624,0.816286,0.950278,0.601130,0.497075,0.816286,0.0
239,Y301T,0.848510,0.951083,0.822866,0.952654,0.602673,0.503218,0.822866,0.0
