# Retrieve lead author data
Parse the CIF file of each structure and take the last name listed under `_citation_author.name` as that structure's lead author.

In [1]:
from pathlib import Path
import pandas as pd
from collections import Counter
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from tqdm import tqdm

In [2]:
# Directories holding the raw structure files, one per file format
raw_pdbs, raw_cifs = (
    Path("../data/raw/pdb_pairs/pdb_files/pdb"),
    Path("../data/raw/pdb_pairs/pdb_files/cif"),
)

# Materialise the directory listings so they can be counted and re-used
pdb_files = [*raw_pdbs.glob("*.pdb")]
cif_files = [*raw_cifs.glob("*.cif")]

# Report how many files of each format were found (PDB first, then CIF)
for found_files in (pdb_files, cif_files):
    print(len(found_files))


2849
10706


In [3]:
# Collect every PDB entry that appears on either side of a pair in the filtered table
df = pd.read_csv("../data/processed/pdb_pairs/filtered_df.csv")
pdbs_a, pdbs_b = (set(df[col]) for col in ("ENTRY_ID_ONE", "ENTRY_ID_TWO"))
valid_pdbs = pdbs_a | pdbs_b
print(f"Valid number of PDB IDs: {len(valid_pdbs)}")

Valid number of PDB IDs: 4803


In [4]:
# Keep only the CIF files whose lower-cased file stem is one of the PDB IDs in use
valid_cif_files = list(
    filter(lambda cif_path: cif_path.stem.lower() in valid_pdbs, cif_files)
)
print(f"{len(valid_cif_files)} valid CIF files")

4803 valid CIF files


In [5]:
# Map each structure (lower-cased file stem) to the last name listed under
# `_citation_author.name` in its CIF file; that name is used as the lead author.
structure_to_lead_author = {}
for cif_file in tqdm(valid_cif_files):
    pdb_id = cif_file.stem.lower()
    cif_record = MMCIF2Dict(cif_file)
    try:
        lead_author = cif_record["_citation_author.name"][-1]
    except KeyError:
        # No citation author record in this file: report it and store a placeholder
        print(f"File {cif_file} missing author name; adding placeholders")
        lead_author = "ANONYMOUS"
    structure_to_lead_author[pdb_id] = lead_author

 96%|█████████████████████████████████████▎ | 4595/4803 [15:56<00:52,  3.96it/s]

File ../data/raw/pdb_pairs/pdb_files/cif/3UVR.cif missing author name; adding placeholders


100%|███████████████████████████████████████| 4803/4803 [16:49<00:00,  4.76it/s]


In [6]:
# The 3UVR CIF file has no citation author record (see the message printed above),
# so its lead author is filled in by hand from https://www.rcsb.org/structure/3UVR
structure_to_lead_author.update({"3uvr": "Rauh, D."})

In [7]:
# Tally how many structures are attributed to each lead author
author_counter = Counter()
author_counter.update(structure_to_lead_author.values())

# One row per structure (index = PDB ID, single column = lead author), saved for later use
author_df = pd.DataFrame.from_dict(
    structure_to_lead_author,
    orient="index",
    columns=["lead_author"],
)
author_df.to_csv("../data/processed/pdb_pairs/author_df.csv")

# Wrangle the data for input to a model

In [9]:
# Every sub-directory of the RMSD results folder is processed below
rmsds_path = Path("../results/rmsds")
subdirs = list(filter(Path.is_dir, rmsds_path.iterdir()))

In [10]:
# Build the GLM input table: one row per chain pair holding its RMSD, the
# phosphorylation flag of each side, and whether both structures share a lead author.
columns = ["structure_pair","rmsd","is_phospho_a","is_phospho_b","same_lead_author"]

# (is_phospho_a, is_phospho_b) flags implied by each value of the "Group" column
group_to_phospho_flags = {
    "between_groups": (0, 1),
    "within_phospho": (1, 1),
    "within_nonphospho": (0, 0),
}

glm_rows = []
for subdir in tqdm(subdirs):
    # Read every identifier column as a string so pandas cannot infer a numeric
    # dtype for it; the "_" concatenation below needs strings on both sides.
    rmsd_df = pd.read_csv(subdir / "rmsds_df.csv",index_col=0,
                          dtype={'PDB_ID_A':'string',
                                 'PDB_ID_B': 'string',
                                 'Chain_A': 'string',
                                 'Chain_B': 'string'})

    # Chain-level identifiers of the form "<pdb id>_<chain>"
    rmsd_df["ID_A"] = rmsd_df["PDB_ID_A"] + "_" + rmsd_df["Chain_A"]
    rmsd_df["ID_B"] = rmsd_df["PDB_ID_B"] + "_" + rmsd_df["Chain_B"]

    for _, pair in rmsd_df.iterrows():
        chain_a = pair["ID_A"]
        chain_b = pair["ID_B"]
        # structure_to_lead_author is keyed by lower-cased PDB ID, so normalise
        # the case here as well before looking the structures up.
        pdb_code_a = chain_a.split("_")[0].lower()
        pdb_code_b = chain_b.split("_")[0].lower()

        structure_pair = "$".join([chain_a, chain_b])
        rmsd = pair["RMSD"]
        group = pair["Group"]

        lead_author_a = structure_to_lead_author[pdb_code_a]
        lead_author_b = structure_to_lead_author[pdb_code_b]

        # NOTE(review): two structures that both carry the "ANONYMOUS" placeholder
        # compare equal here and would be flagged as sharing a lead author.
        same_lead_author = int(lead_author_a == lead_author_b)

        if group not in group_to_phospho_flags:
            raise ValueError(f"Invalid group value {group}")
        is_phospho_a, is_phospho_b = group_to_phospho_flags[group]

        glm_rows.append(
            [structure_pair, rmsd, is_phospho_a, is_phospho_b, same_lead_author]
        )

glm_df = pd.DataFrame(glm_rows,columns=columns)

In [None]:
# Save the complete GLM input table
glm_csv = "../data/processed/glm_control_input/glm_df.csv"
glm_df.to_csv(glm_csv)

In [None]:
# Subset of pairs in which both sides are flagged as phosphorylated
both_phospho = glm_df[["is_phospho_a", "is_phospho_b"]].eq(1).all(axis=1)
phospho_df = glm_df.loc[both_phospho]
phospho_df.to_csv("../data/processed/glm_control_input/phospho_df.csv")

In [None]:
# Subset of pairs in which neither side is flagged as phosphorylated
neither_phospho = glm_df[["is_phospho_a", "is_phospho_b"]].eq(0).all(axis=1)
nonphospho_df = glm_df.loc[neither_phospho]
nonphospho_df.to_csv("../data/processed/glm_control_input/nonphospho_df.csv")