In [2]:
import os
import pandas as pd

from Bio.PDB import PDBParser, MMCIFParser
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBList

from loop.pdb_files import get_PDBfiles
from loop.dssp import save_SS
from utils.common import load_tab, save_tab, dump_dicts2jsons, dump_dict2json, read_json2list
from params import *

# Data Analysis (DI work here):
1. Re-define the data set by choosing a way of filtering these PDBs out. Lucía and July suggest filtering out all regions smaller than 30 residues as a safe choice. You can additionally filter out structures with low resolution (only keep data from resolution value <= 5Å) if this is easy to do. If it is not straightforward, I would recommend just apply the length filtering. 

2. Re-run the M2O
3. Re-train the ANN
4. Re-do T-SNE plots

In [3]:
df_domain_scop = load_tab(path_tab_scop_FA_continuous_uni_multi_domian)

In [4]:
df_domain_scop[:2]

df_domain_scop[df_domain_scop['FA-PDBID']=='1F1U']

Unnamed: 0,FA-DOMID,FA-PDBID,FA-PDBREG,FA-UNIID,FA-UNIREG,FA,FA-UNIREG-START,FA-UNIREG-END,FA-PDBREG-START,FA-PDBREG-END,FA-CHAINID
595,8032048,1F1U,A:148-323,H0QJ69,148-323,4001093,148,323,148,323,A
596,8032047,1F1U,A:2-147,H0QJ69,2-147,4001093,2,147,2,147,A


In [20]:
# generate seq_id -> 'pdbID_chainID'

df_domain_scop['seq_id'] = [f"{r['FA-PDBID'].lower()}_{r['FA-CHAINID']}" for _, r in df_domain_scop.iterrows()]
df_domain_scop['length'] = [r['FA-UNIREG-END']-r['FA-UNIREG-START']+1 for _, r in df_domain_scop.iterrows()]
len(df_domain_scop['seq_id'].unique()), len(df_domain_scop['FA-PDBID'].unique())

(2867, 2734)

## 1. resolution value <= 5A

Note: no PDB file: 3jaq.cif (Not available anymore).

### 1.1. get resolution values from PDB files

In [21]:
def classify_structure(pdbid, path_pdb_cif):
    """
    Classify a PDB .cif file into X-ray, EM, or NMR, with resolution handling.

    Parameters:
        cif_path (str): Path to the .cif file.

    Returns:
        dict: {'experiment_type': str, 'resolution': float}
              Resolution is 0 for NMR, otherwise taken from CIF.
              If resolution missing for X-ray/EM, set to None.
    """
    cif_path = f"{path_pdb_cif}/{pdbid.lower()}.cif"
    parser = MMCIFParser(QUIET=True)
    result = {'pdbid': pdbid, 'experiment_type': None, 'resolution': None}

    try:
        # Parse the file (this loads mmCIF metadata)
        parser.get_structure("temp", cif_path)
        mmcif_dict = parser._mmcif_dict

        # Get experimental method (usually in _exptl.method)
        if '_exptl.method' in mmcif_dict:
            method = mmcif_dict['_exptl.method'][0].upper()
        else:
            method = "UNKNOWN"

        # Classify experiment type
        if "X-RAY" in method:
            result['experiment_type'] = "XRay"
        elif "ELECTRON MICROSCOPY" in method or "EM" in method:
            result['experiment_type'] = "EM"
        elif "NMR" in method:
            result['experiment_type'] = "NMR"
        else:
            result['experiment_type'] = method  # store raw if unrecognized

        # Resolution logic
        if result['experiment_type'] == "NMR":
            result['resolution'] = 0.0
        else:
            # Try to get resolution from mmCIF
            if '_refine.ls_d_res_high' in mmcif_dict:
                try:
                    result['resolution'] = float(mmcif_dict['_refine.ls_d_res_high'][0])
                except (ValueError, IndexError):
                    result['resolution'] = None
            else:
                result['resolution'] = None

    except Exception as e:
        print(f"Error processing {cif_path}: {e}")

    return result


In [22]:
pid = '1A5J'
classify_structure(pid, path_pdb_cif)

{'pdbid': '1A5J', 'experiment_type': 'NMR', 'resolution': 0.0}

In [None]:
# list_pdbid = df_domain_scop_long['FA-PDBID'].unique()
list_pdbid = df_domain_scop['FA-PDBID'].unique()
len(list_pdbid)

list_experiment_resolution = []
for pid in list_pdbid:
    list_experiment_resolution.append(classify_structure(pid, path_pdb_cif))
    
dump_dict2json(list_experiment_resolution, 'list_experiment_resolution.json')


Error processing /home/dimeng/project/linker/domain_linker_v2/data/recollect_dsAll/pdb/mmcif/3jaq.cif: [Errno 2] No such file or directory: '/home/dimeng/project/linker/domain_linker_v2/data/recollect_dsAll/pdb/mmcif/3jaq.cif'


### 1.2. add resolution values to the SCOPE domain table

In [None]:
list_experiment_resolution = read_json2list('list_experiment_resolution.json')
df_pdb_resolution = pd.DataFrame(list_experiment_resolution)

df_merged = (
    df_domain_scop
    .merge(df_pdb_resolution, left_on='FA-PDBID', right_on='pdbid', how='inner')
    .drop(columns=['pdbid'])
)
save_tab(df_merged, path_tab_scop_FA_continuous_uni_multi_domian_withResolution)

### 1.3. check experimental types and numbers

In [None]:
df_merged = load_tab(path_tab_scop_FA_continuous_uni_multi_domian_withResolution)

df_nmr = df_merged[df_merged['experiment_type']=='NMR']
df_xray = df_merged[df_merged['experiment_type']=='XRay']
df_em = df_merged[df_merged['experiment_type']=='EM']
df_ec = df_merged[df_merged['experiment_type']=='ELECTRON CRYSTALLOGRAPHY']

df_other = df_merged[~df_merged['experiment_type'].isin(['NMR', 'XRay', 'EM', 'ELECTRON CRYSTALLOGRAPHY'])]

df_nmr.shape, df_xray.shape, df_em.shape, df_ec.shape, df_other.shape

Index(['FA-DOMID', 'FA-PDBID', 'FA-PDBREG', 'FA-UNIID', 'FA-UNIREG', 'FA',
       'FA-UNIREG-START', 'FA-UNIREG-END', 'FA-PDBREG-START', 'FA-PDBREG-END',
       'FA-CHAINID', 'seq_id', 'length'],
      dtype='object')

In [6]:
len(df_nmr['seq_id'].unique()), len(df_xray['seq_id'].unique()), len(df_em['seq_id'].unique()), len(df_ec['seq_id'].unique()), len(df_other['seq_id'].unique())

(93, 2665, 104, 4, 1)

In [12]:
len(df_nmr['seq_id'].unique()), len(df_xray[df_xray['resolution']<=5]['seq_id'].unique()), len(df_em[df_em['resolution']<=5]['seq_id'].unique()), len(df_ec[df_ec['resolution']<=5]['seq_id'].unique()), len(df_other['seq_id'].unique())

(93, 2659, 14, 4, 1)

In [8]:
93+2665+104+4+1

2867

In [9]:
len(df_nmr['FA-PDBID'].unique()), len(df_xray['FA-PDBID'].unique()), len(df_em['FA-PDBID'].unique()), len(df_ec['FA-PDBID'].unique()), len(df_other['FA-PDBID'].unique())

(93, 2569, 69, 2, 1)

In [11]:
sum((93, 2569, 69, 2, 1))

2734

In [13]:
len(df_nmr['FA-PDBID'].unique()), len(df_xray[df_xray['resolution']<=5]['FA-PDBID'].unique()), len(df_em[df_em['resolution']<=5]['FA-PDBID'].unique()), len(df_ec[df_ec['resolution']<=5]['FA-PDBID'].unique()), len(df_other['FA-PDBID'].unique())

(93, 2565, 11, 2, 1)

In [16]:
df_merged_resolution5 = df_merged[df_merged['resolution']<=5]
len(df_merged_resolution5['seq_id'].unique()), len(df_merged_resolution5['FA-PDBID'].unique())

(2770, 2671)

In [22]:
# domain length less than 30
df_merged_length30 = df_merged_resolution5[df_merged_resolution5['length']>=30]
len(df_merged_length30['seq_id'].unique()), len(df_merged_length30['FA-PDBID'].unique())

(2765, 2666)

## 2. add domain length threshold

Exclude PDB chains contain short domains (len<30)

In [36]:
df_merged_resolution5 = df_merged[df_merged['resolution']<=5]
df_merged_short = df_merged_resolution5[df_merged_resolution5['length']<30]

list_short_seqid = list(df_merged_short['seq_id'].unique()) # PDB chains contain short domains
df_merged_resolution5_length30 = df_merged_resolution5[~df_merged_resolution5['seq_id'].isin(list_short_seqid)]

In [42]:
len(df_merged_resolution5_length30['seq_id'].unique()), len(df_merged_resolution5_length30['FA-PDBID'].unique())

(2712, 2615)

In [None]:
path_tab_scop_FA_continuous_uni_multi_domian_lenResolution_filtered = os.path.join(path_scop_folder, 'scop-cla-latest-table-FAcontinuousUniMultiDomain_lenResolution_filtered.tsv')
save_tab(df_merged_resolution5_length30, path_tab_scop_FA_continuous_uni_multi_domian_lenResolution_filtered)