In [2]:
import pandas as pd

# Load the semicolon-separated search results; index_col=False keeps the
# first column as data instead of using it as the index.
df = pd.read_csv("search.csv", sep=";", index_col=False)

# Coerce the columns used for filtering to numeric dtypes, one at a time.
for column in ("RMSD", "Nalgn", "Z-score", "Q-score"):
    df[column] = pd.to_numeric(df[column])

# Show four random rows as a quick sanity check of the parsed table.
df.sample(4)

Unnamed: 0,num,Q-score,P-score,Z-score,RMSD,Nalgn,Nsse,Ngaps,Seq-%,Nmd,Nres-Q,Nsse-Q,Nres-T,Nsse-T,code
409,410,0.4852,1.524,3.437,0.362,8,1,0,0.0,0,13,1,10,1,1xow:B
1814,1815,0.3322,1.718,3.662,0.615,9,1,0,0.2222,0,13,1,18,1,3e8c:G
619,620,0.537,1.775,3.725,0.41,8,1,0,0.125,0,13,1,9,1,5e0x:C
5736,5737,0.4035,1.725,3.669,1.804,10,1,0,0.1,0,13,1,14,1,6zrc:Q


In [3]:
# Selection thresholds, named so they can be tuned in one place.
MAX_RMSD = 0.5
MIN_NALGN = 11
MIN_Z_SCORE = 4.9
MIN_Q_SCORE = 0.8

# A row is kept only if it passes all four thresholds.
mask = (
    (df["RMSD"] <= MAX_RMSD)
    & (df["Nalgn"] >= MIN_NALGN)
    & (df["Z-score"] >= MIN_Z_SCORE)
    & (df["Q-score"] >= MIN_Q_SCORE)
)

# .copy() makes `candidates` an independent frame: later cells add columns to
# it, and writing into a .loc subset of `df` can raise SettingWithCopyWarning
# on pandas versions without copy-on-write.
candidates = df.loc[
    mask, ["num", "RMSD", "Nalgn", "Z-score", "code", "Q-score", "P-score"]
].copy()
candidates

Unnamed: 0,num,RMSD,Nalgn,Z-score,code,Q-score,P-score
0,1,0.0,13,5.713,1ycr:B,1.0,3.97
80,81,0.25,11,5.016,3dab:D,0.8403,3.108
114,115,0.267,11,5.016,2z5s:P,0.8395,3.108
899,900,0.458,11,4.908,4hfz:B,0.8269,2.983
929,930,0.464,12,4.908,7nel:C,0.9015,2.983


In [4]:
import os
import requests
from Bio import PDB
from tqdm.auto import tqdm

# Create the working directories (no error if they already exist).
for folder in ("pdb_files", "fragments"):
    os.makedirs(folder, exist_ok=True)

In [5]:
# The PDB entry ID is the part of "code" before the ":" (e.g. "3dab:D" -> "3dab").
candidates["pdb"] = candidates["code"].str.split(":", expand=True)[0]

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely and hang the notebook.
DOWNLOAD_TIMEOUT = 30

os.makedirs("pdb_files", exist_ok=True)
for pdb in tqdm(candidates["pdb"].unique(), desc="PDB download"):
    local_path = f"pdb_files/{pdb}.pdb"

    # Files already on disk are reused, so re-running the cell is cheap.
    if os.path.exists(local_path):
        print(f"– Già presente: {pdb}.pdb")
        continue

    url = f"https://files.rcsb.org/download/{pdb}.pdb"
    resp = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    # Fail loudly on HTTP errors instead of saving an error page as a .pdb file.
    resp.raise_for_status()
    with open(local_path, "wb") as out:
        out.write(resp.content)
    print(f"✅ Scaricato {pdb}.pdb")

PDB download:   0%|          | 0/5 [00:00<?, ?it/s]

– Già presente: 1ycr.pdb
– Già presente: 3dab.pdb
– Già presente: 2z5s.pdb
– Già presente: 4hfz.pdb
– Già presente: 7nel.pdb


In [21]:
from Bio.PDB import PDBParser, PDBIO, Select

# Directory that receives the single-chain PDB files.
outdir = "pdb_chains"
os.makedirs(outdir, exist_ok=True)

# The chain ID is the part of "code" after the ":" (e.g. "3dab:D" -> "D").
code_parts = candidates["code"].str.split(":", expand=True)
candidates["chain"] = code_parts[1]

class ChainSelect(Select):
    """Selection filter that accepts only the chain with a given ID.

    Passed as ``select=`` to ``PDBIO.save`` so that only one chain is written.
    """

    def __init__(self, chain_id):
        """Store the ID of the chain to keep."""
        self.chain_id = chain_id

    def accept_chain(self, chain):
        """Return True when ``chain.id`` equals the stored chain ID."""
        wanted = self.chain_id
        return wanted == chain.id

parser = PDBParser(QUIET=True)
io     = PDBIO()

# One pass per candidate: only the entry ID and chain ID columns are needed.
for pdb_id, chain in zip(candidates["pdb"], candidates["chain"]):
    full_pdb = f"pdb_files/{pdb_id}.pdb"              # the complete PDB entry
    chain_pdb = f"{outdir}/{pdb_id}_{chain}.pdb"      # the single-chain output file

    if os.path.exists(full_pdb):
        # Parse the full structure and write back only the requested chain.
        io.set_structure(parser.get_structure(pdb_id, full_pdb))
        io.save(chain_pdb, select=ChainSelect(chain))
        print(f"✅ Estratta catena {chain} da {pdb_id}.pdb → {chain_pdb}")
    else:
        # Input file missing: report it and move on to the next candidate.
        print(f"⚠️  Non ho trovato {full_pdb}, salto.")

✅ Estratta catena B da 1ycr.pdb → pdb_chains/1ycr_B.pdb
✅ Estratta catena D da 3dab.pdb → pdb_chains/3dab_D.pdb
✅ Estratta catena P da 2z5s.pdb → pdb_chains/2z5s_P.pdb
✅ Estratta catena B da 4hfz.pdb → pdb_chains/4hfz_B.pdb
✅ Estratta catena C da 7nel.pdb → pdb_chains/7nel_C.pdb
