In [28]:
# Load the antibody/antigen table from the tab-separated source file
import pandas as pd

tsv_path = "../data/ab_ag.tsv"
df = pd.read_csv(tsv_path, delimiter="\t")

# Dimensions of the raw table as (rows, columns)
print(df.shape)

# List the columns that are available
print(df.columns)



(5523, 30)
Index(['pdb', 'Hchain', 'Lchain', 'model', 'antigen_chain', 'antigen_type',
       'antigen_het_name', 'antigen_name', 'short_header', 'date', 'compound',
       'organism', 'heavy_species', 'light_species', 'antigen_species',
       'authors', 'resolution', 'method', 'r_free', 'r_factor', 'scfv',
       'engineered', 'heavy_subclass', 'light_subclass', 'light_ctype',
       'affinity', 'delta_g', 'affinity_method', 'temperature', 'pmid'],
      dtype='object')


In [29]:
# Keep only the columns that are needed for the following steps
columns_to_keep = [
    "pdb",
    "Hchain",
    "Lchain",
    "antigen_species",
    "heavy_subclass",
    "light_subclass",
    "antigen_type",
    "antigen_name",
    "organism",
    "resolution",
    "light_ctype",
]
df_cleaned = df[columns_to_keep]

# Show the columns that remain after the selection
print(df_cleaned.columns)

Index(['pdb', 'Hchain', 'Lchain', 'antigen_species', 'heavy_subclass',
       'light_subclass', 'antigen_type', 'antigen_name', 'organism',
       'resolution', 'light_ctype'],
      dtype='object')


In [30]:
# Remove every row that contains a NaN in any of the retained columns
df_cleaned = df_cleaned.dropna(axis=0, how="any")

# Sanity check: row count of the raw table versus the cleaned table
print("Vorher:", df.shape[0])
print("Nachher:", df_cleaned.shape[0])



Vorher: 5523
Nachher: 5257


In [31]:
# Inspect the dimensions (rows, columns) of the cleaned table
cleaned_shape = df_cleaned.shape
print(cleaned_shape)


(5257, 11)


In [None]:
# Step 1: keep only the rows whose resolution value is at most MAX_RESOLUTION.
# The cutoff is named so it can be tuned in one place.
MAX_RESOLUTION = 3.0

# The column is first converted to numbers; entries that cannot be parsed become
# NaN (errors='coerce') and are dropped by the comparison below, because
# NaN <= x is False.
# .assign() builds a new frame instead of writing a column into df_cleaned, which
# was derived from df by column selection and dropna -- assigning into such a
# derived frame is what triggers pandas' SettingWithCopyWarning.
# The final .copy() makes the result an independent frame for later cells.
df_cleaned = (
    df_cleaned
    .assign(resolution=lambda frame: pd.to_numeric(frame['resolution'], errors='coerce'))
    .loc[lambda frame: frame['resolution'] <= MAX_RESOLUTION]
    .copy()
)

# Inspect the dimensions (rows, columns) after the filter
print(df_cleaned.shape)

(2799, 11)


In [None]:
# Step 2: list which values occur in the antigen_type column and how often
antigen_type_counts = df_cleaned['antigen_type'].value_counts()
print(antigen_type_counts)
# No further filtering is needed here

antigen_type
protein                        2523
protein | protein               217
protein | protein | protein      28
protein | peptide                14
peptide | protein                 7
peptide                           6
protein | protein | peptide       2
peptide | protein | protein       1
protein | peptide | protein       1
Name: count, dtype: int64


In [38]:
# Step 3: remove rows with a missing Hchain, Lchain or antigen_name.
# Nothing to do: these rows were already removed when the NaN rows were dropped.

df_cleaned.head(n=5)


Unnamed: 0,pdb,Hchain,Lchain,antigen_species,heavy_subclass,light_subclass,antigen_type,antigen_name,organism,resolution,light_ctype
3,8uzp,H,L,mus musculus,IGHV1,IGLV1,protein,stem_mimetic_01,Homo sapiens; Mus musculus,2.711,Lambda
4,8uzp,A,B,mus musculus,IGHV1,IGLV1,protein,stem_mimetic_01,Homo sapiens; Mus musculus,2.711,Lambda
5,8veb,G,I,influenza a virus,IGHV4,IGKV1,protein,hemagglutinin,Homo sapiens; Influenza A virus,2.97,Kappa
6,8veb,H,L,influenza a virus,IGHV4,IGKV1,protein,hemagglutinin,Homo sapiens; Influenza A virus,2.97,Kappa
8,8ved,H,L,influenza a virus,IGHV4,IGKV2,protein,hemagglutinin,Homo sapiens; Influenza A virus,2.98,Kappa


In [39]:
# Step 4: remove redundant structures

# Collect each PDB ID from the 'pdb' column once, in order of first appearance
# (the IDs are taken as they are, without lowercasing them again)
unique_pdb_column = df_cleaned['pdb'].drop_duplicates()
pdb_ids = unique_pdb_column.tolist()

print(f"Es sind {len(pdb_ids)} einzigartige PDB-IDs zum Herunterladen.")


Es sind 1480 einzigartige PDB-IDs zum Herunterladen.


In [40]:
from Bio.PDB import PDBList
import os

pdbl = PDBList()

# Folder in which the PDB files are stored
download_folder = "../data/pdb_files"

# Create the folder if it does not exist yet
os.makedirs(download_folder, exist_ok=True)

# Download all PDB files and remember every ID for which no file ended up on disk.
# NOTE(review): retrieve_pdb_file appears to report a failed download only through
# a printed message and still return a path -- confirm for the installed Biopython
# version. The loop therefore checks the returned path itself instead of trusting
# the call, so missing structures are not silently carried into later steps.
failed_pdb_ids = []
for pdb_id in pdb_ids:
    print(f"Lade PDB {pdb_id} herunter...")
    pdb_path = pdbl.retrieve_pdb_file(pdb_id, pdir=download_folder, file_format='pdb')
    if not pdb_path or not os.path.exists(pdb_path):
        failed_pdb_ids.append(pdb_id)

# Summary so failures are visible without scrolling through the per-file log
print(f"{len(pdb_ids) - len(failed_pdb_ids)} von {len(pdb_ids)} PDB-Dateien vorhanden.")
if failed_pdb_ids:
    print(f"Fehlgeschlagen ({len(failed_pdb_ids)}): {failed_pdb_ids}")


Lade PDB 8uzp herunter...
Downloading PDB structure '8uzp'...
Lade PDB 8veb herunter...
Downloading PDB structure '8veb'...
Lade PDB 8ved herunter...
Downloading PDB structure '8ved'...
Lade PDB 9dpc herunter...
Downloading PDB structure '9dpc'...
Lade PDB 9dru herunter...
Downloading PDB structure '9dru'...
Lade PDB 9ds1 herunter...
Downloading PDB structure '9ds1'...
Lade PDB 9mer herunter...
Downloading PDB structure '9mer'...
Lade PDB 9mev herunter...
Downloading PDB structure '9mev'...
Lade PDB 8rmx herunter...
Downloading PDB structure '8rmx'...
Lade PDB 8rmy herunter...
Downloading PDB structure '8rmy'...
Lade PDB 8ykt herunter...
Downloading PDB structure '8ykt'...
Lade PDB 9azr herunter...
Downloading PDB structure '9azr'...
Lade PDB 9azt herunter...
Downloading PDB structure '9azt'...
Lade PDB 9bjg herunter...
Downloading PDB structure '9bjg'...
Lade PDB 9bjh herunter...
Downloading PDB structure '9bjh'...
Lade PDB 9cci herunter...
Downloading PDB structure '9cci'...
Lade PDB