<a href="https://colab.research.google.com/github/eoinleen/Biophysics-general/blob/main/Total_analysis_RFdiff_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
1  """
2  RFdiffusion Structure Analysis and Sequence Extraction Tool
3  ========================================================
4  Created: January 31, 2025
5  Authors: Original Analysis - Dr. Eoin Leen, University of Leeds
6           Visualization & Integration - Claude AI & Dr. Eoin Leen
7  Version: 2.0
8
9  Purpose:
10 --------
11 Combined pipeline for:
12 1. Structural analysis of PDB files
13 2. AF2 score visualization
14 3. Sequence extraction and formatting
15 4. Generation of publication-ready visualizations
16
17 Input Required:
18 -------------
19 1. Directory containing PDB files
20 2. af2_scores.csv file in same directory containing:
21    - design: Design number
22    - n: Sequence number
23    - seq: Sequences in format "sequence1/sequence2"
24    - i_pae: iPAE scores
25    - Other AF2 metrics
26
27 Output Generated:
28 ---------------
29 1. PowerPoint presentation with:
30    - Slide 1: Structure-function correlation plots
31    - Slide 2: iPAE score visualization
32    - Slide 3: Top 10 sequences by iPAE score
33    - Slide 4: Detailed interface analysis for structures with i_PAE < 7.5
34 2. Combined FASTA file with all sequences
35 3. CSV file with combined structural analysis
36
37 Analysis Parameters:
38 ------------------
39 1. Hydrogen Bonds:
40    - Distance cutoff: O-N distance < 3.5 Å
41    - Calculated between backbone atoms only
42    - Only inter-chain H-bonds counted
43
44 2. Salt Bridges:
45    - Distance cutoff: < 4.0 Å between any atoms of residue pairs
46    - Residue pairs considered:
47      * Acidic: ASP, GLU
48      * Basic: LYS, ARG, HIS
49    - Only inter-chain salt bridges counted
50
51 3. Hydrophobic Contacts:
52    - Distance cutoff: < 5.0 Å between any atoms of residue pairs
53    - Hydrophobic residues considered:
54      * ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO
55    - Only inter-chain contacts counted
56
57 4. Buried Surface Area:
58    - Calculated using FreeSASA algorithm
   - Uses FreeSASA's default classifier (ProtOr atomic radii, Tsai et al. 1999)
60    - Process:
61      * First calculates SASA for entire complex
62      * Then calculates SASA for each chain individually
63      * BSA = (Sum of individual chain SASAs - Complex SASA) / 2
64    - Units: Å²
65    - Inter-chain burial only (interface area)
66    - Probe radius: 1.4 Å (water molecule)
   - Resolution: FreeSASA default (Lee & Richards algorithm, 20 slices per atom)
68
69 5. Interface Analysis (for structures with i_PAE < 7.5):
70    - Core Region: Residues with >90% SASA burial upon complex formation
71    - Rim Region: Residues with 10-90% SASA burial
72    - Residue Classification:
73      * Hydrophobic: ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO
74      * Polar: SER, THR, ASN, GLN, TYR, CYS
75      * Charged: ASP, GLU, LYS, ARG, HIS
76    - SASA calculated using FreeSASA with default parameters
77      * Probe radius: 1.4 Å
78      * Per-residue SASA summed from atomic areas
79
80 6. Clash Score:
81    - Calculated as clashes per 1000 atoms
82    - Clash defined as: non-bonded atoms closer than sum of vdW radii - 0.4 Å
83    - Only inter-chain clashes considered
84    - Hydrogen atoms excluded
85    - Van der Waals radii used:
86      * C: 1.7 Å
87      * N: 1.55 Å
88      * O: 1.52 Å
89      * S: 1.8 Å
90      * P: 1.8 Å
91      * Halogens: F: 1.47 Å, Cl: 1.75 Å, Br: 1.85 Å, I: 1.98 Å
92 """
93
94 # Install required packages
95 !pip install -q biopython pandas freesasa numpy matplotlib seaborn python-pptx plotly kaleido
96
97 # Import required libraries
98 import os
99 import sys
100 import time
101 import pandas as pd
102 import matplotlib.pyplot as plt
103 from google.colab import files, drive
104 from pathlib import Path
105 from typing import Dict, List, Optional, Tuple, Any
106 from Bio import PDB
107 from Bio.PDB.PDBIO import PDBIO
108 from Bio.PDB.Polypeptide import is_aa
109 from Bio.PDB.Structure import Structure
110 import freesasa
111 import numpy as np
112 import seaborn as sns
113 from pptx import Presentation
114 from pptx.util import Inches, Cm, Pt
115 import plotly.graph_objects as go
116 from plotly.subplots import make_subplots
117
# Custom exception for structure validation
class StructureValidationError(Exception):
    """Signals that a file or a parsed structure failed a validation check."""
121 # ===============================
122 # Structure Analysis Functions
123 # ===============================
124
def validate_pdb_file(file_path: str) -> bool:
    """Check that ``file_path`` exists and looks like a PDB text file.

    A file is accepted when its first line contains ``HEADER``, ``ATOM`` or
    ``MODEL``, or, failing that, when any later line starts with an ``ATOM``
    or ``HETATM`` record.  The second rule lets through files that open with
    other records such as ``REMARK``, ``TITLE`` or ``CRYST1``.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file passes validation.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        StructureValidationError: If the file cannot be decoded as text or
            contains none of the expected records.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDB file not found: {file_path}")

    first_line_markers = ('HEADER', 'ATOM', 'MODEL')
    coordinate_records = ('ATOM', 'HETATM')
    try:
        with open(file_path, 'r') as handle:
            first_line = handle.readline()
            if any(marker in first_line for marker in first_line_markers):
                return True
            # The first record is something else; accept the file only if
            # coordinate records show up further down.
            if any(line.startswith(coordinate_records) for line in handle):
                return True
    except UnicodeDecodeError as exc:
        raise StructureValidationError(f"Not a valid text file: {file_path}") from exc
    raise StructureValidationError(f"Invalid PDB: {file_path}")
137
def safe_structure_load(parser: PDB.PDBParser, file_path: str) -> Optional[Structure]:
    """Parse ``file_path`` with ``parser``, reporting failures instead of raising.

    The file is validated first; the parsed structure must then contain at
    least one model, and the first model at least one chain.

    Returns:
        The parsed structure, or None after printing an error message when
        validation, parsing or either content check fails.
    """
    try:
        validate_pdb_file(file_path)
        loaded = parser.get_structure('protein', file_path)

        models = list(loaded.get_models())
        if not models:
            raise StructureValidationError("No models")

        first_model_chains = list(models[0].get_chains())
        if not first_model_chains:
            raise StructureValidationError("No chains")

        return loaded
    except Exception as err:
        # Best-effort loader: callers skip files for which None is returned.
        print(f"Error loading {file_path}: {str(err)}")
        return None
151
def calculate_buried_surface_area(pdb_file: str) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
    """Compute the surface area buried between the chains of a PDB file.

    The SASA of the whole file and of every chain on its own is computed
    with FreeSASA; the buried area is
    ``abs(sum(chain SASAs) - complex SASA) / 2``.

    Args:
        pdb_file: Path to the PDB file holding the complex.

    Returns:
        ``(buried_surface_area, chain_areas)`` where ``chain_areas`` maps
        chain id to that chain's isolated SASA, or ``(None, None)`` when the
        file cannot be loaded, has fewer than two chains, or the calculation
        fails (a message is printed in each case).
    """
    # Local import: scratch files live in a private directory that is removed
    # even when FreeSASA or the PDB writer raises.
    import tempfile

    parser = PDB.PDBParser(QUIET=True)
    structure = safe_structure_load(parser, pdb_file)
    if not structure:
        return None, None
    try:
        chains = list(structure.get_chains())
        if len(chains) < 2:
            print(f"Warning: {pdb_file} has fewer than 2 chains")
            return None, None

        complex_structure = freesasa.Structure(pdb_file)
        total_area = freesasa.calc(complex_structure).totalArea()

        chain_areas = {}
        io = PDBIO()
        with tempfile.TemporaryDirectory() as scratch_dir:
            for chain in chains:
                # Write a copy of the chain as a one-chain structure so that
                # FreeSASA sees it without its binding partner(s).
                new_structure = PDB.Structure.Structure('temp')
                new_model = PDB.Model.Model(0)
                new_structure.add(new_model)
                new_model.add(chain.copy())

                temp_file = os.path.join(scratch_dir, f"temp_chain_{chain.id}.pdb")
                io.set_structure(new_structure)
                io.save(temp_file)

                chain_structure = freesasa.Structure(temp_file)
                chain_areas[chain.id] = freesasa.calc(chain_structure).totalArea()

        total_individual_area = sum(chain_areas.values())
        buried_surface_area = abs(total_individual_area - total_area) / 2
        return buried_surface_area, chain_areas

    except Exception as e:
        print(f"Error calculating BSA for {pdb_file}: {str(e)}")
        return None, None
199
def calculate_hydrogen_bonds(structure: Structure) -> int:
    """Count inter-chain backbone O...N pairs closer than 3.5 Å.

    For every pair of chains (ordered by chain id so each pair is visited
    once) and every pair of amino-acid residues across them, a bond is
    counted when the backbone ``O`` of one residue lies within the cutoff of
    the backbone ``N`` of the other.  Both directions are checked, because
    the acceptor oxygen can sit on either chain.  Only the distance is
    tested; no angle criterion is applied.

    Args:
        structure: Parsed structure whose chains are compared.

    Returns:
        Number of O...N pairs found, or 0 if the calculation raises (the
        error is printed).
    """
    cutoff = 3.5
    try:
        h_bonds = 0
        for chain1 in structure.get_chains():
            for chain2 in structure.get_chains():
                if chain1.id >= chain2.id:
                    continue
                # Filter the partner chain once per chain pair.
                residues2 = [res for res in chain2.get_residues() if is_aa(res)]
                for res1 in chain1.get_residues():
                    if not is_aa(res1):
                        continue
                    for res2 in residues2:
                        # Acceptor O on chain1, donor N on chain2.
                        if 'O' in res1 and 'N' in res2:
                            if res1['O'] - res2['N'] < cutoff:
                                h_bonds += 1
                        # Acceptor O on chain2, donor N on chain1.
                        if 'N' in res1 and 'O' in res2:
                            if res1['N'] - res2['O'] < cutoff:
                                h_bonds += 1
        return h_bonds
    except Exception as e:
        print(f"Error calculating H-bonds: {str(e)}")
        return 0
def calculate_hydrophobic_contacts(structure: Structure) -> int:
    """
    Counts inter-chain contacts between hydrophobic residues.

    Residues taken as hydrophobic: ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO.
    A residue pair from two different chains is one contact when any of its
    atom-atom distances is below 5.0 Å.  Returns 0 (after printing the error)
    if the calculation raises.
    """
    try:
        apolar_names = {'ALA', 'VAL', 'LEU', 'ILE', 'MET', 'PHE', 'TRP', 'PRO'}
        cutoff = 5.0

        def is_apolar(residue) -> bool:
            return is_aa(residue) and residue.get_resname() in apolar_names

        def touches(res_a, res_b) -> bool:
            # True as soon as one atom pair falls under the cutoff.
            return any(
                atom_a - atom_b < cutoff
                for atom_a in res_a.get_atoms()
                for atom_b in res_b.get_atoms()
            )

        total = 0
        for first in structure.get_chains():
            for second in structure.get_chains():
                # Visit each unordered chain pair once.
                if first.id >= second.id:
                    continue
                for res_a in filter(is_apolar, first.get_residues()):
                    for res_b in filter(is_apolar, second.get_residues()):
                        if touches(res_a, res_b):
                            total += 1
        return total
    except Exception as e:
        print(f"Error calculating hydrophobic contacts: {str(e)}")
        return 0
252
def calculate_salt_bridges(structure: Structure) -> int:
    """
    Counts inter-chain salt bridges.

    A bridge is one acidic residue (ASP/GLU) and one basic residue
    (LYS/ARG/HIS) on different chains with any atom-atom distance below
    4.0 Å.  Returns 0 (after printing the error) if the calculation raises.
    """
    try:
        # -1 for acidic, +1 for basic; a product of -1 marks an opposite pair.
        charge_sign = {'ASP': -1, 'GLU': -1, 'LYS': 1, 'ARG': 1, 'HIS': 1}
        cutoff = 4.0
        bridge_count = 0

        for first in structure.get_chains():
            for second in structure.get_chains():
                # Visit each unordered chain pair once.
                if first.id >= second.id:
                    continue
                for res_a in first.get_residues():
                    if not is_aa(res_a):
                        continue
                    sign_a = charge_sign.get(res_a.get_resname(), 0)
                    for res_b in second.get_residues():
                        if not is_aa(res_b):
                            continue
                        sign_b = charge_sign.get(res_b.get_resname(), 0)
                        if sign_a * sign_b != -1:
                            continue
                        within_cutoff = any(
                            atom_a - atom_b < cutoff
                            for atom_a in res_a.get_atoms()
                            for atom_b in res_b.get_atoms()
                        )
                        if within_cutoff:
                            bridge_count += 1
        return bridge_count
    except Exception as e:
        print(f"Error calculating salt bridges: {str(e)}")
        return 0
287
def save_results_as_df(results: List[Dict[str, Any]], output_file: str) -> pd.DataFrame:
    """
    Converts analysis results to a DataFrame and saves it to CSV.

    The design and variant numbers are parsed from each ``file_name``
    (``design<D>..._n<N>.pdb``).  Entries whose file name cannot be parsed,
    or that lack one of the metric keys, are reported and skipped.  A missing
    (falsy) buried surface area is stored as 0.

    Args:
        results: One dict per structure with the keys ``file_name``,
            ``buried_surface_area``, ``hydrogen_bonds``,
            ``hydrophobic_contacts`` and ``salt_bridges``.
        output_file: Destination CSV path.

    Returns:
        DataFrame sorted by ``design`` then ``n``.  When no entry is usable
        the frame is empty but still carries the expected columns.
    """
    columns = [
        'design', 'n', 'buried_surface_area',
        'hydrogen_bonds', 'hydrophobic_contacts', 'salt_bridges',
    ]
    analysis_data = []
    for result in results:
        filename = result['file_name'].replace('.pdb', '')
        try:
            design_num = int(filename.split('design')[1].split('_')[0])
            variant_num = int(filename.split('_n')[1])
            analysis_data.append({
                'design': design_num,
                'n': variant_num,
                'buried_surface_area': result['buried_surface_area'] or 0,
                'hydrogen_bonds': result['hydrogen_bonds'],
                'hydrophobic_contacts': result['hydrophobic_contacts'],
                'salt_bridges': result['salt_bridges']
            })
        except (IndexError, ValueError, KeyError) as e:
            print(f"Error parsing filename {filename}: {str(e)}")
            continue

    # Passing the columns explicitly keeps the sort below valid when every
    # entry was skipped (or ``results`` was empty).
    df = pd.DataFrame(analysis_data, columns=columns)
    df = df.sort_values(['design', 'n']).reset_index(drop=True)
    df.to_csv(output_file, index=False)
    print(f"Saved structure analysis to {output_file}")
    return df
316
def merge_with_af2_scores(structure_df: pd.DataFrame, af2_scores_file: str) -> pd.DataFrame:
    """Left-joins the structural metrics onto the AF2 score table.

    The AF2 CSV is read from ``af2_scores_file``; every AF2 row is kept and
    matched on ``design`` and ``n``.  The result is sorted by those two
    columns and re-indexed from zero.
    """
    join_keys = ['design', 'n']
    scores = pd.read_csv(af2_scores_file)
    return (
        scores
        .merge(structure_df, on=join_keys, how='left')
        .sort_values(join_keys)
        .reset_index(drop=True)
    )
323 # ===============================
324 # Interface Analysis Functions
325 # ===============================
326
def calculate_residue_sasa(structure: Structure, chain_id: str, complex: bool = True) -> Dict[str, float]:
    """
    Calculates the total SASA of each residue in one chain.

    Args:
        structure: PDB Structure object; model 0 is used.
        chain_id: Identifier of the chain whose residues are reported.
        complex: If True, SASA is computed with all chains of ``structure``
            present; if False, the chain is written out alone first so its
            residues are measured in isolation.

    Returns:
        Dictionary mapping ``"<RESNAME>_<residue number>"`` to the residue's
        total SASA.  Residues for which FreeSASA reports no area are omitted.
    """
    # Local import: the scratch PDB lives in a private directory that is
    # removed even if writing or the SASA calculation fails.
    import tempfile

    if not complex:
        # Build a one-chain structure from a *copy* of the chain; adding the
        # chain object itself would re-parent it away from the caller's
        # structure.
        new_structure = PDB.Structure.Structure('temp')
        new_model = PDB.Model.Model(0)
        new_structure.add(new_model)
        new_model.add(structure[0][chain_id].copy())
        structure = new_structure

    io = PDBIO()
    io.set_structure(structure)
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_file = os.path.join(scratch_dir, f"temp_sasa_{chain_id}.pdb")
        io.save(temp_file)
        freesasa_struct = freesasa.Structure(temp_file)
        result = freesasa.calc(freesasa_struct)
        # residueAreas() is a nested dict: chain label -> residue-number
        # string -> per-residue area object.  Fetch it once, not per residue.
        areas_by_number = {
            str(number).strip(): area
            for number, area in result.residueAreas().get(chain_id, {}).items()
        }

    residue_sasa = {}
    for residue in structure[0][chain_id]:
        _, seq_number, insertion_code = residue.id
        area = areas_by_number.get(f"{seq_number}{insertion_code.strip()}")
        if area is None:
            # No FreeSASA entry for this residue, so there is nothing to report.
            continue
        residue_sasa[f"{residue.get_resname()}_{seq_number}"] = area.total
    return residue_sasa
369
def analyze_interface_details(structure: Structure) -> Dict:
    """
    Summarises the interface of a complex by residue burial.

    For every chain, each residue's SASA in the isolated chain is compared
    with its SASA in the complex.  Residues with less than 0.1 SASA in the
    isolated chain are ignored.  Of the rest, those losing more than 90% of
    their SASA count as core and those losing more than 10% (up to 90%) as
    rim.  Only residues in the hydrophobic, polar or charged name lists are
    tallied.

    Returns:
        Dict with ``core_count``, ``rim_count``, ``core_rim_ratio`` and the
        rounded percentage of each residue class in core and rim
        (``core_hydrophobic`` ... ``rim_charged``).
    """
    class_members = {
        'hydrophobic': ('ALA', 'VAL', 'LEU', 'ILE', 'MET', 'PHE', 'TRP', 'PRO'),
        'polar': ('SER', 'THR', 'ASN', 'GLN', 'TYR', 'CYS'),
        'charged': ('ASP', 'GLU', 'LYS', 'ARG', 'HIS'),
    }
    # Reverse lookup: residue name -> class label.
    class_of = {
        name: label
        for label, names in class_members.items()
        for name in names
    }

    core_tally = dict.fromkeys(class_members, 0)
    rim_tally = dict.fromkeys(class_members, 0)

    for chain in structure.get_chains():
        isolated_sasa = calculate_residue_sasa(structure, chain.id, complex=False)
        bound_sasa = calculate_residue_sasa(structure, chain.id, complex=True)

        for residue_id, free_area in isolated_sasa.items():
            # Residues with (almost) no exposed area in the free chain are skipped.
            if free_area < 0.1:
                continue

            bound_area = bound_sasa.get(residue_id, 0)
            burial_percent = (free_area - bound_area) / free_area * 100

            label = class_of.get(residue_id.split('_')[0])
            if label is None:
                continue

            if burial_percent > 90:
                core_tally[label] += 1
            elif burial_percent > 10:
                rim_tally[label] += 1

    n_core = sum(core_tally.values())
    n_rim = sum(rim_tally.values())
    # max(1, ...) keeps the divisions defined when a region is empty.
    core_denominator = max(1, n_core)
    rim_denominator = max(1, n_rim)

    summary = {
        'core_count': n_core,
        'rim_count': n_rim,
        'core_rim_ratio': n_core / rim_denominator,
    }
    regions = (
        ('core', core_tally, core_denominator),
        ('rim', rim_tally, rim_denominator),
    )
    for region, tally, denominator in regions:
        for label in ('hydrophobic', 'polar', 'charged'):
            summary[f'{region}_{label}'] = round(100 * tally[label] / denominator)

    return summary
437
def create_clash_score(structure: Structure) -> float:
    """
    Calculates the inter-chain clash score of a structure.

    A clash is a pair of atoms from two different chains that are closer
    than the sum of their van der Waals radii minus 0.4 Å.  Only atoms whose
    element has an entry in the radius table take part, which leaves out
    hydrogens.  Element symbols are compared case-insensitively so that
    upper-case two-letter symbols such as ``CL`` match the table.

    Args:
        structure: Parsed structure whose chains are compared pairwise.

    Returns:
        Clashes per 1000 considered atoms.  Every atom is counted once in
        the denominator, however many chain pairs it takes part in.
    """
    # Van der Waals radii (Å)
    vdw_radii = {
        'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8,
        'P': 1.8, 'F': 1.47, 'Cl': 1.75, 'Br': 1.85, 'I': 1.98
    }
    tolerance = 0.4

    def radius_of(atom):
        """Return the radius for the atom's element, or None if not tabulated."""
        symbol = (atom.element or '').strip().capitalize()
        return vdw_radii.get(symbol)

    # Resolve each atom's radius once, grouped by chain.
    atoms_by_chain = []
    for chain in structure.get_chains():
        entries = []
        for atom in chain.get_atoms():
            radius = radius_of(atom)
            if radius is not None:
                entries.append((atom, radius))
        atoms_by_chain.append(entries)

    clash_count = 0
    for i, first in enumerate(atoms_by_chain):
        for second in atoms_by_chain[i + 1:]:
            for atom1, radius1 in first:
                for atom2, radius2 in second:
                    if atom1 - atom2 < radius1 + radius2 - tolerance:
                        clash_count += 1

    # Count each atom once; summing per chain pair would inflate the
    # denominator for structures with more than two chains.
    total_atoms = sum(len(entries) for entries in atoms_by_chain)

    # Calculate clashes per 1000 atoms
    return (1000 * clash_count) / max(1, total_atoms)
480 # ===============================
481 # Visualization Functions
482 # ===============================
483
def _second_chain_sequence(seq_field) -> str:
    """Return the part after the first '/' of a 'seq' entry, or the whole entry if it has no '/'."""
    parts = str(seq_field).split('/')
    return (parts[1] if len(parts) > 1 else parts[0]).strip()


def _find_design_pdb(output_dir: str, design, variant) -> str:
    """Return the path of ``design<D>_n<N>.pdb``, preferring ``output_dir`` over its parent."""
    file_name = f"design{design}_n{variant}.pdb"
    candidate = os.path.join(output_dir, file_name)
    if os.path.exists(candidate):
        return candidate
    # Fall back to the parent directory, the location consulted previously.
    return os.path.join(os.path.dirname(output_dir), file_name)


def _add_correlation_slide(prs, df: pd.DataFrame, image_path: str) -> None:
    """Add a slide with six scatter plots of structural/AF2 metrics against i_PAE."""
    slide = prs.slides.add_slide(prs.slide_layouts[5])

    fig, axes = plt.subplots(3, 2, figsize=(8.27, 11.69))
    try:
        axes = axes.flatten()

        y_vars = ['i_ptm', 'rmsd', 'buried_surface_area',
                  'hydrogen_bonds', 'hydrophobic_contacts', 'salt_bridges']
        titles = ['iPTM', 'RMSD (Å)', 'Buried Surface Area (Å²)',
                  '# of Hydrogen Bonds', '# of Hydrophobic Contacts', '# of Salt Bridges']

        for ax, y_var, title in zip(axes, y_vars, titles):
            sns.scatterplot(data=df, x='i_pae', y=y_var, ax=ax, color='black', marker='x', s=16)
            ax.set_xlabel('i_PAE')
            ax.set_ylabel(title)
            ax.set_title(title)
            ax.set_facecolor('white')

        fig.patch.set_facecolor('white')
        fig.tight_layout()
        fig.savefig(image_path, bbox_inches='tight', dpi=300, facecolor='white')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    slide.shapes.add_picture(image_path, Cm(2), Cm(2))


def _add_ipae_slide(prs, df: pd.DataFrame, image_path: str) -> None:
    """Add a slide with four stacked bar charts of i_PAE, 512 rows of ``df`` per chart."""
    slide = prs.slides.add_slide(prs.slide_layouts[5])

    fig = make_subplots(
        rows=4,
        cols=1,
        vertical_spacing=0.08,
        subplot_titles=[f"Designs {i*8}-{(i+1)*8-1}" for i in range(4)]
    )

    sequences_per_design = 64
    rows_per_subplot = 8 * sequences_per_design  # 8 designs × 64 sequences = 512 rows per subplot
    colors = ['black', 'red']

    for i in range(4):
        start_idx = i * rows_per_subplot
        end_idx = start_idx + rows_per_subplot
        chunk = df.iloc[start_idx:end_idx].copy()

        for design_num in chunk['design'].unique():
            mask = chunk['design'] == design_num
            # Alternate bar colours so neighbouring designs can be told apart.
            color = colors[design_num % 2]

            fig.add_trace(
                go.Bar(
                    x=chunk[mask].index,
                    y=chunk[mask]['i_pae'],
                    showlegend=False,
                    marker_color=color,
                    width=1,
                ),
                row=i+1,
                col=1
            )

        fig.update_yaxes(
            range=[0, 30],
            title_text='iPAE' if i == 1 else None,
            row=i+1,
            col=1
        )

        design_numbers = sorted(chunk['design'].unique())
        fig.update_xaxes(
            tickmode='array',
            ticktext=design_numbers,
            # One tick in the middle of each design's run of bars.
            tickvals=[start_idx + (j*sequences_per_design) + sequences_per_design // 2
                      for j in range(len(design_numbers))],
            row=i+1,
            col=1,
            title_text='Design Number' if i == 3 else None
        )

    fig.update_layout(
        title='iPAE Scores by Design Number and Sequence (Scale: 0-30)',
        height=1000,
        width=1200,
        showlegend=False,
        margin=dict(t=50, b=50, r=150, l=50),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(image_path)
    slide.shapes.add_picture(image_path, Cm(1), Cm(1))


def _add_top_sequences_slide(prs, df: pd.DataFrame) -> None:
    """Add a slide listing the ten rows with the lowest i_PAE in FASTA-like form."""
    slide = prs.slides.add_slide(prs.slide_layouts[5])

    top_10_sequences = df.nsmallest(10, 'i_pae')[['design', 'n', 'i_pae', 'seq']]

    slide.shapes.title.text = "Top 10 Sequences (Lowest i_PAE Scores)"

    textbox = slide.shapes.add_textbox(Cm(2), Cm(4), Cm(17), Cm(20))
    text_frame = textbox.text_frame
    text_frame.clear()

    for _, row in top_10_sequences.iterrows():
        sequence = _second_chain_sequence(row['seq'])
        p = text_frame.add_paragraph()
        p.text = f">d{row['design']}n{row['n']} (i_PAE: {row['i_pae']:.4f})\n{sequence}"
        p.font.name = 'Courier New'
        p.font.size = Pt(8)
        p.line_spacing = 1.0


def _add_interface_slide(prs, df: pd.DataFrame, output_dir: str) -> None:
    """Add a slide with interface statistics for every row with i_PAE < 7.5."""
    slide = prs.slides.add_slide(prs.slide_layouts[5])

    slide.shapes.title.text = "Detailed Interface Analysis (Structures with i_PAE < 7.5)"

    textbox = slide.shapes.add_textbox(Cm(2), Cm(4), Cm(17), Cm(20))
    text_frame = textbox.text_frame
    text_frame.clear()

    low_ipae_structures = df[df['i_pae'] < 7.5].sort_values('i_pae')

    if len(low_ipae_structures) == 0:
        p = text_frame.add_paragraph()
        p.text = "No structures found with i_PAE < 7.5"
        p.font.name = 'Courier New'
        p.font.size = Pt(8)
        return

    parser = PDB.PDBParser(QUIET=True)

    for _, row in low_ipae_structures.iterrows():
        pdb_file = _find_design_pdb(output_dir, row['design'], row['n'])
        structure = safe_structure_load(parser, pdb_file)
        if not structure:
            # The loader has already printed the reason; skip this entry.
            continue

        interface_analysis = analyze_interface_details(structure)
        clash_score = create_clash_score(structure)

        p = text_frame.add_paragraph()
        p.text = (
            f"Structure d{row['design']}n{row['n']} (i_PAE: {row['i_pae']:.2f})\n"
            f"Buried Surface Area: {row['buried_surface_area']:.1f} Å²\n"
            f"Clash Score: {clash_score:.2f}\n"
            f"Interface Analysis:\n"
            f"  Core Residues: {interface_analysis['core_count']}\n"
            f"  Rim Residues: {interface_analysis['rim_count']}\n"
            f"  Core/Rim ratio: {interface_analysis['core_rim_ratio']:.2f}\n"
            f"  Core Composition:\n"
            f"    Hydrophobic: {interface_analysis['core_hydrophobic']}%\n"
            f"    Polar: {interface_analysis['core_polar']}%\n"
            f"    Charged: {interface_analysis['core_charged']}%\n"
            f"  Rim Composition:\n"
            f"    Hydrophobic: {interface_analysis['rim_hydrophobic']}%\n"
            f"    Polar: {interface_analysis['rim_polar']}%\n"
            f"    Charged: {interface_analysis['rim_charged']}%\n"
            f"----------------------------------------\n"
        )
        p.font.name = 'Courier New'
        p.font.size = Pt(8)
        p.line_spacing = 1.0


def create_pptx_plots(df: pd.DataFrame, output_dir: str, timestamp: str):
    """
    Creates a PowerPoint presentation with four slides:
    1. Structure-function correlation plots
    2. iPAE visualization
    3. Top 10 lowest iPAE sequences
    4. Detailed interface analysis for low iPAE structures

    Args:
        df: Combined table; the slides read the columns ``design``, ``n``,
            ``seq``, ``i_pae``, ``i_ptm``, ``rmsd``, ``buried_surface_area``,
            ``hydrogen_bonds``, ``hydrophobic_contacts`` and ``salt_bridges``.
        output_dir: Directory that receives the presentation and that is
            searched first for the ``design<D>_n<N>.pdb`` files of slide 4.
        timestamp: Text inserted into the presentation's file name.

    The presentation is saved as
    ``<output_dir>/<basename>_<timestamp>_analysis.pptx``.  The two temporary
    PNG files are removed whether or not the build succeeds.
    """
    # Initialize presentation
    prs = Presentation()
    prs.slide_width = Cm(21)
    prs.slide_height = Cm(29.7)

    temp_img1 = os.path.join(output_dir, 'temp_plots1.png')
    temp_img2 = os.path.join(output_dir, 'temp_plots2.png')

    try:
        print("Creating correlation plots...")
        _add_correlation_slide(prs, df, temp_img1)

        print("Creating iPAE visualization...")
        _add_ipae_slide(prs, df, temp_img2)

        print("Creating top 10 sequences slide...")
        _add_top_sequences_slide(prs, df)

        print("Creating interface analysis for low i_PAE structures...")
        _add_interface_slide(prs, df, output_dir)

        # normpath drops a trailing separator, which would otherwise leave
        # the basename empty.
        output_basename = os.path.basename(os.path.normpath(output_dir))
        pptx_path = os.path.join(output_dir, f"{output_basename}_{timestamp}_analysis.pptx")
        prs.save(pptx_path)
    finally:
        # Clean up temporary files, including after a failure part-way through.
        for temp_img in (temp_img1, temp_img2):
            if os.path.exists(temp_img):
                os.remove(temp_img)

    print(f"Saved PowerPoint to {pptx_path}")
693 # ===============================
694 # Main Processing Functions
695 # ===============================
696
def process_multiple_pdb_files(pdb_directory: str, af2_scores_file: Optional[str] = None) -> pd.DataFrame:
    """
    Main processing function that:
    1. Analyzes all PDB files in directory
    2. Merges with AF2 scores
    3. Generates visualizations and outputs
    4. Saves sequences to FASTA

    Args:
        pdb_directory: Directory containing the ``.pdb`` files; all outputs
            are written here and named after the directory.
        af2_scores_file: Optional path to the AF2 score CSV.  When it is
            missing or None only the structural CSV is produced.

    Returns:
        The merged AF2/structure table when AF2 scores were used, otherwise
        the structural table.  An empty DataFrame is returned when the
        directory has no PDB files or none of them could be loaded.

    Raises:
        FileNotFoundError: If ``pdb_directory`` does not exist.
    """
    if not os.path.exists(pdb_directory):
        raise FileNotFoundError(f"Directory not found: {pdb_directory}")

    # Get timestamp for file naming
    timestamp = time.strftime("%y%m%d")

    # Sorted so the processing order does not depend on the file system.
    pdb_files = sorted(f for f in os.listdir(pdb_directory) if f.endswith('.pdb'))

    if not pdb_files:
        print(f"No PDB files found in {pdb_directory}")
        return pd.DataFrame()

    total_files = len(pdb_files)
    print(f"Processing {total_files} PDB files...")

    results = []
    parser = PDB.PDBParser(QUIET=True)

    # Process each PDB file
    for idx, file_name in enumerate(pdb_files, 1):
        pdb_file = os.path.join(pdb_directory, file_name)
        print(f"Processing file {idx}/{total_files}: {file_name}")

        structure = safe_structure_load(parser, pdb_file)
        if not structure:
            continue

        # Calculate structural parameters
        buried_surface_area, chain_areas = calculate_buried_surface_area(pdb_file)
        results.append({
            'file_name': file_name,
            'buried_surface_area': buried_surface_area,
            'hydrogen_bonds': calculate_hydrogen_bonds(structure),
            'hydrophobic_contacts': calculate_hydrophobic_contacts(structure),
            'salt_bridges': calculate_salt_bridges(structure),
            'chain_areas': chain_areas
        })

    if not results:
        print(f"No PDB files could be loaded from {pdb_directory}")
        return pd.DataFrame()

    # normpath drops a trailing separator, which would otherwise leave the
    # basename (and so every output file prefix) empty.
    output_basename = os.path.basename(os.path.normpath(pdb_directory))

    # Save structural analysis
    structure_csv = os.path.join(pdb_directory, f"{output_basename}_{timestamp}_structure.csv")
    structure_df = save_results_as_df(results, structure_csv)

    # Without AF2 scores there is nothing to merge or plot.
    if not (af2_scores_file and os.path.exists(af2_scores_file)):
        return structure_df

    print(f"Merging with AF2 scores from {af2_scores_file}")
    final_df = merge_with_af2_scores(structure_df, af2_scores_file)

    # Save combined analysis
    combined_csv = os.path.join(pdb_directory, f"{output_basename}_{timestamp}_combined.csv")
    final_df.to_csv(combined_csv, index=False)
    print(f"Saved combined results to {combined_csv}")

    # Create PowerPoint plots
    create_pptx_plots(final_df, pdb_directory, timestamp)

    # Save sequences to FASTA
    fasta_path = os.path.join(pdb_directory, f"{output_basename}_{timestamp}_sequences.fasta")
    with open(fasta_path, 'w') as f:
        for _, row in final_df.iterrows():
            seq_field = row['seq']
            if not isinstance(seq_field, str):
                # Missing sequence (e.g. NaN): there is nothing to write.
                continue
            # The part after the first '/' is exported; an entry without a
            # '/' is written whole.
            seq_parts = seq_field.split('/')
            sequence = (seq_parts[1] if len(seq_parts) > 1 else seq_parts[0]).strip()
            header = f">d{row['design']}n{row['n']}"
            f.write(f"{header}\n{sequence}\n")
    print(f"Saved sequences to {fasta_path}")

    return final_df
776
777 # ===============================
778 # Main Execution
779 # ===============================
780
if __name__ == "__main__":
    # The input directory lives on Google Drive, so mount it first.
    drive.mount('/content/drive')

    # Directory holding the PDB files and, optionally, af2_scores.csv
    pdb_directory = '/content/drive/MyDrive/PDB-files/202501xx/3NOB-70-110-all_pdb'  # Update this path
    af2_scores_path = os.path.join(pdb_directory, 'af2_scores.csv')

    # Without the score file only the structural analysis is produced.
    if not os.path.exists(af2_scores_path):
        af2_scores_path = None
        print("No AF2 scores file found - will generate structure analysis only")

    print(
        "\nStarting analysis...",
        f"Processing PDB files from: {pdb_directory}",
        sep="\n",
    )

    try:
        results_df = process_multiple_pdb_files(pdb_directory, af2_scores_path)
    except Exception as err:
        # Report the failure, then re-raise so the traceback stays visible.
        print(f"\nError during analysis: {str(err)}")
        raise
    else:
        print("\nAnalysis completed successfully!")
