In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import logging

# Configure the root logger so that INFO-level (and more severe) messages
# emitted by the functions below are actually displayed.
logging.basicConfig(level=logging.INFO)

def parse_peptiderive_scores(file_content):
    """
    Parse Peptiderive output file and extract residue positions and interface scores.

    Parsing starts on the line after the first table header containing
    '| Position | Interface score | Relative interface score (%)' and stops at
    the first following line that begins with '---'. Only that first table is
    read. Rows whose position or score cannot be converted to a number (for
    example the '|-----|' separator row) are skipped as a whole.

    Parameters:
    file_content (str): Content of the Peptiderive output file

    Returns:
    dict: Dictionary mapping each position (int) to its interface score
          (float). If a position appears more than once, the last row wins.
    """
    header_marker = '| Position | Interface score | Relative interface score (%)'
    scores = {}
    start_parsing = False

    for line in file_content.split('\n'):
        if header_marker in line:
            start_parsing = True
            continue

        if not start_parsing:
            continue

        # A line of dashes terminates the table.
        if line.startswith('---'):
            break

        if '|' not in line:
            continue

        parts = [part.strip() for part in line.split('|')]
        if len(parts) < 4:
            continue

        # Convert BOTH fields before storing anything. Recording the position
        # first and then failing on the score would leave positions and scores
        # out of step and shift every later score onto the wrong residue.
        try:
            position = int(parts[1])
            score = float(parts[2])
        except ValueError:
            continue

        scores[position] = score

    return scores

def create_combined_dataframe(directory="raw_output"):
    """
    Process all .txt files in the specified directory and create a DataFrame
    with residues as rows and interface scores from each file as columns.

    Files are processed in sorted path order and numbered from 1; the scores
    of the N-th file go into column 'IF_Score_N'. The file-to-column mapping
    is written to the log because the column names do not carry the filename.
    Files that cannot be read are skipped with a warning (their number is not
    reused). Positions missing from a given file are filled with NaN.

    Parameters:
    directory (str): Path to the directory containing .txt files

    Returns:
    pd.DataFrame: Combined DataFrame with a 'Residue' column (sorted
                  ascending) and one 'IF_Score_N' column per processed file.
                  Contains only an empty 'Residue' column when no file
                  was found.
    """
    # glob() does not guarantee any ordering; sort so the IF_Score_N numbering
    # is reproducible across runs and machines.
    txt_files = sorted(Path(directory).glob("*.txt"))
    if not txt_files:
        logging.warning(f"No .txt files found in '{directory}'")

    all_positions = set()
    scores_by_index = {}

    for idx, file in enumerate(txt_files, 1):
        try:
            content = file.read_text()
        except (OSError, UnicodeDecodeError) as exc:
            # Best effort: one unreadable entry (vanished file, directory
            # named *.txt, permission problem, undecodable bytes) must not
            # abort the whole batch.
            logging.warning(f"Warning: could not read {file.name}: {exc}")
            continue

        file_data = parse_peptiderive_scores(content)
        if not file_data:
            logging.warning(
                f"No interface scores parsed from {file.name}; "
                f"IF_Score_{idx} will be all NaN"
            )

        scores_by_index[idx] = file_data
        all_positions.update(file_data)

        logging.info(f"Processed {file.name} -> IF_Score_{idx}")

    # One row per residue position seen in any file.
    residues = sorted(all_positions)
    df_dict = {'Residue': residues}

    # Dicts preserve insertion order, so columns come out in file order.
    for idx, file_data in scores_by_index.items():
        df_dict[f"IF_Score_{idx}"] = [file_data.get(pos, np.nan) for pos in residues]

    df = pd.DataFrame(df_dict)
    return df.sort_values('Residue').reset_index(drop=True)

def main():
    """
    Run the full pipeline: combine the Peptiderive results found in
    'raw_output', show a preview, write 'interface_scores.csv' and log the
    per-file mean interface score.
    """
    logging.info("Processing Peptiderive files...")

    # Build the residue-by-file score table
    combined = create_combined_dataframe("raw_output")

    # Preview
    logging.info("\nFirst few rows of combined data:")
    print(combined.head())

    # Persist to disk
    combined.to_csv('interface_scores.csv', index=False)
    logging.info("\nResults saved to 'interface_scores.csv'")

    # Summary statistics
    logging.info("\nBasic statistics:")
    logging.info(f"Total residues analyzed: {len(combined)}")
    logging.info("\nMean scores for each file:")
    score_columns = [name for name in combined.columns if name.startswith('IF_Score')]
    for name in score_columns:
        logging.info(f"{name}: {combined[name].mean():.3f}")

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


INFO:root:Processing Peptiderive files...
INFO:root:Processed 121944-2.1AA-complex.peptiderive.txt
INFO:root:Processed 121945-3.1AA-complex.peptiderive.txt
INFO:root:Processed 121946-6.1AA-complex.peptiderive.txt
INFO:root:Processed 121947-1.1AA-complex.peptiderive.txt
INFO:root:Processed 121948-12.1AA-complex.peptiderive.txt
INFO:root:Processed 121949-11.1AA-complex.peptiderive.txt
INFO:root:Processed 121950-10.1AA-complex.peptiderive.txt
INFO:root:Processed 121951-8.1AA-complex.peptiderive.txt
INFO:root:Processed 121952-5.1AA-complex.peptiderive.txt
INFO:root:Processed 121953-4.1AA-complex.peptiderive.txt
INFO:root:
First few rows of combined data:
INFO:root:
Results saved to 'interface_scores.csv'
INFO:root:
Basic statistics:
INFO:root:Total residues analyzed: 63
INFO:root:
Mean scores for each file:
INFO:root:IF_Score_1: -3.836
INFO:root:IF_Score_2: -2.607
INFO:root:IF_Score_3: -2.349
INFO:root:IF_Score_4: -2.795
INFO:root:IF_Score_5: -3.571
INFO:root:IF_Score_6: -2.183
INFO:root:I

   Residue  IF_Score_1  IF_Score_2  IF_Score_3  IF_Score_4  IF_Score_5  \
0       21      -2.182       0.040      -0.381      -0.120      -0.257   
1       22      -2.982      -0.142      -1.717      -2.813      -0.916   
2       23      -5.256      -2.020      -2.905      -3.040      -3.684   
3       24      -5.266      -2.024      -2.991      -2.950      -4.303   
4       25      -5.725      -2.098      -3.652      -3.160      -5.059   

   IF_Score_6  IF_Score_7  IF_Score_8  IF_Score_9  IF_Score_10  
0      -0.191      -0.279      -3.303      -2.616          NaN  
1      -1.033      -0.948      -5.689      -3.448          NaN  
2      -2.322      -3.926      -7.207      -5.683          NaN  
3      -2.355      -4.014      -8.164      -5.738          NaN  
4      -2.690      -4.815     -10.977      -5.734          NaN  
