In [None]:
import Bio
import pandas as pd
from Bio.PDB import PDBParser
import os

### INPUTS

pdb_file = 'r1_HA_Michigan_2024_sym_INPUT.pdb'  # Replace with your PDB file
directory = "/home/iwe25/Franz/CEPI/zwaste/we/" # Replace with your path to .weights files
# NOTE(review): this value is not read as an output location by the code
# visible in this file; kept only so the name stays defined.
output_csv = '/results'

# Relative folder for the results; created up front so later writes
# into it cannot fail on a missing directory.
results_directory = 'results'
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and is a no-op when the folder already exists.
os.makedirs(results_directory, exist_ok=True)

### Read the amino-acid sequence from the PDB file


def pdb_to_dataframe(pdb_file):
    """Read the residue sequence of a PDB file into a DataFrame.

    All residues of the parsed structure are visited in file order.
    Residues whose hetero flag (first field of the residue id) is not
    blank, i.e. heteroatoms and waters, are skipped.

    Args:
        pdb_file: Path of the PDB file to parse.

    Returns:
        A pandas DataFrame with one row per kept residue and two columns:
        'RESIDUETYPE' (three-letter residue name) and '#POSNUM' (residue
        number taken from the residue id).
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)

    # NOTE(review): chain and model identifiers are not stored, so residues
    # from different chains/models that share a residue number produce rows
    # with the same '#POSNUM' -- confirm the input is numbered uniquely.
    residue_data = [
        [residue.resname, residue.id[1]]
        for residue in structure.get_residues()
        if residue.id[0] == ' '  # blank hetero flag == standard residue
    ]

    return pd.DataFrame(residue_data, columns=['RESIDUETYPE', '#POSNUM'])

# Build the native residue table from the input structure
df_native = pdb_to_dataframe(pdb_file=pdb_file)

# Print the parsed table as a quick sanity check
print(df_native)




def get_highest_non_matching_residue(posnum, native_residuetype, df_ref):
    """Return the best-weighted non-native residue type at one position.

    Args:
        posnum: Value looked up in the '#POSNUM' column of ``df_ref``.
        native_residuetype: Residue type that must NOT be returned.
        df_ref: DataFrame with the columns '#POSNUM', 'RESIDUETYPE' and
            'WEIGHT'.

    Returns:
        A ``(residuetype, weight)`` tuple for the row with the highest
        'WEIGHT' whose 'RESIDUETYPE' differs from ``native_residuetype``,
        or ``(None, None)`` when the position has no such row.
    """
    is_candidate = (
        (df_ref['#POSNUM'] == posnum)
        & (df_ref['RESIDUETYPE'] != native_residuetype)
    )
    candidates = df_ref[is_candidate]
    if candidates.empty:
        return None, None

    # Stable sort: rows with equal weights keep their order from df_ref,
    # so the pick is deterministic.
    best = candidates.sort_values(
        by='WEIGHT', ascending=False, kind='mergesort'
    ).iloc[0]
    return best['RESIDUETYPE'], best['WEIGHT']


def select_mutations(df_native, df_reference):
    """Pick, for every native position, the best non-native residue.

    Args:
        df_native: DataFrame with the columns 'RESIDUETYPE' and '#POSNUM'.
        df_reference: DataFrame with the columns '#POSNUM', 'RESIDUETYPE'
            and 'WEIGHT'.

    Returns:
        A DataFrame with the columns '#POSNUM', 'NATIVE_RESIDUETYPE',
        'SELECTED_RESIDUETYPE' and 'WEIGHT', one row per native position
        for which a non-native residue exists in ``df_reference``. The
        DataFrame has no columns at all when nothing was selected.
    """
    results = []
    for posnum, native_residuetype in zip(df_native['#POSNUM'],
                                          df_native['RESIDUETYPE']):
        selected_residue, weight = get_highest_non_matching_residue(
            posnum, native_residuetype, df_reference
        )
        if selected_residue is not None:
            results.append({
                '#POSNUM': posnum,
                'NATIVE_RESIDUETYPE': native_residuetype,
                'SELECTED_RESIDUETYPE': selected_residue,
                'WEIGHT': weight
            })
    return pd.DataFrame(results)


def process_and_save_top_entries(df_results, output_file, top_n=15):
    """Print and save the highest-weighted entries of a result table.

    Args:
        df_results: DataFrame containing a 'WEIGHT' column.
        output_file: Path of the CSV file to write.
        top_n: Number of entries to keep (default 15).

    Returns:
        The DataFrame holding the ``top_n`` rows with the largest 'WEIGHT'.

    Raises:
        ValueError: If ``df_results`` has no 'WEIGHT' column.
    """
    if 'WEIGHT' not in df_results.columns:
        raise ValueError("The DataFrame must contain a 'WEIGHT' column")

    top_entries = df_results.nlargest(top_n, 'WEIGHT')
    print(top_entries)

    top_entries.to_csv(output_file, index=False)
    print(f"Top {top_n} entries have been saved to {output_file}")
    return top_entries


def combine_top_entries(dataframes):
    """Merge per-file result tables, keeping the best entry per position.

    Args:
        dataframes: List of DataFrames with '#POSNUM' and 'WEIGHT' columns.

    Returns:
        A DataFrame with at most one row per '#POSNUM' (the one with the
        highest 'WEIGHT'), sorted by 'WEIGHT' in descending order.

    Raises:
        ValueError: If ``dataframes`` is empty.
    """
    if not dataframes:
        raise ValueError("No result tables to combine")

    combined_df = pd.concat(dataframes, ignore_index=True)

    # Highest weight first within each position, so keep='first' retains it.
    ranked_df = combined_df.sort_values(
        by=['#POSNUM', 'WEIGHT'], ascending=[True, False]
    )
    best_per_position = ranked_df.drop_duplicates(subset=['#POSNUM'], keep='first')

    # Stable sort keeps equal weights ordered by position.
    return best_per_position.sort_values(
        by='WEIGHT', ascending=False, kind='mergesort'
    )


def run_weights_pipeline(df_native, weights_directory, results_directory):
    """Process every .weights file and write the combined mutation table.

    For each ``*.weights`` file in ``weights_directory`` a
    ``<name>_results.csv`` with its top entries is written to the current
    working directory. The tables produced by this run are then combined
    directly, instead of re-scanning a directory for ``*.csv`` files: a
    scan only finds them when the working directory happens to be the
    scanned directory, and it also picks up unrelated or stale CSV files.

    Args:
        df_native: DataFrame with the columns 'RESIDUETYPE' and '#POSNUM'.
        weights_directory: Directory containing the ``.weights`` files.
        results_directory: Existing directory that receives
            ``final_mutations.csv``.

    Returns:
        The combined DataFrame that was written to ``final_mutations.csv``.

    Raises:
        ValueError: If no weights file produced any result.
    """
    top_tables = []

    # sorted() makes the processing order independent of the file system.
    for filename in sorted(os.listdir(weights_directory)):
        if not filename.endswith(".weights"):
            continue
        filepath = os.path.join(weights_directory, filename)

        df_reference = pd.read_csv(filepath, sep=' ')  # Adjust the parameters of read_csv if needed
        df_results = select_mutations(df_native, df_reference)
        if df_results.empty:
            # One file without candidates should not abort the whole run.
            print(f"No non-native residues found in {filename}; skipping")
            continue

        # Output file name is derived from the input file name
        output_file = os.path.splitext(filename)[0] + "_results.csv"
        top_tables.append(process_and_save_top_entries(df_results, output_file))

    final_sorted_df = combine_top_entries(top_tables)
    final_sorted_df.to_csv(
        os.path.join(results_directory, 'final_mutations.csv'), index=False
    )

    print("CSV files combined and sorted successfully!")
    return final_sorted_df


# Runs when executed as a script or notebook cell; skipped on import so the
# helpers above can be reused without touching the file system.
if __name__ == "__main__":
    final_sorted_df = run_weights_pipeline(df_native, directory, results_directory)


###franz.dietzmeyer@medizin.uni-leipzig.de
###version 1.0.0

    RESIDUETYPE  #POSNUM
0           SER        1
1           LEU        2
2           VAL        3
3           LYS        4
4           SER        5
..          ...      ...
551         CYS      552
552         ARG      553
553         ILE      554
554         CYS      555
555         ILE      556

[556 rows x 2 columns]
