> This notebook generates every single- and double-mutant sequence of a wild-type protein and saves them as CSV files.

In [2]:
import pandas as pd
import numpy as np 
import os

### 1. Single-mutant sequences

In [2]:
def generate_mutations(wt_seq):
    """Enumerate every single-point substitution of a wild-type sequence.

    Parameters:
    - wt_seq: str, the wild-type amino-acid sequence.

    Returns:
    - list of dict, one entry per mutant, ordered by position and then by the
      substituted residue in 'ACDEFGHIKLMNPQRSTVWY' order. Each dict holds the
      1-based 'Position' and 'Mutated_Position', the 'Original_AA', the
      'Mutated_AA' and the full mutated sequence under 'seq'.
    """
    records = []
    for index, original_aa in enumerate(wt_seq):
        position = index + 1  # positions are reported 1-based
        prefix = wt_seq[:index]
        suffix = wt_seq[index + 1:]
        for replacement in 'ACDEFGHIKLMNPQRSTVWY':
            if replacement == original_aa:
                continue  # substituting the same residue is not a mutation
            records.append({
                'Position': position,
                'Original_AA': original_aa,
                'Mutated_Position': position,
                'Mutated_AA': replacement,
                'seq': prefix + replacement + suffix,
            })
    return records

if __name__ == "__main__":
    wt_seq = 'MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLSPRQLEVTTLVASGLRNKEVAARLGLSEKTVKMHRGLVMEKLNLKTSADLVRIAVEAGI'
    mutations = generate_mutations(wt_seq)

    # Tabulate one row per single mutant, with the columns in a fixed order
    single_columns = ['Position', 'Original_AA', 'Mutated_Position', 'Mutated_AA', 'seq']
    df = pd.DataFrame(mutations)[single_columns]

    # Write the table to disk without the pandas row index
    df.to_csv('single_mutants.csv', index=False)


In [3]:
# Preview the first rows of the single-mutant table
df.head()

Unnamed: 0,Position,Original_AA,Mutated_Position,Mutated_AA,seq
0,1,M,1,A,ALDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
1,1,M,1,C,CLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
2,1,M,1,D,DLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
3,1,M,1,E,ELDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
4,1,M,1,F,FLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...


##### Remarks

In [4]:
# Each position can be replaced by any of the 19 other standard amino acids
print(19*len(wt_seq))

4275


> There are 19 × 225 = 4275 possible single-mutant sequences: each of the 225 positions can be replaced by any of the 19 other amino acids.

In [5]:
import pandas as pd

def generate_double_mutants(wt_seq):
    """Enumerate every double-point substitution of a wild-type sequence.

    Parameters:
    - wt_seq: str, the wild-type amino-acid sequence.

    Returns:
    - list of dict, one entry per double mutant. Position pairs are visited
      with the first position strictly before the second, so each unordered
      pair appears once; within a pair the substitutions follow
      'ACDEFGHIKLMNPQRSTVWY' order (first position outer, second inner).
      Each dict stores, for both sites, the 1-based position, the original
      residue and the substituted residue, plus the mutated sequence as 'seq'.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    total_length = len(wt_seq)
    records = []

    for first, wt_first in enumerate(wt_seq):
        for second in range(first + 1, total_length):
            wt_second = wt_seq[second]
            # The three untouched stretches around the two mutated sites
            head = wt_seq[:first]
            middle = wt_seq[first + 1:second]
            tail = wt_seq[second + 1:]

            for new_first in alphabet:
                if new_first == wt_first:
                    continue  # first site must actually change
                for new_second in alphabet:
                    if new_second == wt_second:
                        continue  # second site must actually change
                    records.append({
                        'Position1': first + 1,
                        'Original_AA1': wt_first,
                        'Mutated_Position1': first + 1,
                        'Mutated_AA1': new_first,
                        'Position2': second + 1,
                        'Original_AA2': wt_second,
                        'Mutated_Position2': second + 1,
                        'Mutated_AA2': new_second,
                        'seq': head + new_first + middle + new_second + tail,
                    })

    return records

if __name__ == "__main__":
    wt_seq = 'MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLSPRQLEVTTLVASGLRNKEVAARLGLSEKTVKMHRGLVMEKLNLKTSADLVRIAVEAGI'
    double_mutants = generate_double_mutants(wt_seq)

    # Tabulate one row per double mutant, with the columns in a fixed order
    double_columns = [
        'Position1', 'Original_AA1', 'Mutated_Position1', 'Mutated_AA1',
        'Position2', 'Original_AA2', 'Mutated_Position2', 'Mutated_AA2',
        'seq',
    ]
    df2 = pd.DataFrame(double_mutants)[double_columns]

    # Write the table to disk without the pandas row index
    df2.to_csv('double_mutants.csv', index=False)

# Preview the first rows of the double-mutant table
df2.head()


Unnamed: 0,Position1,Original_AA1,Mutated_Position1,Mutated_AA1,Position2,Original_AA2,Mutated_Position2,Mutated_AA2,seq
0,1,M,1,A,2,L,2,A,AADMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
1,1,M,1,A,2,L,2,C,ACDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
2,1,M,1,A,2,L,2,D,ADDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
3,1,M,1,A,2,L,2,E,AEDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...
4,1,M,1,A,2,L,2,F,AFDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...


##### Remarks

In [6]:
# (rows, columns) of the double-mutant table
df2.shape

(9097200, 9)

In [7]:
wt_seq = 'MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLSPRQLEVTTLVASGLRNKEVAARLGLSEKTVKMHRGLVMEKLNLKTSADLVRIAVEAGI'
len_wt_seq = len(wt_seq)
# Unordered pairs of distinct positions: n * (n - 1) / 2 (always an integer)
position_pairs = len_wt_seq * (len_wt_seq - 1) // 2
# Each site in a pair has 19 possible substitutions, hence 19 * 19 per pair
number_of_double_mutants = position_pairs * 19 ** 2
print("Number of double mutants (excluding original amino acids):", number_of_double_mutants)

Number of double mutants (excluding original amino acids): 9097200


In [3]:
# Split double_mutants.csv into several smaller files, because it is too big to be used for training as a single file.
def split_csv(input_csv_path, output_folder, chunk_size):
    """
    Split a CSV file into smaller chunks.

    The input is streamed ``chunk_size`` rows at a time, so the whole file is
    never held in memory at once. Chunks are written to ``output_folder`` as
    ``chunk_1.csv``, ``chunk_2.csv``, ... in input order. Every output file
    repeats the header row, and the last file holds whatever rows remain.
    An input that contains only a header produces no output files.

    Parameters:
    - input_csv_path: str, the path to the input CSV file.
    - output_folder: str, the folder where the smaller CSV files will be saved.
    - chunk_size: int, the number of rows in each smaller CSV file; must be positive.

    Raises:
    - ValueError: if chunk_size is zero or negative.
    """
    # A non-positive size cannot describe a chunk; fail with a clear message
    # instead of a ZeroDivisionError or a silently wrong split.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Stream the input: the reader yields one DataFrame of up to chunk_size rows at a time
    reader = pd.read_csv(input_csv_path, chunksize=chunk_size)
    try:
        chunk_number = 0
        for chunk_df in reader:
            # A header-only input can yield a single empty chunk; write nothing for it
            if chunk_df.empty:
                continue
            chunk_number += 1

            # Create a file name based on the 1-based chunk number
            output_csv_path = os.path.join(output_folder, f"chunk_{chunk_number}.csv")

            # Save the chunk as a CSV file
            chunk_df.to_csv(output_csv_path, index=False)
    finally:
        # Release the input file handle even if writing a chunk fails
        reader.close()

# Example usage: break the double-mutant table into files of 500000 rows each
input_csv_path = 'double_mutants.csv'  # file to split
output_folder = 'data'                 # destination folder for the chunk files
chunk_size = 500000                    # rows per chunk; adjust as needed

split_csv(
    input_csv_path=input_csv_path,
    output_folder=output_folder,
    chunk_size=chunk_size,
)


> 