> The aim of this notebook is to process the labeled dataset.

> Please run every cell only once.

In [404]:
import pandas as pd 
import numpy as np
from Bio import SeqIO
import shutil

### 1. Data Loading

In [405]:
dataset = pd.read_excel('original_data/Supervised_training_data.xlsx')

fasta_file = 'original_data/EL222.fa'
sequences = list(SeqIO.parse(fasta_file, 'fasta'))

### 2. First Glance

In [406]:
dataset.head()

Unnamed: 0,Mutation,Light,Darkness
0,WT EL222,52.009956,-2.983813
1,Lys90Glu,-5.717948,0.749374
2,"Asp89Tyr, Lys90Glu",10.860253,9.802956
3,Pro114Arg,676.570607,36.595275
4,Cys75Arg,228.129054,152.705591


In [407]:
# WT sequence

for seq_record in sequences:
    print(f"ID: {seq_record.id}")
    print(f"Description: {seq_record.description}")
    print(f"Sequence: {seq_record.seq}")
  

ID: Q2NB98
Description: Q2NB98
Sequence: MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLSPRQLEVTTLVASGLRNKEVAARLGLSEKTVKMHRGLVMEKLNLKTSADLVRIAVEAGI


### 3. Let's get the processing started

In [408]:
label_dataset = dataset.copy()

#### 3.1 'n_mut' column

In [409]:
# Add a column 'n_mut': number of mutations for each protein

label_dataset['n_mut'] = label_dataset['Mutation'].apply(lambda x: len(x.split(', ')) if ',' in x else 1)

# Only keep single mutants

print(label_dataset.shape)
label_dataset = label_dataset[1:]
label_dataset = label_dataset[label_dataset['n_mut'] == 1]
print(label_dataset.shape)

(40, 4)
(35, 4)


#### 3.2 'seq' column

In [410]:
# Initialize new 'seq' column

wt_seq = 'MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLSPRQLEVTTLVASGLRNKEVAARLGLSEKTVKMHRGLVMEKLNLKTSADLVRIAVEAGI'

label_dataset['seq'] = wt_seq 

In [411]:
# Create three new columns for original_aa, position, and mutated_aa

label_dataset['original_aa'] = label_dataset['Mutation'].str[0:3]
label_dataset['position'] = label_dataset['Mutation'].str[3:-3]
label_dataset['mutated_aa'] = label_dataset['Mutation'].str[-3:]
label_dataset['position'] = pd.to_numeric(label_dataset['position'], errors='coerce')

In [412]:
# Then convert 'original_aa' and 'mutated_aa' into one-letter code

aa_mapping = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C', 'Glu': 'E',
    'Gln': 'Q', 'Gly': 'G', 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K',
    'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S', 'Thr': 'T', 'Trp': 'W',
    'Tyr': 'Y', 'Val': 'V'
}

label_dataset['mutated_aa'] = label_dataset['mutated_aa'].apply(lambda x: ''.join(aa_mapping[aa] for aa in x.split(' ')))
label_dataset['original_aa'] = label_dataset['original_aa'].apply(lambda x: ''.join(aa_mapping[aa] for aa in x.split(' ')))

In [413]:
# We adjust the position because the 'mutation' column is shifted compared to the wild-type (wt) sequence we use as a reference.

label_dataset['position'] = label_dataset['position'] + 3

In [414]:
# Mutate the 'seq' column accordingly to the given mutation

def mutate_sequence(row):
    seq, position, mutated_aa, original_aa, mutation = row['seq'], row['position'], row['mutated_aa'], row['original_aa'], row['Mutation']
    position -= 1
    
    if 0 <= position < len(seq):
        seq_list = list(seq)
        if seq_list[position] == original_aa:
            seq_list[position] = mutated_aa
        else:
            print(f"Mutation '{mutation}': Original amino acid at position {position + 1} is not {original_aa}. Skipping mutation.")
        return ''.join(seq_list)
    else:
        print(f"Mutation '{mutation}': Position {position + 1} is out of bounds. Skipping mutation.")
        return seq  


label_dataset['seq'] = label_dataset.apply(mutate_sequence, axis=1)


In [415]:
# Few checks

label_dataset.head()

Unnamed: 0,Mutation,Light,Darkness,n_mut,seq,original_aa,position,mutated_aa
1,Lys90Glu,-5.717948,0.749374,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,K,93,E
3,Pro114Arg,676.570607,36.595275,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,P,117,R
4,Cys75Arg,228.129054,152.705591,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,C,78,R
5,Glu104Gly,363.278645,96.002665,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,E,107,G
6,Ile105Val,924.137272,22.264132,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,I,108,V


In [416]:
print(label_dataset['seq'].iloc[0][92])
print(label_dataset['seq'].iloc[1][116])
print(label_dataset['seq'].iloc[2][78])
print(label_dataset['seq'].iloc[3][106])

E
R
R
G


#### 3.3 'mutant column'

In [417]:
# Create a 'mutant' column by concatenating original_aa, position, and mutated_aa

label_dataset = label_dataset.copy()  
label_dataset['mutant'] = label_dataset['original_aa'] + label_dataset['position'].astype(str) + label_dataset['mutated_aa']

print(label_dataset['mutant'].iloc[:3])


1     K93E
3    P117R
4     C78R
Name: mutant, dtype: object


### 4. More Data Processing

#### 4.1 Removal of extra columns

In [418]:
label_dataset.drop(['mutated_aa', 'position', 'original_aa', 'Mutation'], axis=1, inplace=True)

label_dataset.head()

Unnamed: 0,Light,Darkness,n_mut,seq,mutant
1,-5.717948,0.749374,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,K93E
3,676.570607,36.595275,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,P117R
4,228.129054,152.705591,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,C78R
5,363.278645,96.002665,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,E107G
6,924.137272,22.264132,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,I108V


#### 4.2 Creation of two separate labeled datasets

> We have two fitness labels per mutant: one corresponding to conditions with light and the other to conditions when the protein of interest is in darkness.

In [419]:
label_light = label_dataset.copy().drop('Darkness', axis=1)
label_darkness = label_dataset.copy().drop('Light', axis=1)

label_darkness.head(2)

Unnamed: 0,Darkness,n_mut,seq,mutant
1,0.749374,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,K93E
3,36.595275,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,P117R


> We rename the corresponding columns to 'log_fitness,' as is done in the Git repository we are using.

In [420]:
label_darkness = label_darkness.rename(columns={'Darkness': 'log_fitness'})
label_light = label_light.rename(columns={'Light': 'log_fitness'})

label_darkness.head(2)

Unnamed: 0,log_fitness,n_mut,seq,mutant
1,0.749374,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,K93E
3,36.595275,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,P117R


#### 4.3 Log Fitness Adjustments

> 'log_fitness' represents the log enrichment ratio, where a higher value indicates better fitness. 
>
> We will do the following transformation: 
>
>`log_fitness = log((fitness_mut - min_all_mutants + 1) / (fitness_wt - min_all_mutants + 1))`


In [421]:
# For the Darkness dataset

wt_fitness_D = dataset.loc[dataset['Mutation'] == 'WT EL222', 'Darkness']
min_all_mut_D = label_darkness['log_fitness'].min()

numerator_D = label_darkness['log_fitness'] - min_all_mut_D + 1
denominator_D = float(wt_fitness_D.iloc[0] - min_all_mut_D + 1)

label_darkness['log_fitness'] = np.log(numerator_D/denominator_D)
label_darkness.head()

Unnamed: 0,log_fitness,n_mut,seq,mutant
1,0.912759,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,K93E
3,2.821944,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,P117R
4,4.146125,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,C78R
5,3.70227,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,E107G
6,2.405598,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,I108V


In [426]:
# For the Light dataset

wt_fitness_L = dataset.loc[dataset['Mutation'] == 'WT EL222', 'Light']
min_all_mut_L = label_light['log_fitness'].min()

numerator_L = label_light['log_fitness'] - min_all_mut_L + 1
denominator_L = float(wt_fitness_L.iloc[0] - min_all_mut_L + 1)

label_darkness['log_fitness'] = np.log(numerator_L/denominator_L)
label_darkness.head()

Unnamed: 0,log_fitness,n_mut,seq,mutant
1,-4.072915,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,K93E
3,2.454002,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,P117R
4,1.386019,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,C78R
5,1.840579,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,E107G
6,2.763189,1,MLDMGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVS...,I108V


In [428]:
label_darkness['log_fitness'].describe()

count    35.000000
mean      2.483555
std       2.167613
min      -4.072915
25%       1.751211
50%       2.763189
75%       4.108784
max       4.878035
Name: log_fitness, dtype: float64

### 

### 5. Processed Datasets savings

In [429]:
Darkness_path = 'data/Darkness/'
Light_path = 'data/Light/'

label_darkness.to_csv(Darkness_path + 'data.csv')
label_light.to_csv(Light_path + 'data.csv')

In [430]:
# wt.fasta has to be present in each folder

shutil.copy(fasta_file, Darkness_path + 'wt.fasta')
shutil.copy(fasta_file, Light_path + 'wt.fasta')

'data/Light/wt.fasta'