> The aim of this notebook is to process the labeled dataset.

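> Please run every cell exactly once and in order: several cells modify `label_dataset` in place, so re-running them changes the result or raises an error.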

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from Bio import SeqIO

### 1. Data Loading

In [2]:
# Labeled measurements table (columns seen below: Mutation, Light, Darkness).
label_dataset = pd.read_excel('original_data/Supervised_training_data.xlsx')

# Reference FASTA record(s), materialised into a list so they can be iterated more than once.
fasta_file = 'original_data/EL222.fa'
sequences = [record for record in SeqIO.parse(fasta_file, 'fasta')]

### 2. First Glance

In [3]:
# Preview the first rows of the raw labeled table.
label_dataset.head()

Unnamed: 0,Mutation,Light,Darkness
0,WT EL222,52.009956,-2.983813
1,Lys90Glu,-5.717948,0.749374
2,"Asp89Tyr, Lys90Glu",10.860253,9.802956
3,Pro114Arg,676.570607,36.595275
4,Cys75Arg,228.129054,152.705591


In [4]:
# WT sequence: show the id, description and residues of every FASTA record

for seq_record in sequences:
    print(
        f"ID: {seq_record.id}",
        f"Description: {seq_record.description}",
        f"Sequence: {seq_record.seq}",
        sep="\n",
    )
  

ID: EL222>Q2NB98
Description: EL222>Q2NB98
Sequence: GADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLS


### 3. Let's get the processing started

#### 3.1 'n_mut' column

In [247]:
# Add a column 'n_mut': number of mutations for each protein.
# Counting comma-separated entries works whether or not a space follows the comma.
label_dataset['n_mut'] = label_dataset['Mutation'].str.split(',').str.len()

# Only keep single mutants.
# The wild-type reference row is removed by its label ('WT ...') rather than by
# row position, so re-running this cell cannot silently discard a real mutant.

print(label_dataset.shape)
is_wild_type = label_dataset['Mutation'].str.startswith('WT', na=False)
is_single_mutant = label_dataset['n_mut'] == 1
# .copy() yields an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
label_dataset = label_dataset[~is_wild_type & is_single_mutant].copy()
print(label_dataset.shape)

(40, 4)
(35, 4)


#### 3.2 'seq' column

In [248]:
# Initialize new 'seq' column
# NOTE(review): this hard-coded sequence is longer than the record printed from
# 'original_data/EL222.fa' earlier in the notebook; that record appears to be a
# sub-string of it starting at residue 14 ("GADDTRV..."). Confirm this full-length
# sequence is the intended numbering reference for the mutation positions.

wt_seq = 'MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPRLADNPLIAINQAFTDLTGYSEEECVGRNCRFLAGSGTEPWLTDKIRQGVREHKPVLVEILNYKKDGTPFRNAVLVAPIYDDDDELLYFLGSQVEVDDDQPNMGMARRERAAEMLKTLSPRQLEVTTLVASGLRNKEVAARLGLSEKTVKMHRGLVMEKLNLKTSADLVRIAVEAGI'

# Every row starts from the same wild-type sequence string.
label_dataset['seq'] = wt_seq 

In [249]:
# Split each 'Mutation' label (e.g. 'Lys90Glu') into its three parts:
# first three characters, numeric middle, last three characters

mutation_labels = label_dataset['Mutation']
label_dataset['original_aa'] = mutation_labels.str.slice(0, 3)
# Unparseable positions become NaN (errors='coerce') instead of raising
label_dataset['position'] = pd.to_numeric(mutation_labels.str.slice(3, -3), errors='coerce')
label_dataset['mutated_aa'] = mutation_labels.str.slice(-3)

In [250]:
# Translate 'original_aa' and 'mutated_aa' from three-letter to one-letter codes

aa_mapping = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D',
    'Cys': 'C', 'Glu': 'E', 'Gln': 'Q', 'Gly': 'G',
    'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K',
    'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S',
    'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V',
}


def _to_one_letter(three_letter_codes):
    """Map space-separated three-letter codes to a string of one-letter codes.

    Raises KeyError for any token that is not a key of ``aa_mapping``.
    """
    tokens = three_letter_codes.split(' ')
    return ''.join(aa_mapping[token] for token in tokens)


# Same column order as before: 'mutated_aa' first, then 'original_aa'
for aa_column in ('mutated_aa', 'original_aa'):
    label_dataset[aa_column] = label_dataset[aa_column].apply(_to_one_letter)

In [251]:
# Mutate the 'seq' column according to the given mutation

def mutate_sequence(row):
    """Return ``row['seq']`` with the row's single substitution applied.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'seq' (sequence string), 'position' (1-based residue
        number), 'original_aa' and 'mutated_aa' (one-letter codes) and
        'Mutation' (the raw label, used only in the printed messages).

    Returns
    -------
    str
        The mutated sequence. The sequence is returned unchanged, and a message
        is printed, when the position is missing/non-integral, falls outside the
        sequence, or the residue found there differs from 'original_aa'.
    """
    seq, position, mutated_aa, original_aa, mutation = row['seq'], row['position'], row['mutated_aa'], row['original_aa'], row['Mutation']

    # A numeric column that contains any NaN is stored as float, so a valid
    # position can arrive as 90.0; sequence indices must be true integers.
    # NaN, inf and fractional values are all rejected here.
    if not float(position).is_integer():
        print(f"Mutation '{mutation}': Position {position} is out of bounds. Skipping mutation.")
        return seq

    index = int(position) - 1  # 1-based residue number -> 0-based index

    if not 0 <= index < len(seq):
        print(f"Mutation '{mutation}': Position {index + 1} is out of bounds. Skipping mutation.")
        return seq

    if seq[index] != original_aa:
        print(f"Mutation '{mutation}': Original amino acid at position {index + 1} is not {original_aa}. Skipping mutation.")
        return seq

    return seq[:index] + mutated_aa + seq[index + 1:]


# Call mutate_sequence once per row (axis=1) and overwrite 'seq' with the result.
label_dataset['seq'] = label_dataset.apply(mutate_sequence, axis=1)


In [252]:
# A few checks: preview the table after the 'seq', 'original_aa', 'position'
# and 'mutated_aa' columns have been added

label_dataset.head()

Unnamed: 0,Mutation,Light,Darkness,n_mut,seq,original_aa,position,mutated_aa
1,Lys90Glu,-5.717948,0.749374,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,K,90,E
3,Pro114Arg,676.570607,36.595275,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,P,114,R
4,Cys75Arg,228.129054,152.705591,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,C,75,R
5,Glu104Gly,363.278645,96.002665,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,E,104,G
6,Ile105Val,924.137272,22.264132,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,I,105,V


In [253]:
# Spot-check the first three mutants: the residue at 0-based index
# (position - 1) must be the mutated amino acid.
print(label_dataset['seq'].iloc[0][89])   # Lys90Glu  -> E
print(label_dataset['seq'].iloc[1][113])  # Pro114Arg -> R
# Cys75Arg lives at index 74; index 75 is the untouched wild-type Arg76, which
# would print 'R' even if the mutation had not been applied.
print(label_dataset['seq'].iloc[2][74])   # Cys75Arg  -> R

E
R
R


#### 3.3 'mutant' column

In [254]:
# Build the compact 'mutant' label (e.g. 'K90E') from original_aa, position and mutated_aa

label_dataset = label_dataset.copy()
wild_type_residue = label_dataset['original_aa']
residue_number = label_dataset['position'].astype(str)
substituted_residue = label_dataset['mutated_aa']
label_dataset['mutant'] = wild_type_residue + residue_number + substituted_residue

print(label_dataset['mutant'].head(3))


1     K90E
3    P114R
4     C75R
Name: mutant, dtype: object


### 4. More Data Processing

#### 4.1 Removal of extra columns

In [255]:
# Remove the intermediate columns once they are no longer needed.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
label_dataset = label_dataset.drop(
    columns=['mutated_aa', 'position', 'original_aa', 'Mutation'],
    errors='ignore',
)

label_dataset.head()

Unnamed: 0,Light,Darkness,n_mut,seq,mutant
1,-5.717948,0.749374,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,K90E
3,676.570607,36.595275,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,P114R
4,228.129054,152.705591,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,C75R
5,363.278645,96.002665,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,E104G
6,924.137272,22.264132,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,I105V


#### 4.2 Creation of two separate labeled datasets

> We have two fitness labels per mutant: one corresponding to conditions with light and the other to conditions when the protein of interest is in darkness.

In [256]:
# One frame per condition: each keeps its own measurement column and drops the other.
# DataFrame.drop already returns a new frame, so both results are independent of label_dataset.
label_light = label_dataset.drop(columns=['Darkness'])
label_darkness = label_dataset.drop(columns=['Light'])

label_darkness.head(2)

Unnamed: 0,Darkness,n_mut,seq,mutant
1,0.749374,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,K90E
3,36.595275,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,P114R


> We rename the corresponding columns to 'log_fitness,' as is done in the Git repository we are using.

In [257]:
# Give both frames the same target column name, 'log_fitness'
label_darkness = label_darkness.rename({'Darkness': 'log_fitness'}, axis='columns')
label_light = label_light.rename({'Light': 'log_fitness'}, axis='columns')

label_darkness.head(2)

Unnamed: 0,log_fitness,n_mut,seq,mutant
1,0.749374,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,K90E
3,36.595275,1,MGQDRPIDGSGAPGADDTRVEVQPPAQWVLDLIEASPIASVVSDPR...,P114R


#### 4.3 Log Fitness Adjustments

> We assume that this column does not require additional processing, as stated in the Git README we are using. The term 'log_fitness' represents the log enrichment ratio or other log-scale fitness values, where a higher value indicates better fitness. Although referred to as 'log_fitness' here, it corresponds to the concept of fitness.

### 5. Saving the Processed Datasets

In [258]:
Darkness_path = 'data/Darkness/data.csv'
Light_path = 'data/Light/data.csv'

# DataFrame.to_csv does not create missing folders, so make sure each target
# directory exists before writing (no-op when it is already there).
for frame, csv_path in ((label_darkness, Darkness_path), (label_light, Light_path)):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(csv_path)