In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
raw_df = pd.read_csv('raw/GB1_Wu_2016.csv')
del raw_df['Unnamed: 0']
raw_df.dropna(inplace=True)
raw_df.drop(raw_df.index[raw_df['fit']==0], inplace=True)
raw_df.reset_index(drop=True, inplace=True)
fasta_sequences = SeqIO.parse(open('raw/GB1_Wu_2016.fasta'),'fasta')
for fasta in fasta_sequences:
    _, sequence = fasta.id, str(fasta.seq)
mut_positions = [39, 40, 41, 54]


In [3]:
wt = ''
for pos in mut_positions:
    wt = wt + sequence[pos-1]

In [4]:
def get_mut_seq(base_seq, pos, new_aa, positions): 
    '''
    Uses a base sequence, position and new amino acid to generate a sequence corresponding to the inputted mutations. Also performs checks
    that the position is valid and matches the expected old amino acid. 

    Arguments:
    - base_seq[str]: the base sequence of the non-mutant protein
    - pos[int or list]: a single position or a list of positions where mutations occurred
    - old_aa[str or list]: a single string or list of strings indicating the expected old amino acids at pos
    - new_aa[str or list]: a string or list of strings indicating the mutant amino acids
    - mut_positions[str]: a list of totel possible mutation positions 

    Returns:
    - seq[str]: the mutated sequence
    '''
    if type(pos) is int:
        adj_pos = positions.index(pos)
        seq = base_seq[:adj_pos] + new_aa + base_seq[adj_pos+1:]

    else:
        assert len(new_aa)==len(pos), f'Number of positions {len(pos)} is different from the number of new amino acids: {len(new_aa)}'
        seq=base_seq
        for i, pos_ in enumerate(pos):
            adj_pos_ = positions.index(pos_)
            seq = seq[:adj_pos_] + new_aa[i] + seq[adj_pos_+1:]
    return seq

In [11]:
n_mut = []
for i in range(len(raw_df)):
    try:
        n_mut.append(raw_df['muts'][i].count(':') + 1)
    except:
        print(raw_df['muts'][i])
raw_df['n_mut'] = n_mut

# separate single mutants to identify mutated sites and possible amino acids at each site
single_muts = raw_df[raw_df['n_mut']==1]
# for single mutants, the position is an int and the new_aa and old_aa are single-character strings
single_muts['pos'] = single_muts['muts'].apply(lambda x: int(x[1:-1]))
single_muts['new_aa'] = single_muts['muts'].apply(lambda x: x[-1])

multi_muts = raw_df[raw_df['n_mut']>1]
multi_muts.reset_index(drop=True, inplace=True)
# for multi mutants, pos, new_aa and old_aa are lists, in order of each mutation at a given entry
positions = []
new_aa = []
for i in range(len(multi_muts)):
    mutations = multi_muts['muts'][i].split(':')
    positions.append([int(j[1:-1]) for j in mutations])
    new_aa.append([j[-1] for j in mutations])
multi_muts['pos'] = positions
multi_muts['new_aa'] = new_aa
data = pd.concat([single_muts, multi_muts])
data.reset_index(drop=True, inplace=True)
new_seqs = []
var = []
for i, row in tqdm(data.iterrows()):
    new_seqs.append(get_mut_seq(wt, row['pos'], row['new_aa'], mut_positions))
    var.append(row['fit']*0.2)
data.insert(0, 'seq', new_seqs)
data.insert(3, 'var', var)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_muts['pos'] = single_muts['muts'].apply(lambda x: int(x[1:-1]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_muts['new_aa'] = single_muts['muts'].apply(lambda x: x[-1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multi_muts['pos'] = positions
A value is trying to be set on a copy

In [12]:
del data['muts']
del data['pos']
del data['new_aa']
del data['n_mut']

In [23]:
data.head()

Unnamed: 0,seq,fit,var
0,ADGV,0.06191,0.012382
1,CDGV,0.242237,0.048447
2,DDGV,0.006472,0.001294
3,EDGV,0.032719,0.006544
4,FDGV,0.377101,0.07542


In [24]:
train, test = train_test_split(data, test_size=0.3)

In [25]:
train.reset_index(inplace=True, drop=True)

In [26]:
babel = ''
for i in range(len(train)):
    babel += train['seq'][i]

alphabet = set(babel)

In [28]:
train

Unnamed: 0,seq,fit,var
0,HSAT,0.036406,0.007281
1,IPSR,0.000816,0.000163
2,HSFY,0.002095,0.000419
3,DNAA,0.020837,0.004167
4,NCVG,0.048085,0.009617
...,...,...,...
83913,IRPC,0.010819,0.002164
83914,FHAH,0.002404,0.000481
83915,IEKY,0.007213,0.001443
83916,RFKH,0.003458,0.000692


In [29]:
train.to_csv('processed/GB1_train.csv', header=False, index=False)
test.to_csv('processed/GB1_test.csv', header=False, index=False)

In [30]:
data.to_csv('processed/GB1.csv', header=False, index=False)