In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install xlrd
# Load the natural-sequence library and the design sheet from the local data folder.
nature = pd.read_excel('data/LibraryNatural_Training5299.xls')
design = pd.read_excel('data/20220928_design_sheet.xlsx')
# Splice characters 16-17 and 44 out of every design sequence, then keep only
# the first 11608 spliced sequences.
# NOTE(review): presumably this brings the designs onto the same 59-column
# alignment as nature['Sequence_aligned'] -- confirm the column choice and the
# 11608 cut-off against the design sheet.
seq_59_design = list(
    map(lambda full_seq: full_seq[:16] + full_seq[18:44] + full_seq[45:], design.seq)
)[:11608]



In [3]:
def amino_acid_to_logits(sequences):
    """Encode sequences as integer indices into a fixed 22-symbol alphabet.

    The alphabet is the 20 amino-acid letters followed by '-' and '*', so
    'A' -> 0, 'C' -> 1, ..., 'Y' -> 19, '-' -> 20, '*' -> 21. Despite the
    function name, the returned values are plain symbol indices, not logits.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to encode, one symbol per character.

    Returns
    -------
    numpy.ndarray
        One row of indices per input sequence. The array is 2-D
        (n_sequences, length) when all sequences have the same length.

    Raises
    ------
    KeyError
        If a sequence contains a symbol that is not in the alphabet.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY-*'
    index_of = dict(zip(alphabet, range(len(alphabet))))
    encoded_rows = []
    for seq in sequences:
        encoded_rows.append([index_of[symbol] for symbol in seq])
    return np.array(encoded_rows)

def hamming_distances(design_sequences, nature_sequences):
    """Count mismatching positions between every design/natural sequence pair.

    Parameters
    ----------
    design_sequences : sequence of encoded sequences
        One encoded sequence per row.
    nature_sequences : array-like
        Encoded reference sequences; each design row is compared against all
        of them at once.
        NOTE(review): assumes a 2-D array whose rows have the same length as
        each design row -- confirm at the call site.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(design_sequences), len(nature_sequences));
        entry [i, j] is the number of positions where design i differs from
        natural sequence j.
    """
    n_design = len(design_sequences)
    n_nature = len(nature_sequences)
    pairwise = np.zeros((n_design, n_nature))
    # One design row at a time: comparing a single row against the whole
    # reference matrix keeps the temporary boolean mask at (n_nature, length).
    for row, query in enumerate(design_sequences):
        mismatch_mask = query != nature_sequences
        pairwise[row] = np.sum(mismatch_mask, axis=1)
    return pairwise

# Integer-encode both sequence sets so they can be compared element-wise.
design_sequences = amino_acid_to_logits(seq_59_design)
nature_sequences = amino_acid_to_logits(nature['Sequence_aligned'].tolist())

# distances[i, j] = number of mismatching positions between design i and natural j.
distances = hamming_distances(design_sequences, nature_sequences)
# For each design, the position of its nearest natural sequence; np.argmin
# returns the first minimum, so ties resolve to the earliest natural row.
closest_sequences_indices = np.argmin(distances, axis=1)

# argmin yields row *positions*, so select with positional .iloc (one
# vectorised lookup) instead of label-based Series[...] per element. The two
# agree only while `nature` keeps its default 0..n-1 index.
closest_sequences = nature['Sequence_aligned'].iloc[closest_sequences_indices].tolist()

In [4]:
# Display the nearest natural sequence found for each design sequence
# (one entry per design, in design order).
# NOTE(review): this prints the entire list; consider showing a slice such as
# closest_sequences[:10] to keep the notebook output short.
closest_sequences

['NFIYKAKALYPYDADDAYEISFEQNEILQVSDIEGRWWKARRNGETGIIPSNYVQLIDG',
 'PALFEVQSLYDYDALEPTDLSLSKGERLVIVDQMEHWWKARNNGQEGYIPANYVRKLGL',
 'TARKVAVALHDFNAASSDELSLKVGDRVTVLNEVTGWWMGE-NGRSGLFPTTYTEVISS',
 'PKKPMAKVLYDFSSAQSNELSIKAGELVEIVSKEGGWWLCMNTSVQGWTPQAYLEEQKA',
 'SKKPTAKVLYDFSSDRDNELNIRAGEIVQIVSKEGGWWLCMNTSTQGWTPEAYLEEQVA',
 'PKKPTAKALYDFNSQQPNELSIKAGEIVQIVSKEGGWWLCMNTSSQGWTPEAYLEEQVA',
 'PKKPSAKVLYDFSSDRPNELTIHAGEIIQIVSKEGGWWLCMNTSAQGWTPEAYLEEIPV',
 'PKKPSAKVLYDFSSDRPNELTIHAGEIIQIVSKEGGWWLCMNTSAQGWTPEAYLEEIPV',
 'SKKPTAKVLYDFSSDRDNELNIRAGEIVQIVSKEGGWWLCMNTSTQGWTPEAYLEEQVA',
 'PKKPMAKVLYDFNSGNTNELAIRQGELVQIISRETGWWLCMNTGVQGWTPEAYIEEIKE',
 'SKKPTAKVLYDFSSDRDNELNIRAGEIVQIVSKEGGWWLCMNTSTQGWTPEAYLEEQVA',
 'PTKPQAKVKFDFNSPNANELSIKAGEIIEIVQKEGGWWLCKNTNAQGWTPSAYVEEIEQ',
 'TARKVAVALHDFNAASSDELSLKVGDRVTVLNEVTGWWMGE-NGRSGLFPTTYTEVISS',
 'TARKVAVALHDFNAASSDELSLKVGDRVTVLNEVTGWWMGE-NGRSGLFPTTYTEVISS',
 'SLGGKFISLADYTAAGHSEVSMNEGDTIELLKTGCGWWYVRVSNAEGWVPAAYLESINR',
 'RPETYVTVLYDYKAQGELELSLKAGEKLKLIEKENTWW

In [5]:
# closest_sequences_indices holds positions produced by np.argmin, so select
# rows positionally with .iloc; label-based .loc only matches while `nature`
# keeps its default 0..n-1 index.
nature.iloc[closest_sequences_indices]  # natural-library row closest to each design seq

Unnamed: 0,Sequence_aligned,Sequences_unaligned,RE_norm,Protein Name,DomainName,TranscriptID,Species,Phylogeny,orthologous_group,DomainNo,In Training Set
827,NFIYKAKALYPYDADDAYEISFEQNEILQVSDIEGRWWKARRNGET...,DNFIYKAKALYPYDADDDDAYEISFEQNEILQVSDIEGRWWKARRA...,1.000000,Tyrosine kinases,"YER118C_domain_number[1],jgi",2071.0,Saccharomyces cerevisiae S288C,cellular organisms; Eukaryota; Opisthokonta; F...,NOG09120,1,True
4425,PALFEVQSLYDYDALEPTDLSLSKGERLVIVDQMEHWWKARNNGQE...,APALFEVQSLYDYDALEPTDLSLSKGERLVIVDQMEEHWWKARNNA...,0.012654,Predicted protein {ECO:0000313|EMBL:EDQ89876.1},,,Monosiga brevicollis,cellular organisms; Eukaryota; Opisthokonta; C...,,1,True
2047,TARKVAVALHDFNAASSDELSLKVGDRVTVLNEVTGWWMGE-NGRS...,ATARKVAVALHDFNAASSDELSLKVGDRVTVLNEVTDGWWMGECNG...,-0.086621,Lysophosphatidic acid acyltransferase endophil...,"estExt_Genewise1.C_190192_domain_number[1],""jgi",155364.0,Gelatoporia subvermispora B,cellular organisms; Eukaryota; Opisthokonta; F...,,1,True
504,PKKPMAKVLYDFSSAQSNELSIKAGELVEIVSKEGGWWLCMNTSVQ...,APKKPMAKVLYDFSSAQSNELSIKAGELVEIVSKEGNGWWLCMNTT...,,Myosin class I heavy chain,"ATET_07759_domain_number[1],jgi",1897.0,Aspergillus terreus NIH2624,cellular organisms; Eukaryota; Opisthokonta; F...,COG5022,1,True
2149,SKKPTAKVLYDFSSDRDNELNIRAGEIVQIVSKEGGWWLCMNTSTQ...,ASKKPTAKVLYDFSSDRDNELNIRAGEIVQIVSKEGNGWWLCMNMT...,-0.076212,Myosin class I heavy chain,"EPS31463_domain_number[1],jgi",6586.0,Penicillium oxalicum 114-2,cellular organisms; Eukaryota; Opisthokonta; F...,COG5022,1,True
...,...,...,...,...,...,...,...,...,...,...,...
158,DYGYKARALYAYQANAPTEISFSKGEVLDIVDNSGKWWQARRNGET...,PDYGYKARALYAYQANADDPTEISFSKGEVLDIVDNSGKWWQARRS...,0.912006,Epidermal growth factor receptor kinase substrate,"mRNA_UMAG_03156_domain_number[1],jgi",10649.0,Ustilago maydis 521,cellular organisms; Eukaryota; Opisthokonta; F...,NOG09120,1,True
729,EPVKQAQALYAYKPDDPNELSFKKGDIFDIIDSSGKWWEVEADGST...,EEPVKQAQALYAYKASPDDPNELSFKKGDIFDIIDSSGKWWEVEAA...,0.791435,"Adaptor protein NCK/Dock, contains SH2 and SH3...",fgenesh3_kg.LG_4_#_1174_#_Locus_2884_Transcrip...,688849.0,Laccaria bicolor S238N-H82,cellular organisms; Eukaryota; Opisthokonta; F...,NOG09120,1,True
2229,FTAKKAKAVYTYTAGNADELPFNEGDELSIIDMSEEWWKTEKDGVV...,VFTAKKAKAVYTYTAGNADELPFNEGDELSIIDMSEDEWWKTEKDG...,,Synaptic vesicle protein EHS-1 and related EH ...,"estExt_fgenesh1_pg.C_640024_domain_number[2],jgi",79137.0,Hypholoma sublateritium FD-334 SS-4,cellular organisms; Eukaryota; Opisthokonta; F...,,2,True
1583,NYMYKAKALYAYSADDPNEISFAKGEILDIIDKNGKWWQAKKDGTI...,GNYMYKAKALYAYSASADDPNEISFAKGEILDIIDKNGKWWQAKKA...,0.593156,"Adaptor protein NCK/Dock, contains SH2 and SH3...","estExt_fgenesh1_pg.C_130231_domain_number[1],""jgi",141591.0,Stereum hirsutum FP-91666 SS1,cellular organisms; Eukaryota; Opisthokonta; F...,NOG09120,1,True
