In [13]:
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
import numpy as np


In [14]:

# Parse the oligo FASTA file once and keep the records in memory, so the ID,
# length and sequence columns are all derived from the same single pass
# (the file used to be re-read from disk for each column).
records = list(SeqIO.parse("All_Oligos.fasta", "fasta"))
identifiers = [seq_record.id for seq_record in records]
lengths = [len(seq_record.seq) for seq_record in records]
# Convert the ID and length lists to named pandas Series.
s1 = pd.Series(identifiers, name='ID')
s2 = pd.Series(lengths, name='length')
# Plain-string copy of every sequence (kept as a list, not a Series).
s3 = [str(seq_record.seq) for seq_record in records]
# Gather everything into one DataFrame with columns ID, length and seq;
# the row index is the default 0..n-1 record order of the FASTA file.
Qfasta = pd.DataFrame(dict(ID=s1, length=s2, seq=s3))

In [15]:
# Display the parsed oligo table (one row per FASTA record: ID, length, seq).
Qfasta

Unnamed: 0,ID,length,seq
0,Dxs_Del2-48_Ser2null,230,ATATAGATGCCGTCCTAGCGCCCTTTAATCAGATGGGTCTCCTatg...
1,Dxs_Del2-48_Phe3null,230,ATATAGATGCCGTCCTAGCGCCCTTTAATCAGATGGGTCTCCTatg...
2,Dxs_Del2-48_Asp4null,230,ATATAGATGCCGTCCTAGCGCCCTTTAATCAGATGGGTCTCCTatg...
3,Dxs_Del2-48_Ile5null,230,ATATAGATGCCGTCCTAGCGCCCTTTAATCAGATGGGTCTCCTatg...
4,Dxs_Del2-48_Ala6null,230,ATATAGATGCCGTCCTAGCGCCCTTTAATCAGATGGGTCTCCTatg...
...,...,...,...
279,Dxs_Del240-286_Met281null,230,TTATAATCATCCTCCCCGGCTAGATTCCCTTGACTGGTCTCCGCCT...
280,Dxs_Del240-286_Thr282null,230,TTATAATCATCCTCCCCGGCTAGATTCCCTTGACTGGTCTCCGCCT...
281,Dxs_Del240-286_Lys283null,230,TTATAATCATCCTCCCCGGCTAGATTCCCTTGACTGGTCTCCGCCT...
282,Dxs_Del240-286_Lys284null,230,TTATAATCATCCTCCCCGGCTAGATTCCCTTGACTGGTCTCCGCCT...


In [16]:
# Sanity check: list the distinct values found in the 'length' column.
np.unique(Qfasta['length'])

array([230])

In [17]:
# Translate the coding chunk of every oligo.
# Each oligo is expected to contain exactly two BsaI recognition-site matches
# (GGTCTC or its reverse complement GAGACC); the stretch between the first two
# matches, minus 2 nt at each end, is translated to amino acids.
# Oligos with a different number of matches are reported (ID and sequence
# printed) and counted in `i`.

i = 0  # number of oligos whose site count is not exactly 2
Geneaa_df = Qfasta.copy()
Geneaa_list = []  # one translated string per row of Qfasta, in row order
for oligo_id, DNAseq in zip(Qfasta['ID'], Qfasta['seq']):
    # Mask both site orientations with '@', a character that cannot occur in
    # a DNA string, so the oligo can be split on it. Matching is
    # case-sensitive, exactly as written.
    DNAseqBsaI = DNAseq.replace('GGTCTC', '@').replace('GAGACC', '@')
    chunks = DNAseqBsaI.split('@')
    if len(chunks) != 3:  # n sites -> n + 1 chunks
        i += 1
        print(oligo_id)
        print(DNAseq)
    if len(chunks) < 2:
        # Without any site there is no "chunk after the first site"; fail
        # with a clear message instead of an opaque IndexError.
        raise ValueError(
            f"{oligo_id}: no GGTCTC/GAGACC site found, cannot locate the coding chunk"
        )
    # Trim 2 nt from each end of the chunk before translating
    # (presumably spacer/overhang bases needed to land in frame -- TODO confirm).
    Genechunk = chunks[1][2:-2]
    Gene_aa = Seq(Genechunk).translate()
    Geneaa_list.append(str(Gene_aa))

    
    

In [18]:
# Show the translated amino-acid strings (one entry per oligo).
# NOTE(review): this prints the entire list; a slice such as Geneaa_list[:10]
# would keep the saved notebook small -- confirm the full dump is not needed.
Geneaa_list

['MFDIAKYPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSDIAKYPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFIAKYPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDAKYPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIKYPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAYPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKPTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYTLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPLALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTALVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLLVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLAVDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALDSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALVSTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALVDTQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALVDSQELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALVDSTELRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALVDSTQLRLLPKESLPKLCDELRRYLLDSVSRSSGH',
 'MSFDIAKYPTLALVDSTQERLLPKESLPKLCDELRRYLLDSVSR

In [6]:
# Exact nucleotide sequence of one specific oligo; a later cell compares the
# 'seq' column against it to check whether that oligo is still in the final table.
skipseq = 'TTAACCGTCCTCTCCATACGGAGGTCTCCTAATGCCACCAACATTTCTCGCGCGCTTATGGACGAGTTTACCGAAACGGTTAAAGAGGCCACGTTGTCACAGTGGGGATTCTGGTCTCGGAGACCCAGTCTGGGGAGTTCTATCCTG'

In [7]:
# Attach the translated peptides as a new 'oligo_aa' column.
# Geneaa_list must hold exactly one entry per row of Geneaa_df, in row order.
Geneaa_df['oligo_aa'] = Geneaa_list

In [8]:
# Display the oligo table with the added 'oligo_aa' translation column.
Geneaa_df

Unnamed: 0,ID,length,seq,oligo_aa
0,Alkx_Mut2-27_Asp2Cys,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGTGCCGTATGCCC...,MCRMPPRPSTRDQRSSGDSRGGRWSPRF
1,Alkx_Mut2-27_Asp2Ser,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGTCCCGTATGCCC...,MSRMPPRPSTRDQRSSGDSRGGRWSPRF
2,Alkx_Mut2-27_Asp2Gln,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGCAGCGTATGCCC...,MQRMPPRPSTRDQRSSGDSRGGRWSPRF
3,Alkx_Mut2-27_Asp2Met,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGATGCGTATGCCC...,MMRMPPRPSTRDQRSSGDSRGGRWSPRF
4,Alkx_Mut2-27_Asp2Asn,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGAATCGTATGCCC...,MNRMPPRPSTRDQRSSGDSRGGRWSPRF
...,...,...,...,...
61795,Dxs_Ins148-176_INS176Arg,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...,HAGDIRPDMLVILNDNEMSISENVGALNNRHL
61796,Dxs_Ins148-176_INS176Trp,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...,HAGDIRPDMLVILNDNEMSISENVGALNNWHL
61797,Dxs_Ins148-176_INS176Val,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...,HAGDIRPDMLVILNDNEMSISENVGALNNVHL
61798,Dxs_Ins148-176_INS176Glu,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...,HAGDIRPDMLVILNDNEMSISENVGALNNEHL


In [9]:
# De-duplicate on the translated peptide: among oligos sharing the same
# 'oligo_aa' string only the first one (in row order) is kept.
finaldf = Geneaa_df.drop_duplicates(subset='oligo_aa', keep='first')

In [10]:
# Number of oligos left after removing duplicate translations.
len(finaldf)

60212

In [15]:
# Exclude Smurfp oligos that introduce an extra Cys or change Cys52, while
# keeping the (non-insertion) mutations at Cys66 and Cys103.
def _is_unwanted_smurfp_cys(oligo_id):
    """Return True when an oligo ID marks a Smurfp Cys variant to discard.

    Only IDs containing both 'Smurfp' and 'Cys' are candidates. A candidate is
    kept (False) when its ID contains '103' or '66' and does not contain
    'INS'; every other candidate is discarded (True).

    NOTE(review): '66' and '103' are matched as plain substrings anywhere in
    the ID, so a position such as 166 would also match -- confirm this is
    acceptable for the ID scheme of this library.
    """
    if 'Smurfp' not in oligo_id or 'Cys' not in oligo_id:
        return False
    if 'INS' in oligo_id:
        return True
    return not ('103' in oligo_id or '66' in oligo_id)


# Build the row mask once and filter in a single step; dropping rows one by
# one inside a loop copies the whole frame on every drop.
drop_mask = finaldf['ID'].map(_is_unwanted_smurfp_cys).astype(bool)
for dropped_id in finaldf.loc[drop_mask, 'ID']:
    print(dropped_id)
nocysfinal = finaldf.loc[~drop_mask].copy()
# No local variable is named `Seq` here, so the Bio.Seq.Seq class imported at
# the top of the notebook stays usable if earlier cells are re-run.
i = int(drop_mask.sum())  # number of discarded oligos
print(i)

Smurfp_Mut2-27_Lys2Cys
Smurfp_Mut2-27_Thr3Cys
Smurfp_Mut2-27_Ser4Cys
Smurfp_Mut2-27_Glu5Cys
Smurfp_Mut2-27_Gln6Cys
Smurfp_Mut2-27_Arg7Cys
Smurfp_Mut2-27_Val8Cys
Smurfp_Mut2-27_Asn9Cys
Smurfp_Mut2-27_Ile10Cys
Smurfp_Mut2-27_Ala11Cys
Smurfp_Mut2-27_Thr12Cys
Smurfp_Mut2-27_Leu13Cys
Smurfp_Mut2-27_Leu14Cys
Smurfp_Mut2-27_Thr15Cys
Smurfp_Mut2-27_Glu16Cys
Smurfp_Mut2-27_Asn17Cys
Smurfp_Mut2-27_Lys18Cys
Smurfp_Mut2-27_Lys19Cys
Smurfp_Mut2-27_Lys20Cys
Smurfp_Mut2-27_Ile21Cys
Smurfp_Mut2-27_Val22Cys
Smurfp_Mut2-27_Asp23Cys
Smurfp_Mut2-27_Lys24Cys
Smurfp_Mut2-27_Ala25Cys
Smurfp_Mut2-27_Ser26Cys
Smurfp_Mut2-27_Gln27Cys
Smurfp_Ins2-27_INS2Cys
Smurfp_Ins2-27_INS3Cys
Smurfp_Ins2-27_INS4Cys
Smurfp_Ins2-27_INS5Cys
Smurfp_Ins2-27_INS6Cys
Smurfp_Ins2-27_INS7Cys
Smurfp_Ins2-27_INS8Cys
Smurfp_Ins2-27_INS9Cys
Smurfp_Ins2-27_INS10Cys
Smurfp_Ins2-27_INS11Cys
Smurfp_Ins2-27_INS12Cys
Smurfp_Ins2-27_INS13Cys
Smurfp_Ins2-27_INS14Cys
Smurfp_Ins2-27_INS15Cys
Smurfp_Ins2-27_INS16Cys
Smurfp_Ins2-27_INS17Cys
Smurfp_I

In [17]:
# Check whether any remaining oligo has exactly the skipseq sequence
# (an empty result means it is not present in nocysfinal).
nocysfinal[nocysfinal['seq'] == skipseq]

Unnamed: 0,ID,length,seq,oligo_aa


In [23]:
# Remove the helper translation column before export. errors='ignore' keeps
# this cell re-runnable: a second run (column already gone) is a no-op
# instead of raising KeyError.
nocysfinal = nocysfinal.drop(columns='oligo_aa', errors='ignore')

In [24]:
# Display the final filtered oligo table (ID, length, seq).
nocysfinal

Unnamed: 0,ID,length,seq
0,Alkx_Mut2-27_Asp2Cys,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGTGCCGTATGCCC...
1,Alkx_Mut2-27_Asp2Ser,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGTCCCGTATGCCC...
2,Alkx_Mut2-27_Asp2Gln,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGCAGCGTATGCCC...
3,Alkx_Mut2-27_Asp2Met,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGATGCGTATGCCC...
4,Alkx_Mut2-27_Asp2Asn,147,ATATAGATGCCGTCCTAGCGCCCGGTCTCCTATGAATCGTATGCCC...
...,...,...,...
61795,Dxs_Ins148-176_INS176Arg,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...
61796,Dxs_Ins148-176_INS176Trp,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...
61797,Dxs_Ins148-176_INS176Val,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...
61798,Dxs_Ins148-176_INS176Glu,150,GGCTGTTAGTTTTAGAGCCGGTCTCCTCACGCGGGCGATATCCGTC...


In [25]:
# Write the final oligo table to CSV in the current working directory.
# NOTE(review): the DataFrame index is written as an extra leading column;
# pass index=False if downstream tools do not expect it -- confirm.
nocysfinal.to_csv('20220608Synoligofinal.csv')

In [None]:
# fatrans All_Oligos.fasta All_Oligos_aa.fa