In [1]:
from Bio import pairwise2 as pw
from Bio import Seq
from Bio import SeqIO 
import Bio
from Bio.Alphabet import IUPAC
import numpy as np
import pandas as pd
import re

In [2]:
import time
from functools import wraps
  
def fn_timer(function):
    @wraps(function)
    def function_timer(*args, **kwargs):
        t0 = time.process_time()
        result = function(*args, **kwargs)
        t1 = time.process_time()
        print ("Total time running %s: %s seconds" %
            (function.__name__, str(t1-t0))
            )
        return result
    return function_timer

In [3]:
uniprot = list(SeqIO.parse('uniprot-proteome_human.fasta','fasta'))

In [54]:
uniprot[0].seq

Seq('MKMASSLAFLLLNFHVSLLLVQLLTPCSAQFSVLGPSGPILAMVGEDADLPCHL...KSA', SingleLetterAlphabet())

In [4]:
for seq in uniprot:   #
    x = seq.id
    seq.id = x.split("|")[1]

In [5]:
ids = [seq.id for seq in uniprot]
names = [seq.name for seq in uniprot]

In [6]:
uni_dict = {}
for i in uniprot:
    uni_dict[i.id] =i

In [7]:
#@fn_timer
def calc_identity_score(seq1,seq2,gap=-0.5,extend=-0.1):
    """
    return seq1 seq1 pairwise identiy score
    seq1 is positive 
    seq1 seq2 is string

    Bio.pairwise2.format_alignment output:
    MPKGKKAKG------
      |||||||      
    --KGKKAKGKKVAPA
      Score=7
      
    alignment output: [('MPKGKKAKG------', '--KGKKAKGKKVAPA', 7.0, 0, 15)]
    score = ali[0][2] = 7
    """
    ali = pw.align.globalxs(seq1,seq2,gap,extend,score_only=True)
    # gap penalty = -0.5 in case of cak caak score =3
    return ali/min(len(seq1),len(seq2)) # 返回短序列的值,防止substring

In [8]:
#@fn_timer
def window_generator(seq, window_lenth=7,step=1):
    """
    return list of seq window slide
    seq is string
    """
    if len(seq) >= window_lenth:
        return [seq[i:i+window_lenth] for i in range(0,len(seq)-window_lenth+1,step)]
    else:
        return []

In [9]:
window_generator('123',2)

['12', '23']

In [10]:
from Bio.pairwise2 import format_alignment

In [11]:
def flat(nums):
    res = []
    for i in nums:
        if isinstance(i, list):
            res.extend(flat(i))
        else:
            res.append(i)
    return res

In [12]:
#@fn_timer
def slide_with_flank(seq,full,step=1,up_flank=6,down_flank=6,flags=0):
    """
    return window slide result as list for a full str given potential sub seq and it's flank removed
    seq=abc, full = 01234abc45678abc1234 up=1 down=1 step =1
    result is ['012', '123', '567', '234']
    生成一个str full 的滑动窗口自片段,去除了seq和其上下游的部分
    """
    res = []
    window_len = len(seq)
    coords = [i.span() for i in re.finditer(seq,full,flags)] # = search_all
    # 处理首尾情况
    if len(coords) == 0:
        res.append(window_generator(full,window_lenth=window_len,step=step))

    elif len(coords) == 1:
        if (coords[0][0]-up_flank) >= 0:
            res.append(window_generator(full[0:coords[0][0]-up_flank],
                                        window_lenth=window_len,step=step))
        if (coords[0][1]+down_flank) <= len(full):
            res.append(window_generator(full[coords[0][1]+up_flank:],
                                        window_lenth=window_len,step=step))
    else: # len(coords) >1  
        if (coords[0][0]-up_flank) >= 0:
            res.append(window_generator(full[0:coords[0][0]-up_flank],
                                        window_lenth=window_len,step=step))
        for i in range(1,len(coords)):
        ## 处理 1 2 之间的东西
            if coords[i][0] - coords[i-1][1] > up_flank+down_flank+window_len:
                res.append(window_generator(full[coords[i-1][1]+up_flank:coords[i][0]-down_flank],
                                            window_lenth=window_len,step=step))
                           
        if (coords[-1][1]+down_flank) <= len(full):
            res.append(window_generator(full[coords[-1][1]+down_flank:],
                                        window_lenth=window_len,step=step))           
        
    return set(flat(res))
    #return flat(res)

In [13]:
slide_with_flank('abc','01234abc456raa78abc7779',up_flank=2,down_flank=2)

{'012', '6ra', 'raa'}

In [14]:
from collections import defaultdict

In [15]:
###### @fn_timer

def filter_with_identity_affinity(seq,full,identity_cutoff=0.5,step=1,up=6,down=6,flags=0):
    """给出full中与seq 过滤 identity/netMHC todo的片段
    identity最低的
    netMHC 最好的 若干 个seq 片段
    """
    tmp = 1
    out=''
    slides = slide_with_flank(seq,full,step=step,up_flank=up,down_flank=down,flags=flags)
    for s in slides:
        identity_score = calc_identity_score(seq,s)
        if identity_score <= identity_cutoff:          
            if identity_score <= tmp:
                out = s
                tmp = identity_score
    return out

In [16]:
s5 = pd.read_excel('S5_merge.xlsx')#,sep = None,error_bad_lines=False)

df4 = s5.dropna(subset=["Sequence"])

grist = pd.read_table('gristone_positive_data.txt')

In [44]:
s6 = pd.read_excel('S6_merge.xlsx')

In [45]:
s6.head()

Unnamed: 0,Sequence,Modifications,Modifications (all possible sites),Qvality PEP by SEQUEST,Qvality q-value by FDR,SVM_Score,# Protein Groups,# Proteins,# PSMs,Master Protein Accessions,...,heart,kidney,liver,lung,lymph,ovary,prostate,skeletal_muscle,testes,thyroid
0,KGTQVVKISVHMGRVS,,,5.8485e-08,0.0,2.42936,1.0,2.0,8.0,Q7Z7M9,...,0.100182,0.086017,0.0,3.17426,0.003604,0.241705,0.1607,0.0,0.126499,0.046727
1,HPAHLQTLPVTPNKQKTDG,,,8.44177e-07,0.0,2.009,1.0,2.0,5.0,Q7Z7M9,...,0.100182,0.086017,0.0,3.17426,0.003604,0.241705,0.1607,0.0,0.126499,0.046727
2,HPAHLQTLPVTPNKQKT,,,3.08378e-06,0.0,1.805,1.0,2.0,9.0,Q7Z7M9,...,0.100182,0.086017,0.0,3.17426,0.003604,0.241705,0.1607,0.0,0.126499,0.046727
3,HPAHLQTLPVTPNKQ,,,5.89396e-06,0.0,1.703,1.0,2.0,9.0,Q7Z7M9,...,0.100182,0.086017,0.0,3.17426,0.003604,0.241705,0.1607,0.0,0.126499,0.046727
4,HPAHLQTLPVTPNKQK,,,1.23125e-05,0.0,1.587,1.0,2.0,10.0,Q7Z7M9,...,0.100182,0.086017,0.0,3.17426,0.003604,0.241705,0.1607,0.0,0.126499,0.046727


In [47]:
df6 = s6.dropna(subset=["Sequence"])

In [48]:
len(df6)

2108

In [33]:
ids_set = set(ids)

In [50]:
df6['neg'] =np.full_like(df6['Sequence'],'o')
t0 = time.process_time()
matched = 0
for i in range(0,len(df6)):
#for i in range(0,10000):
    g = df6.iloc[i]
    ac = set(g['Protein Accessions'].split('; '))
    #print(ac)
    unique = list(ac & ids_set)
    if len(unique) >0:
        uni = unique[0]
        matched = matched +1
        pep = str(g['Sequence'])
        full = str(uni_dict[uni].seq)
        #pdb.set_trace() 
        res = filter_with_identity_affinity(pep,full,step=1,identity_cutoff=0.5)

        if res != '':
            df6.loc[i,'neg'] = res 
        
print(time.process_time()-t0)
print(matched)
#grist[grist['neg1'] != 'o'].to_csv('testdata0401.txt',index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


25.130960429000027
1945


In [51]:
len(df6[df6['neg'] == 'o'])

176

In [52]:
df6.to_csv('s6_full_neg.csv')

In [22]:

persons={'ZhangSan':'male',
         'LiSi':'male',
         'WangHong':'female'}

#找出所有男性
males = filter(lambda x:'male'== x[1], persons.items())

for (key,value) in males:
    print('%s : %s' % (key,value))


ZhangSan : male
LiSi : male


In [42]:
grist.to_csv?

[0;31mSignature:[0m
[0mgrist[0m[0;34m.[0m[0mto_csv[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath_or_buf[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msep[0m[0;34m=[0m[0;34m','[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mna_rep[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfloat_format[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mheader[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex_label[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'w'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcompression[0m[0;34m=[0m[0;34m'infer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0

In [2]:
%%writefile train.py

UsageError: %%writefile is a cell magic, but the cell body is empty.
