In [2]:
# 训练卷积神经网络
import numpy
import pandas

# OrderedDict preserves the insertion order of its keys
from collections import OrderedDict
import re
import os
import io

from sklearn.utils import shuffle

# 评估指标
from sklearn.metrics import roc_auc_score,roc_curve,auc
from scipy import stats

from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation,Flatten,Embedding
from keras.layers import Conv1D
from keras.utils import np_utils
from keras import backend as K
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import SGD,Adam,RMSprop
from keras.callbacks import EarlyStopping
from keras.models import load_model

from gensim.models import Word2Vec

# Amino-acid one-letter code -> integer index (1-based).
# The position of each letter in the string below fixes its index, so the
# string order must not be changed: 'A' -> 1, 'C' -> 2, ..., 'Y' -> 20.
aa_idx = OrderedDict(
    (residue, position)
    for position, residue in enumerate('ACEDGFIHKMLNQPSRTWVY', start=1)
)

In [3]:
# Start training

# 1. Load the data.
# This cell defines the following names for use by later cells:
#     test_peptide, test_target, peptide_length, peptide

# Test set (whitespace-delimited table).
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the file the same way.
test_df = pandas.read_csv('train_test_data/test_data/B2705',sep=r'\s+')
print(test_df.head())

# Extract the short allele name (e.g. "B*27:05" out of "HLA-B*27:05") from the
# first row with a regular expression.
# NOTE: despite its name, `peptide` holds the allele identifier; the name is
# kept because the training-set cell filters on it.
first_allele = test_df['Allele'].iloc[0]
allele_match = re.search(r'[A-Z]\*\d{2}:\d{2}',first_allele)
if allele_match is None:
    # Fail with a clear message instead of AttributeError on None.group().
    raise ValueError('Cannot parse an HLA allele name from %r' % (first_allele,))
peptide = allele_match.group()
print(peptide)

print(first_allele)

# The peptide length is taken from the first row only.
peptide_length = len(test_df['Peptide_seq'].iloc[0])
print(peptide_length)

measurement_type = test_df['Measurement_type'].iloc[0]
print(measurement_type)

# Convert the measurements to 0/1 labels.
if measurement_type.lower() == 'binary':
    # Binary measurements: exactly 1.0 becomes label 1, everything else 0.
    test_df['Measurement_value'] = numpy.where(test_df.Measurement_value == 1.0,1,0)
else:
    # Other measurement types: values below 500.0 become label 1.
    # (presumably an affinity cutoff in nM -- TODO confirm the unit)
    test_df['Measurement_value'] = numpy.where(test_df.Measurement_value < 500.0,1,0)

test_peptide = test_df.Peptide_seq
test_target = test_df.Measurement_value

# Keep the labels as a plain numpy array.
test_target = test_target.values

         Date  IEDB_reference       Allele  Peptide_length Measurement_type  \
0  2015-07-31         1029125  HLA-B*27:05               9           binary   
1  2015-07-31         1029125  HLA-B*27:05               9           binary   
2  2015-07-31         1029125  HLA-B*27:05               9           binary   
3  2015-07-31         1029125  HLA-B*27:05               9           binary   
4  2015-07-31         1029125  HLA-B*27:05               9           binary   

  Peptide_seq  Measurement_value  NetMHCpan       SMM      ANN      ARB  \
0   KRGILTLKL                1.0     154.07    257.38    143.0    45.70   
1   KAYKSIVKY                0.0   14040.61  12121.88  11950.0  3691.96   
2   GRLTKHTKR                1.0     633.90    187.31    124.0    31.54   
3   KRYKSIVKR                1.0     102.17    116.30     39.0     7.95   
4   KRYKSIVKL                1.0      55.15    112.35     32.0    21.93   

   SMMPMBEC  IEDB_Consensus  NetMHCcons  PickPocket  
0    229.92         

In [4]:
# Training set (whitespace-delimited table whose first line is the header).
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the file the same way.
df_train = pandas.read_csv('train_test_data/train_data/proteins.txt',sep=r'\s+',header=0)
print(df_train.head())
df_train.columns = ['sequence','HLA','target']

# Build the training matrix: keep only the rows for the allele held in
# `peptide` (defined by the test-set cell).
df_train = df_train[df_train.HLA == peptide]
print(df_train.head())

# Keep only sequences with the same length as the test peptides.
df_train = df_train[df_train['sequence'].map(len) == peptide_length]
print(df_train.head())

# Remove sequences with unknown residues (any sequence containing 'X' or 'B').
# The .copy() makes df_train an independent frame, so the column assignment
# below does not raise SettingWithCopyWarning on a filtered slice.
has_unknown_residue = df_train.sequence.str.contains('[XB]')
df_train = df_train[~has_unknown_residue].copy()

print(df_train.head())

# Recode the labels: 1 stays 1, every other value (e.g. -1) becomes 0.
df_train['target'] = numpy.where(df_train.target == 1,1,0)
seqMatrix = df_train.sequence
targetMatrix = df_train.target
# Keep the labels as a plain numpy array.
targetMatrix = targetMatrix.values

print('----------------------')
print(seqMatrix)
print('----------------------')
print(targetMatrix)

      Peptide      HLA  BindingCategory
0  AAAAAAAALY  A*29:02                1
1   AAAAALQAK  A*03:01                1
2    AAAAALWL  C*16:01                1
3   AAAAARAAL  B*14:02               -1
4   AAAAEEEEE  A*02:01               -1
      sequence      HLA  target
118  AADFPGIAR  B*27:05      -1
206  AAFLDDNAF  B*27:05      -1
260  AAGLPAIFV  B*27:05      -1
315  AAILKQHKL  B*27:05      -1
353  AAKKKGASL  B*27:05      -1
      sequence      HLA  target
118  AADFPGIAR  B*27:05      -1
206  AAFLDDNAF  B*27:05      -1
260  AAGLPAIFV  B*27:05      -1
315  AAILKQHKL  B*27:05      -1
353  AAKKKGASL  B*27:05      -1
      sequence      HLA  target
118  AADFPGIAR  B*27:05      -1
206  AAFLDDNAF  B*27:05      -1
260  AAGLPAIFV  B*27:05      -1
315  AAILKQHKL  B*27:05      -1
353  AAKKKGASL  B*27:05      -1
----------------------
118       AADFPGIAR
206       AAFLDDNAF
260       AAGLPAIFV
315       AAILKQHKL
353       AAKKKGASL
            ...    
140863    YVYPDNLPR
141059    YYLEKANKI
1