# 训练

## 导入训练数据

In [1]:
import pickle

In [2]:
# Load the training dictionary. The context manager closes the file handle
# as soon as unpickling finishes (the original left it open).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./dataset/train.pkl', 'rb') as f:
    train = pickle.load(f)

In [3]:
# Drop the last two characters of every key and de-duplicate, leaving one
# identifier per record pair (the pairs are looked up later as id + '-A' / '-B').
ID = list({key[:-2] for key in train.keys()})
print(len(ID))

17534


In [4]:
# X collects [a, b] record pairs, Y the matching 0/1 labels.
X, Y = [], []

## 获取label为1的训练数据

In [5]:
# Matching pairs: the A and B records of the same identifier get label 1.
for pair_id in ID:
    X.append([train[pair_id + '-A'], train[pair_id + '-B']])
    Y.append(1)

In [6]:
# Number of label-1 pairs collected so far.
len(X)

17534

## 获取label为0的训练数据

+ 不平衡比例为3：1，此类数据的数量约为50000组

In [7]:
import random

In [8]:
# Build label-0 (mismatched) pairs: the A record of a randomly chosen anchor
# is paired with the B records of other identifiers.
NEG_ANCHORS = 1000      # how many anchor identifiers to draw
NEG_PER_ANCHOR = 50     # mismatched partners per anchor (1000 * 50 = 50000 pairs)

# random.sample draws without replacement and leaves ID untouched, instead of
# reshuffling the whole shared list in place once per anchor.
for i in random.sample(ID, min(NEG_ANCHORS, len(ID))):
    # Draw one spare candidate so that NEG_PER_ANCHOR partners remain even
    # when the anchor itself is among the candidates (identifiers are unique).
    candidates = random.sample(ID, min(NEG_PER_ANCHOR + 1, len(ID)))
    partners = [j for j in candidates if j != i][:NEG_PER_ANCHOR]
    a = train[i + '-A']
    for j in partners:
        X.append([a, train[j + '-B']])
        Y.append(0)
            
           

In [9]:
# Total number of labels (label-1 plus label-0 pairs).
len(Y)

67534

## 训练数据打乱

In [10]:
import numpy as np
from sklearn.utils import shuffle

In [11]:
# Shuffle pairs and labels with the same permutation so they stay aligned.
X,Y = shuffle(X,Y)

## 构建Siamese模型

In [12]:
import keras
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Flatten
from keras.models import Model
from sklearn import metrics
from keras import backend as K
from keras.layers import Embedding,Lambda
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD,RMSprop,Adam
from keras.callbacks import TensorBoard

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
EMBEDDING_DIM = 100  # last dimension of both model inputs (one feature vector per time step)
MAX_A_LENGTH = 100  # time steps of input A; also passed to LSTM() as its unit count
MAX_B_LENGTH = 100  # time steps of input B

## 定义的损失函数

In [14]:
def contrastive_loss(y_true, y_pred):
    """Contrastive-style loss for a model whose output grows with similarity.

    For ``y_true == 1`` the loss is ``1.1 * max(0, 2 - y_pred) ** 2``, which is
    zero once the prediction reaches the margin of 2. For ``y_true == 0`` the
    loss is ``y_pred ** 2``, which pulls the prediction towards zero.
    """
    margin = 2

    matched_term = 1.1 * (y_true - 0.0) * K.square(K.maximum(0.0, margin - y_pred))
    mismatched_term = (1.0 - y_true) * K.square(y_pred)
    return matched_term + mismatched_term

## 定义的距离函数

In [15]:
import keras.backend as K
def exp_distance(vects):
    """Turn the Euclidean distance between two encodings into a similarity score.

    Args:
        vects: pair ``(x, y)`` of tensors; the squared difference is summed
            over axis 1.

    Returns:
        Tensor ``exp(2.5 - ||x - y||)`` with the reduced axis kept (size 1).
        The value approaches ``exp(2.5)`` as the distance goes to zero and
        decreases as the distance grows.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # The gradient of sqrt is unbounded at 0, so two identical encodings would
    # produce NaN gradients; clamp to epsilon before taking the root.
    return K.exp(2.5 - K.sqrt(K.maximum(sum_square, K.epsilon())))

def dist_output_shape(shapes):
    """Return the output shape of the distance layer: (batch size, 1).

    ``shapes`` must hold exactly two shape tuples; the batch dimension is
    taken from the first one.
    """
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

In [16]:
# Inputs: one (time steps, EMBEDDING_DIM) float sequence for each side of the pair.
A_sequence_input = Input(shape=(MAX_A_LENGTH,EMBEDDING_DIM), dtype='float32')
B_sequence_input = Input(shape=(MAX_B_LENGTH,EMBEDDING_DIM), dtype='float32')

# Bidirectional LSTM applied to both inputs (same layer instance, so its weights are shared).
# Only the final state is returned; MAX_A_LENGTH is reused here as the unit count.
shared_lstm = Bidirectional(LSTM(MAX_A_LENGTH, return_sequences = False))

# NOTE(review): each side gets its own BatchNormalization instance, so the
# normalisation parameters are not shared between A and B.
BN_A = BatchNormalization()(A_sequence_input)
encoded_A = shared_lstm(BN_A)

BN_B = BatchNormalization()(B_sequence_input)
encoded_B = shared_lstm(BN_B)

# Per-side projection head: dropout followed by a linear Dense layer.
# NOTE(review): these Dense layers are separate instances as well, so only the
# LSTM is actually shared between the two towers - confirm this is intended.
encoded_A = Dropout(rate=0.1)(encoded_A)
encoded_A = Dense(units=128)(encoded_A)


encoded_B = Dropout(rate=0.1)(encoded_B)
encoded_B = Dense(units=128)(encoded_B)

# Single output: exp_distance applied to the two 128-d encodings.
distance = Lambda(exp_distance,output_shape=dist_output_shape)([encoded_A, encoded_B])

model = Model(inputs=[A_sequence_input, B_sequence_input], outputs=distance)    

# optimizer
# NOTE(review): `rms` is created but never passed to compile(); only `adam` is used.
rms = RMSprop()
adam = Adam()

model.compile(optimizer=adam,
              loss=contrastive_loss,  # 'categorical_crossentropy',
              metrics=['accuracy'])


In [17]:
## Print the model structure
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100, 100)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100, 100)     0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 100, 100)     400         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, 100, 100)     400         input_2[0][0]                    
__________________________________________________________________________________________________
bidirectio

## 开始训练(以生成器的方式)

In [18]:
#Y = keras.utils.to_categorical(np.asarray(Y),2)

In [19]:
import word2vec
def generate_batch_data_random(x, y, batch_size, w2v_path='./text8.bin'):
    """Endlessly yield shuffled training batches, embedding the text on the fly.

    Embedding one batch at a time keeps memory use low compared with
    embedding the whole data set up front.

    Args:
        x: sequence of ``[a, b]`` pairs; ``a[-1]`` and ``b[-1]`` are iterated
            as lines of space-separated words.
        y: labels aligned with ``x``.
        batch_size: number of pairs per yielded batch.
        w2v_path: path of the word2vec model to load.

    Yields:
        ``([batch_A, batch_B], batch_y)`` as numpy arrays. Every line is
        represented by the sum of the vectors of its in-vocabulary words.

    Raises:
        ValueError: if ``x`` holds fewer than ``batch_size`` pairs; no batch
            could ever be produced and the loop would otherwise spin forever.
    """
    if len(x) < batch_size:
        raise ValueError(
            'need at least batch_size=%d samples, got %d' % (batch_size, len(x)))

    # Load the word2vec model once per generator.
    w2v_model = word2vec.load(w2v_path)
    # Build the membership set once instead of testing against the raw vocab
    # container for every single word.
    known_words = set(w2v_model.vocab)

    def _embed_lines(lines):
        """Return one summed word vector (as a list) per line of text."""
        seq = []
        for line in lines:
            vectors = [w2v_model[word]
                       for word in line.strip().split(' ')
                       if word in known_words]
            if vectors:
                # Same quantity as len(vectors) * mean(vectors), up to rounding.
                seq.append(np.sum(vectors, axis=0).tolist())
            else:
                # A line without any known word used to become a NaN scalar,
                # which made the batch ragged; use a zero vector instead.
                seq.append([0.0] * EMBEDDING_DIM)
        return seq

    while True:

        x, y = shuffle(x, y)

        # "+ 1" keeps the last full batch when len(x) is a multiple of batch_size.
        for end in range(batch_size, len(x) + 1, batch_size):
            start = end - batch_size
            batch_A = [_embed_lines(item[0][-1]) for item in x[start:end]]
            batch_B = [_embed_lines(item[1][-1]) for item in x[start:end]]
            batch_y = np.asarray(y[start:end])

            yield [np.asarray(batch_A), np.asarray(batch_B)], batch_y
                        

In [22]:
# Train from the batch generator; one epoch covers len(X) // batch_size batches
# and progress is logged for TensorBoard under ./log/.
batch_size = 1024
epoch = 10
train_batches = generate_batch_data_random(X, Y, batch_size)
tensorboard_logger = TensorBoard(log_dir='./log/')
model.fit_generator(
    train_batches,
    steps_per_epoch=len(X) // batch_size,
    epochs=epoch,
    verbose=1,
    callbacks=[tensorboard_logger],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa9484b1c18>

# 进行测试

In [23]:
# Load the test dictionary. The context manager closes the file handle as
# soon as unpickling finishes (the original left it open).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./dataset/test.pkl', 'rb') as f:
    test = pickle.load(f)

In [24]:
# Drop the last two characters of every test key and de-duplicate, leaving one
# identifier per record pair (looked up later as id + '-A' / '-B').
ID = list({key[:-2] for key in test.keys()})
print(len(ID))

1948


In [25]:
# test_X collects [a, b] record pairs, test_Y the matching 0/1 labels.
test_X, test_Y = [], []

## 获取label为1的测试数据

In [26]:
# Matching test pairs: the A and B records of the same identifier get label 1.
for pair_id in ID:
    test_X.append([test[pair_id + '-A'], test[pair_id + '-B']])
    test_Y.append(1)

In [27]:
# Number of label-1 test pairs collected so far.
len(test_X)

1948

## 获取label为0的测试数据

+ 不平衡比例为3：1，此类数据的数量约为6000组

In [29]:
# Build label-0 (mismatched) test pairs: the A record of a randomly chosen
# anchor is paired with the B records of other identifiers.
TEST_NEG_ANCHORS = 600      # how many anchor identifiers to draw
TEST_NEG_PER_ANCHOR = 10    # mismatched partners per anchor (600 * 10 = 6000 pairs)

# random.sample draws without replacement and leaves ID untouched, instead of
# reshuffling the whole shared list in place once per anchor.
for i in random.sample(ID, min(TEST_NEG_ANCHORS, len(ID))):
    # Draw one spare candidate so that TEST_NEG_PER_ANCHOR partners remain even
    # when the anchor itself is among the candidates (identifiers are unique).
    candidates = random.sample(ID, min(TEST_NEG_PER_ANCHOR + 1, len(ID)))
    partners = [j for j in candidates if j != i][:TEST_NEG_PER_ANCHOR]
    a = test[i + '-A']
    for j in partners:
        test_X.append([a, test[j + '-B']])
        test_Y.append(0)
            
           

In [30]:
# Total number of test labels (label-1 plus label-0 pairs).
len(test_Y)

7948

## 测试数据打乱

In [31]:
import numpy as np
from sklearn.utils import shuffle

In [32]:
# Shuffle test pairs and labels with the same permutation so they stay aligned.
test_X,test_Y = shuffle(test_X,test_Y)

## 预测

In [33]:
import word2vec
# Load the word2vec model
w2v_model = word2vec.load('./text8.bin')

def generate_batch_test_data_random(x, w2v=None, embedding_dim=None):
    """Embed a slice of test pairs into the two input arrays of the model.

    Despite its name this is a plain function, not a generator, and nothing
    in it is random; callers limit memory use by passing one slice at a time.

    Args:
        x: sequence of ``[a, b]`` pairs; ``a[-1]`` and ``b[-1]`` are iterated
            as lines of space-separated words.
        w2v: object supporting ``w2v.vocab`` (iterable of known words) and
            ``w2v[word]`` (vector lookup). Defaults to the module-level
            ``w2v_model``.
        embedding_dim: length of the zero vector used for a line without any
            known word. Defaults to the module-level ``EMBEDDING_DIM``.

    Returns:
        ``[batch_A, batch_B]`` as numpy arrays. Every line is represented by
        the sum of the vectors of its in-vocabulary words.
    """
    if w2v is None:
        w2v = w2v_model
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM

    # Build the membership set once instead of testing against the raw vocab
    # container for every single word.
    known_words = set(w2v.vocab)

    def _embed_lines(lines):
        """Return one summed word vector (as a list) per line of text."""
        seq = []
        for line in lines:
            vectors = [w2v[word]
                       for word in line.strip().split(' ')
                       if word in known_words]
            if vectors:
                # Same quantity as len(vectors) * mean(vectors), up to rounding.
                seq.append(np.sum(vectors, axis=0).tolist())
            else:
                # A line without any known word used to become a NaN scalar,
                # which made the batch ragged; use a zero vector instead.
                seq.append([0.0] * embedding_dim)
        return seq

    batch_A = [_embed_lines(item[0][-1]) for item in x]
    batch_B = [_embed_lines(item[1][-1]) for item in x]

    return [np.asarray(batch_A), np.asarray(batch_B)]
                        

In [34]:
# Score the test pairs slice by slice and turn each score into a 0/1 label.
# NOTE(review): the range stops before len(test_X), so pairs after the last
# full slice of `batch_size` never receive a prediction.
pred = []
batch_size = 1000
threshold = 2.4
for stop in range(batch_size, len(test_X), batch_size):
    chunk = test_X[stop - batch_size:stop]
    batch_pred = model.predict(generate_batch_test_data_random(chunk))
    # A score above the threshold is labelled 1, anything else 0.
    pred.extend(1 if score > threshold else 0 for score in batch_pred)
        
        

In [35]:
y_pred = pred

# Predictions were produced for a prefix of test_X in order, so align the
# ground truth with however many predictions exist instead of a hard-coded 7000.
y_true = test_Y[:len(y_pred)]

from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_true,y_pred)
print(mat)

[[5293    0]
 [   0 1707]]


In [36]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the thresholded predictions.
print(classification_report(y_true,y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5293
          1       1.00      1.00      1.00      1707

avg / total       1.00      1.00      1.00      7000



In [37]:
import matplotlib  
import matplotlib.pyplot as plt  
import matplotlib.cm as cm 
from sklearn.metrics import confusion_matrix 
import numpy as np

 
# Draw the confusion matrix as a colour-coded grid.
labels = [0, 1]
# NOTE: this rebinds `cm`, which the imports above bound to matplotlib.cm.
cm = confusion_matrix(y_true, y_pred, labels=labels)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Columns are predicted labels, rows are true labels; size each axis from its
# own dimension (the y ticks previously used the column count).
plt.xticks(np.arange(cm.shape[1]), labels)
plt.yticks(np.arange(cm.shape[0]), labels)
plt.show()

<matplotlib.figure.Figure at 0x7fa9d4d20438>