# predict.py

# -*- coding: utf-8 -*-
"""
Created on Fri Dec 14 18:54:59 2018

@author: dujingb
"""

import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from keras.models import load_model
from keras.preprocessing import sequence

def predict(test_path, tokenizer_path, result_path, isBestWeightUsed=False,
            max_length=1024):
    """Predict class probabilities for a test set and save them as CSV.

    The test CSV is tokenized with a pickled tokenizer, padded to
    ``max_length``, and fed to the LSTM model stored under ``model/``.
    The output CSV has a ``file_id`` column followed by one ``prob<i>``
    column per model output, rounded to seven decimal places.

    Args:
        test_path: Path to a CSV file with ``api`` and ``file_id`` columns.
        tokenizer_path: Path to a pickled tokenizer object exposing
            ``texts_to_sequences``.
        result_path: Path of the CSV file to write.
        isBestWeightUsed: If True, load ``model/best-lstm_weights.h5``
            (the checkpoint with minimum val_loss) and prefix the output
            file name with ``best-``; otherwise load the final weights
            from ``model/lstm_weights.h5``.
        max_length: Length every sequence is padded/truncated to.
    """
    # Loading processed word dictionary into keras Tokenizer would be better
    print('load data...')
    # NOTE(security): pickle can execute arbitrary code while loading;
    # only point tokenizer_path at files you created yourself.
    with open(tokenizer_path, 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    # Read the CSV once and take both columns from the same frame.
    test_data = pd.read_csv(test_path)
    text = test_data['api'].values
    file_id = test_data['file_id'].values
    print('transfer texts to sequences...then pad each sequence...')
    X = tokenizer.texts_to_sequences(text)
    X = sequence.pad_sequences(X, maxlen=max_length)
    print('load model...')
    model = load_model('model/lstm_model.h5')
    if isBestWeightUsed:
        # Use the best weights saved by ModelCheckpoint (minimum val_loss).
        model.load_weights('model/best-lstm_weights.h5')
        # Prefix only the file name: prefixing the whole path would turn
        # 'data/result.csv' into 'best-data/result.csv' (a different,
        # usually non-existent directory).
        directory, filename = os.path.split(result_path)
        result_path = os.path.join(directory, 'best-' + filename)
    else:
        # Use the final weights saved after all epochs finished.
        model.load_weights('model/lstm_weights.h5')
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    print('prediction...')
    # predict() returns the same probabilities predict_proba() did and is
    # still available in Keras releases that dropped predict_proba().
    prediction = model.predict(X, batch_size=32, verbose=0)
    # One column per model output instead of a hard-coded list of eight.
    columns = ['prob{}'.format(i) for i in range(prediction.shape[1])]
    df = pd.DataFrame(columns=columns, data=prediction)
    df = df.round(7)  # keep seven decimal places
    df.insert(0, 'file_id', file_id)
    df.to_csv(result_path, index=False)
    print('save probability of each label finished!')
	
if __name__ == '__main__':
    # Script entry point: run prediction on the default test set and write
    # the result to a timestamped CSV file under data/.
    test_path = 'data/test_set.csv'
    tokenizer_path = 'build/tokenizer.pkl'
    # Zero-padded hour and minute keep the name unambiguous: without
    # padding, 1:23 and 12:03 would both be rendered as '123'.
    time_str = datetime.now().strftime('%Y-%m-%d_%H%M')
    result_path = 'data/result_{}.csv'.format(time_str)
    isBestWeightUsed = False
    predict(test_path, tokenizer_path, result_path, isBestWeightUsed)