-
Notifications
You must be signed in to change notification settings - Fork 3
/
predict.py
54 lines (48 loc) · 2.11 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 14 18:54:59 2018
@author: dujingb
"""
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from keras.models import load_model
from keras.preprocessing import sequence
def _with_filename_prefix(path, prefix):
    """Return *path* with *prefix* prepended to its final component only."""
    directory, filename = os.path.split(path)
    return os.path.join(directory, prefix + filename)


def predict(test_path, tokenizer_path, result_path, isBestWeightUsed=False,
            max_length=1024):
    """Predict per-class probabilities for the test set and save them as CSV.

    Args:
        test_path: CSV file containing an ``api`` column (the texts passed to
            the tokenizer) and a ``file_id`` column (copied to the output).
        tokenizer_path: Path to a pickled tokenizer object that provides
            ``texts_to_sequences``.
        result_path: Destination CSV path. When ``isBestWeightUsed`` is true,
            ``best-`` is prepended to the file name (not to the directory).
        isBestWeightUsed: If true, load ``model/best-lstm_weights.h5``;
            otherwise load ``model/lstm_weights.h5``.
        max_length: ``maxlen`` passed to ``pad_sequences``. Presumably this
            has to match the length used at training time -- confirm.

    Returns:
        None. The output CSV has a ``file_id`` column followed by one
        ``prob<i>`` column per model output, rounded to 7 decimal places.
    """
    # Loading processed word dictionary into keras Tokenizer would be better
    print('load data...')
    # NOTE(security): unpickling executes arbitrary code; only load tokenizer
    # files that come from a trusted source.
    with open(tokenizer_path, 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    # Parse the CSV once and take both columns from the same frame.
    test_df = pd.read_csv(test_path)
    text = test_df['api'].values
    file_id = test_df['file_id'].values

    print('transfer texts to sequences...then pad each sequence...')
    X = tokenizer.texts_to_sequences(text)
    X = sequence.pad_sequences(X, maxlen=max_length)

    print('load model...')
    model = load_model('model/lstm_model.h5')
    if isBestWeightUsed:
        # Use the best weights saved by ModelCheckpoint (minimum val_loss).
        model.load_weights('model/best-lstm_weights.h5')
        # Prefix only the file name so the output stays in the same directory.
        result_path = _with_filename_prefix(result_path, 'best-')
    else:
        # Use the final weights saved after all epochs finished.
        model.load_weights('model/lstm_weights.h5')
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    print('prediction...')
    # ``predict`` is used instead of ``predict_proba``, which newer Keras
    # releases no longer provide.
    prediction = model.predict(X, batch_size=32, verbose=0)
    # One column per model output instead of a fixed list of eight names.
    columns = ['prob{}'.format(i) for i in range(prediction.shape[1])]
    df = pd.DataFrame(columns=columns, data=prediction)
    df = df.round(7)  # keep seven decimal places
    df.insert(0, 'file_id', file_id)
    df.to_csv(result_path, index=False)
    print('save probability of each label finished!')
if __name__ == '__main__':
    # Script entry point: run prediction on the default test set and write the
    # probabilities to a timestamped CSV under data/.
    test_path = 'data/test_set.csv'
    tokenizer_path = 'build/tokenizer.pkl'
    # Zero-padded hour and minute keep file names unambiguous and sortable
    # (unpadded, 01:23 and 12:03 would both be rendered as "123").
    time_str = datetime.now().strftime('%Y-%m-%d_%H%M')
    result_path = 'data/result_{}.csv'.format(time_str)
    isBestWeightUsed = False
    predict(test_path, tokenizer_path, result_path, isBestWeightUsed)