-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
123 lines (91 loc) · 2.84 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding=utf-8
import preprocess
import pickle
import BLSTM
import os
import random
import shutil
import logging
import pr
import numpy as np
import sys
# Send all log output (DEBUG and above) to modelrun.log, truncating the
# file at the start of each run (filemode='w').
logging.basicConfig(level = logging.DEBUG,
                    filename = 'modelrun.log',
                    filemode = 'w')
# --- Hyperparameters / data settings (Python 2 script) ---
batch_size = 20      # minibatch size — NOTE(review): not referenced in this file; presumably consumed by BLSTM
cdic_size = 5000     # character-vocabulary size cap — TODO confirm: unused in this chunk
worddim = 50         # dimensionality of the pretrained word embeddings ("skip_neg10_50")
chardim = 50         # dimensionality of the pretrained character embeddings ("char_vector_50")
datafile= "data.txt" # annotated corpus read by preprocess.getwData / getcData
window_size = 0      # context-window size — unused here as written
top_n = 1            # NOTE(review): unused in this chunk
em_num = 1           # NOTE(review): unused in this chunk
bsize = 10           # NOTE(review): unused in this chunk
# ---- Word-level pipeline: load corpus, build vocab/embeddings, split data,
# and express the test-set entities in CHARACTER offsets so they can later be
# compared against the char-level model's entities.
# (Indentation reconstructed — the pasted source had lost leading whitespace.)
data = preprocess.getwData(datafile)
# Build tag -> integer-id dictionary from the tag column of every sentence.
tags = preprocess.tags2dic(map(lambda x:x[1], data))
# Pretrained word vectors; wdic maps word -> row index into wvectors.
wdic, wvectors = preprocess.words2dic2("skip_neg10_50", worddim)
# Register a <padding> token as one extra (random) embedding row at the end.
padding_id = wdic["<padding>"] = len(wvectors)
pids = [padding_id]
wvectors.append(np.random.randn(worddim))
embedding = wvectors
# Convert raw (words, tags) pairs into index sequences for the model.
indexdata = preprocess.raw2num1(data,wdic,tags,0,padding_id)
# Record the character offset of every word within its sentence; each
# per-sentence list ends with the sentence's total character length as a
# sentinel, so wordloc[i][k] is where word k starts (in characters).
wordloc = []
for item in data:
    wordloc.append([])
    loc = 0
    for word in item[0]:
        wordloc[-1].append(loc)
        loc += len(word)
    wordloc[-1].append(loc)
# Debug dump of the first sentence: each word with its character offset
# (Python 2 print statement; words are unicode, hence the explicit encode).
for word,loc in zip(data[0][0],wordloc[0]):
    print word.encode('utf-8')+str(loc)
# 16/20 train, 2/20 dev, 2/20 test split (Python 2 integer division "/").
testlocs = wordloc[len(indexdata)/20*18:len(indexdata)]
traindata = indexdata[0:len(indexdata)/20*16]
devdata = indexdata[len(indexdata)/20*16:len(indexdata)/20*18]
testdata = indexdata[len(indexdata)/20*18:len(indexdata)]
# Decode the test-set tag sequences (item[1]) back into entity triples —
# presumably gold spans; confirm pr.pre2en semantics against its definition.
wresult1 = []
for item in testdata:
    wresult1.append(pr.pre2en(tags, item[1]))
# Convert word-indexed (start, end, type) entities into character-indexed
# (start, end, type) entities using the per-sentence offset table above.
wresult2 = []
for result,sentence in zip(wresult1,testlocs):
    wresult2.append([])
    print sentence
    for entity in result:
        print entity
        wresult2[-1].append((sentence[entity[0]], sentence[entity[1]], entity[2]))
wresult = wresult2
# ---- Char-level pipeline: same preprocessing as the word-level section,
# but at character granularity; its test-set entities (cgold) are already in
# character offsets. (Indentation reconstructed from a whitespace-mangled paste.)
data = preprocess.getcData(datafile)
cdic, cvectors = preprocess.chars2dic2("char_vector_50",chardim)
# Register a <padding> character as one extra (random) embedding row.
padding_id = cdic["<padding>"] = len(cvectors)
pids = [padding_id]
cvectors.append(np.random.randn(chardim))
embedding = cvectors
# Reuses the tag dictionary built in the word-level section above.
indexdata = preprocess.raw2num1(data,cdic,tags,0,padding_id)
# Same 16/2/2 split boundaries as the word-level section (Python 2 "/").
traindata = indexdata[0:len(indexdata)/20*16]
devdata = indexdata[len(indexdata)/20*16:len(indexdata)/20*18]
testdata = indexdata[len(indexdata)/20*18:len(indexdata)]
# Raw character sequences for the test slice, kept for printing later.
testwdata = data[len(indexdata)/20*18:len(indexdata)]
# Entities decoded from the char-level test tag sequences (char offsets).
cgold = []
for item in testdata:
    cgold.append(pr.pre2en(tags,item[1]))
# Compare the word-level entities (converted to char offsets) against the
# char-level entities, sentence by sentence.  A sentence counts as an error
# as soon as one aligned entity pair disagrees on start, end, or type; its
# index is remembered in `lines` for the report below.  `isum` counts every
# entity pair examined up to and including the first mismatch per sentence.
error = 0
isum = 0
lines = []
iid = 0
for word_ents, char_ents in zip(wresult, cgold):
    for went, cent in zip(word_ents, char_ents):
        isum += 1
        if (went[0], went[1], went[2]) != (cent[0], cent[1], cent[2]):
            error += 1
            lines.append(iid)
            break
    iid += 1
# Report: for each mismatched sentence, print the sentence followed by the
# word-level vs char-level surface strings of each aligned entity pair,
# then a final tally.  (Python 2 print statements; indentation reconstructed.)
for iid in lines:
    print " ".join(testwdata[iid][0])
    for en1,en2 in zip(wresult[iid],cgold[iid]):
        # en1/en2 hold (start, end, type) in character offsets; slicing the
        # raw character sequence recovers each entity's surface string.
        print "".join(testwdata[iid][0][en1[0]:en1[1]])+" "+ "".join(testwdata[iid][0][en2[0]:en2[1]])
# isum = entity pairs compared, error = sentences with at least one mismatch.
print "sum : "+str(isum)+" error : "+str(error)