-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.py
339 lines (300 loc) · 14.1 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# coding=utf-8
import cv2
import pytesseract
import os
import textdistance
import numpy as np
import pandas as pd
from operator import itemgetter, attrgetter
# 路径配置
ROOT_PATH = os.getcwd()
IMAGE_PATH = os.path.join(ROOT_PATH, 'images/ktp6.png')
LINE_REC_PATH = os.path.join(ROOT_PATH, 'data/ID_CARD_KEYWORD.csv')
CITIES_REC_PATH = os.path.join(ROOT_PATH, 'data/CITIES.csv')
RELIGION_REC_PATH = os.path.join(ROOT_PATH, 'data/RELIGION.csv')
MARRIAGE_REC_PATH = os.path.join(ROOT_PATH, 'data/MARRIAGE.csv')
NEED_COLON = [3, 4, 6, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21]
NEXT_LINE = 9
ID_NUMBER = 3
# 对身份证上的信息进行初步OCR
def ocr_raw(image_path):
# (1) Read
img_raw = cv2.imread(image_path)
image = cv2.resize(img_raw, (50 * 16, 500))
img_gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
id_number = return_id_number(image, img_gray)
# img_gray = cv2.cvtColor(img_raw, cv2.COLOR_BGR2GRAY)
# (2) Threshold
cv2.fillPoly(img_gray, pts=[np.asarray([(540, 150), (540, 499), (798, 499), (798, 150)])], color=(255, 255, 255))
th, threshed = cv2.threshold(img_gray, 127, 255, cv2.THRESH_TRUNC)
# (3) Detect
result_raw = pytesseract.image_to_string(threshed, lang="ind")
return result_raw, id_number
# 去除result_raw中的无效信息
def strip_op(result_raw):
result_list = result_raw.split('\n')
new_result_list = []
for tmp_result in result_list:
if tmp_result.strip(' '):
new_result_list.append(tmp_result)
return new_result_list
# KTP_ number
# NIK_NUMBER模版匹配识别
# 从左到右进行排列识别
def sort_contours(cnts, method="left-to-right"):
reverse = False
i = 0
if method == "right-to-left" or method == "bottom-to-top":
reverse = True
if method == "top-to-bottom" or method == "bottom-to-top":
i = 1
boundingBoxes = [cv2.boundingRect(c) for c in cnts] # 用一个最小的矩形,把找到的形状包起来x,y,h,w
(cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
key=lambda b: b[1][i], reverse=reverse))
return cnts, boundingBoxes
def return_id_number(image, gray):
# ret, thresh1 = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)#二值处理
# 定义核函数
rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)) # 返回矩形核函数
# sqKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
tophat = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, rectKernel) # 顶帽
gradX = cv2.Sobel(tophat, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
# ksize=-1相当于用3*3的 sobel过滤器,由于 Sobel 函数求导数后会有负值,还会有大于 255 的值,而原图像是 uint8 ,
# 所以 Sobel 建立的图像位数不够,会有截断。因此要使用 16 位有符号的数据类型,即 cv2.CV_16S
gradX = np.absolute(gradX)
(minVal, maxVal) = (np.min(gradX), np.max(gradX))
gradX = (255 * ((gradX - minVal) / (maxVal - minVal)))
gradX = gradX.astype("uint8") # Sobel之后无法显示,将其转回原来的 uint8 形式,否则无法显示图像
# print (np.array(gradX).shape)
# 通过闭操作(先膨胀,再腐蚀)将数字连在一起
gradX = cv2.morphologyEx(gradX, cv2.MORPH_CLOSE, rectKernel)
# THRESH_OTSU会自动寻找合适的阈值,适合双峰,需把阈值参数设置为0
thresh = cv2.threshold(gradX, 0, 255,
cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# 再来一个闭操作
thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, rectKernel) # 再来一个闭操作
threshCnts, hierarchy = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # 寻找轮廓
cnts = threshCnts
cur_img = image.copy()
cv2.drawContours(cur_img, cnts, -1, (0, 0, 255), 3)
copy = image.copy()
locs = []
# 遍历轮廓
for (i, c) in enumerate(cnts):
# 计算矩形
(x, y, w, h) = cv2.boundingRect(c)
ar = w / float(h)
# 选择合适的区域,根据实际任务来,这里的基本都是四个数字一组
# if ar >10:
# if (w > 40 ) and (h > 10 and h < 20):
# 符合的留下来
if h > 10 and w > 100 and x < 300:
img = cv2.rectangle(copy, (x, y), (x + w, y + h), (0, 255, 0), 2)
locs.append((x, y, w, h, w * h))
# 将符合的轮廓按照面积从大到小排序
locs = sorted(locs, key=itemgetter(1), reverse=False)
# print(locs[1][0])
nik = image[locs[1][1] - 10:locs[1][1] + locs[1][3] + 10, locs[1][0] - 10:locs[1][0] + locs[1][2] + 10]
# cv_show('nik', nik)
# print(nik)
text = image[locs[2][1] - 10:locs[2][1] + locs[2][3] + 10, locs[2][0] - 10:locs[2][0] + locs[2][2] + 10]
# 读取一个模板图像
img = cv2.imread("images/module.jpeg")
# 灰度图
ref = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# 二值图像
ref = cv2.threshold(ref, 66, 255, cv2.THRESH_BINARY_INV)[1]
# 计算轮廓
# cv2.findContours()函数接受的参数为二值图,即黑白的(不是灰度图),cv2.RETR_EXTERNAL只检测外轮廓,cv2.CHAIN_APPROX_SIMPLE只保留终点坐标
# 返回的list中每个元素都是图像中的一个轮廓
refCnts, hierarchy = cv2.findContours(ref.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# cv2.drawContours(img, refCnts, -1, (0, 0, 255), 3)#上颜色
# cv2.imshow('888', img)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
refCnts = sort_contours(refCnts, method="left-to-right")[0] # 排序,从左到右,从上到下
digits = {}
# 遍历每一个轮廓
for (i, c) in enumerate(refCnts):
# 计算外接矩形并且resize成合适大小
(x, y, w, h) = cv2.boundingRect(c)
# if
roi = ref[y:y + h, x:x + w]
roi = cv2.resize(roi, (57, 88))
# 每一个数字对应每一个模板
digits[i] = roi
# cv_show('digits[i]', digits[i])
# nik = np.clip(nik, 0, 255)
# nik = np.array(nik,np.uint8)
# NIK识别
gray_nik = cv2.cvtColor(nik, cv2.COLOR_BGR2GRAY)
# gray_nik = cv2.GaussianBlur(gray_nik, (3, 3), 0)
# ret_nik, thresh_nik = cv2.threshold(gray_nik, 127, 255, cv2.THRESH_BINARY)
# cv2.imshow('8', gray_nik)
# cv2.waitKey(0)
group = cv2.threshold(gray_nik, 127, 255, cv2.THRESH_BINARY_INV)[1]
# cv2.imshow('9', group)
# 计算每一组的轮廓
digitCnts, hierarchy_nik = cv2.findContours(group.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
nik_r = nik.copy()
cv2.drawContours(nik_r, digitCnts, -1, (0, 0, 255), 3)
# 定位NIK的位置
gX = locs[1][0]
gY = locs[1][1]
gW = locs[1][2]
gH = locs[1][3]
ctx = sort_contours(digitCnts, method="left-to-right")[0]
locs_x = []
# 遍历轮廓
for (i, c) in enumerate(ctx):
# 计算矩形
(x, y, w, h) = cv2.boundingRect(c)
# 选择合适的区域,根据实际任务来,这里的基本都是四个数字一组
if h > 10 and w > 10:
img = cv2.rectangle(nik_r, (x, y), (x + w, y + h), (0, 255, 0), 2)
locs_x.append((x, y, w, h))
# digitCnts = sort_contours(digitCnts, method="left-to-right")[0]
output = []
groupOutput = []
# 计算每一组中的每一个数值
for c in locs_x:
# 找到当前数值的轮廓,resize成合适的的大小
(x, y, w, h) = c
roi = group[y:y + h, x:x + w]
roi = cv2.resize(roi, (57, 88))
# cv_show('roi',roi)
# 计算匹配得分
scores = []
# 在模板中计算每一个得分
for (digit, digitROI) in digits.items():
# 模板匹配
result = cv2.matchTemplate(roi, digitROI,
cv2.TM_CCOEFF)
(_, score, _, _) = cv2.minMaxLoc(result)
scores.append(score)
# 得到最合适的数字
groupOutput.append(str(np.argmax(scores)))
# 画出来
cv2.rectangle(image, (gX - 5, gY - 5),
(gX + gW + 5, gY + gH + 5), (0, 0, 255), 1)
cv2.putText(image, "".join(groupOutput), (gX, gY - 15),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
# 图像,文字内容, 坐标 ,字体,大小,颜色,字体厚度
# 得到结果
output.extend(groupOutput)
return ''.join(output)
def main():
raw_df = pd.read_csv(LINE_REC_PATH, header=None)
# 读入城市字典
cities_df = pd.read_csv(CITIES_REC_PATH, header=None)
# 读入宗教字典
religion_df = pd.read_csv(RELIGION_REC_PATH, header=None)
# 读入婚姻字典
marriage_df = pd.read_csv(MARRIAGE_REC_PATH, header=None)
result_raw, id_number = ocr_raw(IMAGE_PATH)
result_list = strip_op(result_raw)
# test
# result_list = ['—————— ————— —&——— ——————', 'PROVINSI DAEGAH ISTIMAWA YOGYEKARTA', 'KABUPATEN SLEMQN', 'NIK : 347LL40209790001', 'Nama :RIYANTO. SE', 'Tempat/Tgi Lahir : GROBOGAN. 02-09-1979', 'Jenis Kelamin : LAKI-LAKI Gol Darah :O', 'Alamat PRM PURI DOMAS D-3. SEMPU', 'RTARW 001: 024', 'Kel/Desa : WEDOMARTANI', 'Kecamatan : NGEMPLAK', 'Agama :ISPAM', 'Status Perkawman: KAWAN PE', 'Pekerjaan : PEDAGANG 05-06-2012', 'Kewarganegaraan: WNI SA—', 'Berlaku Hingga :02-09-2017 N EA']
print('-------origin rec-------')
print(result_list)
loc2index = dict()
for i, tmp_line in enumerate(result_list):
for j, tmp_word in enumerate(tmp_line.split(' ')):
tmp_sim_list = [textdistance.damerau_levenshtein.normalized_similarity(tmp_word_, tmp_word.strip(':')) for
tmp_word_ in
raw_df[0].values]
tmp_sim_np = np.asarray(tmp_sim_list)
arg_max = np.argmax(tmp_sim_np)
if tmp_sim_np[arg_max] >= 0.6:
loc2index[(i, j)] = arg_max ##最高相似度
print('--------processed------------')
# 数据处理
last_result_list = []
useful_info = False
for i, tmp_line in enumerate(result_list):
tmp_list = []
for j, tmp_word in enumerate(tmp_line.split(' ')):
tmp_word = tmp_word.strip(':')
if (i, j) in loc2index:
useful_info = True
if loc2index[(i, j)] == NEXT_LINE: ##换行
last_result_list.append(tmp_list)
tmp_list = []
tmp_list.append(raw_df[0].values[loc2index[(i, j)]])
if loc2index[(i, j)] in NEED_COLON:
tmp_list.append(':')
elif tmp_word == ':' or tmp_word == '':
continue
else:
tmp_list.append(tmp_word)
if useful_info:
if len(last_result_list) > 2 and ':' not in tmp_list:
last_result_list[-1].extend(tmp_list)
else:
last_result_list.append(tmp_list)
# print(last_result_list)
for tmp_data in last_result_list:
if '—' in tmp_data:
tmp_data.remove('—')
# 对前两行进行处理
if 'PROVINSI' in tmp_data or 'KABUPATEN' in tmp_data or 'KOTA' in tmp_data:
for tmp_index, tmp_word in enumerate(tmp_data[1:]):
# 计算相似度
tmp_sim_list = [textdistance.damerau_levenshtein.normalized_similarity(tmp_word, tmp_word_) for
tmp_word_ in cities_df[0].values]
tmp_sim_np = np.asarray(tmp_sim_list)
# 获得相似度最大单词索引
arg_max = np.argmax(tmp_sim_np)
if tmp_sim_np[arg_max] >= 0.6:
tmp_data[tmp_index + 1] = cities_df[0].values[arg_max]
# 对身份证一行进行处理
if 'NIK' in tmp_data:
if len(id_number) != 16:
id_number = tmp_data[2]
if "D" in id_number:
id_number = id_number.replace("D", "0")
if "?" in id_number:
id_number = id_number.replace("?", "7")
if "L" in id_number:
id_number = id_number.replace("L", "1")
while len(tmp_data) > 2:
tmp_data.pop()
tmp_data.append(id_number)
else:
while len(tmp_data) > 3:
tmp_data.pop()
tmp_data[2] = id_number
# 对宗教一行进行处理
if 'Agama' in tmp_data:
for tmp_index, tmp_word in enumerate(tmp_data[1:]):
# 计算相似度
tmp_sim_list = [textdistance.damerau_levenshtein.normalized_similarity(tmp_word, tmp_word_) for
tmp_word_ in religion_df[0].values]
tmp_sim_np = np.asarray(tmp_sim_list)
# 获得相似度最大单词索引
arg_max = np.argmax(tmp_sim_np)
if tmp_sim_np[arg_max] >= 0.6:
tmp_data[tmp_index + 1] = religion_df[0].values[arg_max]
# 对婚姻一行进行处理
if 'Status' in tmp_data or 'Perkawinan' in tmp_data:
for tmp_index, tmp_word in enumerate(tmp_data[2:]):
# 计算相似度
tmp_sim_list = [textdistance.damerau_levenshtein.normalized_similarity(tmp_word, tmp_word_) for
tmp_word_ in marriage_df[0].values]
tmp_sim_np = np.asarray(tmp_sim_list)
# 获得相似度最大单词索引
arg_max = np.argmax(tmp_sim_np)
if tmp_sim_np[arg_max] >= 0.6:
tmp_data[tmp_index + 2] = marriage_df[0].values[arg_max]
if 'Alamat' in tmp_data:
for tmp_index in range(len(tmp_data)):
if "!" in tmp_data[tmp_index]:
tmp_data[tmp_index] = tmp_data[tmp_index].replace("!", "I")
if "1" in tmp_data[tmp_index]:
tmp_data[tmp_index] = tmp_data[tmp_index].replace("1", "I")
if "i" in tmp_data[tmp_index]:
tmp_data[tmp_index] = tmp_data[tmp_index].replace("i", "I")
for tmp_data in last_result_list:
print(' '.join(tmp_data))
if __name__ == '__main__':
main()