In [None]:
import argparse
from datetime import date
import os
import sys
import cv2
import json
import numpy as np
import time
import tensorflow as tf

from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam, SGD

import matplotlib.pyplot as plt
from model import efficientdet
from utils import preprocess_image, postprocess_boxes
from augmentor.color import VisualEffect
from augmentor.misc import MiscEffect
from losses import smooth_l1, focal, smooth_l1_quad
from efficientnet import BASE_WEIGHTS_PATH, WEIGHTS_HASHES
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

import re
import requests
from urllib import error

# Index of the physical GPU this notebook should run on.
GPU_INDEX = 1

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Fall back to the last available GPU when the machine has fewer than
    # GPU_INDEX + 1 devices instead of failing with an IndexError.
    selected_gpu = gpus[min(GPU_INDEX, len(gpus) - 1)]
    tf.config.experimental.set_visible_devices(selected_gpu, 'GPU')
    # Enable memory growth on the selected device.
    tf.config.experimental.set_memory_growth(device=selected_gpu, enable=True)
else:
    print('No GPU detected; TensorFlow will run on CPU.')

In [None]:
def init_ai(phi=5,
            weighted_bifpn=False,
            model_path='/ai/EfficientDet-master/checkpoints/2020-08-12/pascal_195_0.0246_0.0184.h5',
            classes=('jz', 'qpl'),
            score_threshold=0.9):
    """Build the EfficientDet model and load its trained weights.

    Called without arguments this behaves exactly like the previous
    zero-argument version; the former hard-coded values are now defaults.

    Args:
        phi: Value forwarded to ``efficientdet`` as ``phi``.
        weighted_bifpn: Value forwarded to ``efficientdet`` as ``weighted_bifpn``.
        model_path: Path of the ``.h5`` weights file to load.
        classes: Class names; only their count is used (``num_classes``).
        score_threshold: Value forwarded to ``efficientdet`` as ``score_threshold``.

    Returns:
        The second element of the pair returned by ``efficientdet``, after
        ``load_weights(model_path, by_name=True)`` has been called on it.
    """
    # The first element of the returned pair is not needed here.
    _, model = efficientdet(phi=phi,
                            weighted_bifpn=weighted_bifpn,
                            num_classes=len(classes),
                            score_threshold=score_threshold)
    model.load_weights(model_path, by_name=True)
    return model

# Build the detector once at notebook scope; the driver loop passes this `model` to ai_process.
model = init_ai()

In [None]:
def crawl_url(word, page=300):
    """Collect image URLs for ``word`` from the Baidu image-search "flip" endpoint.

    Args:
        word: Search keyword. It is sent as the ``word`` query parameter and
            URL-encoded by ``requests``, so characters such as ``&`` or ``#``
            cannot corrupt the query string.
        page: Exclusive upper bound on the ``pn`` result offset. Offsets start
            at 0 and advance by 60 per request (presumably the endpoint's page
            size -- TODO confirm), so the default issues at most five requests.

    Returns:
        dict: ``{'keyword': word, 'urls': [...]}`` with the URLs in the order
        they were found. Crawling stops early at the first response that
        contains no ``objURL`` entry; a request that fails is skipped.
    """
    search_url = 'http://image.baidu.com/search/flip'
    url_list = []
    for offset in range(0, page, 60):
        query = {'tn': 'baiduimage', 'ie': 'utf-8', 'word': word, 'pn': offset}
        try:
            response = requests.get(search_url, params=query, timeout=7)
        except requests.RequestException:
            # Best effort: skip this offset on a network error, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue
        # Extract the image URLs with a regular expression first.
        pic_url = re.findall('"objURL":"(.*?)",', response.text, re.S)
        if not pic_url:
            break
        url_list.extend(pic_url)
    return {'keyword': word, 'urls': url_list}

In [None]:
def ai_process(model, key_url_dict, out_path = ''):
    """Download each crawled image, run the detector on it and save every detection as a crop.

    Args:
        model: Object exposing ``predict_on_batch``. It is called with a
            one-image batch and must return ``(boxes, scores, labels)``.
        key_url_dict: Mapping with ``'keyword'`` (names the output folder and
            the files) and ``'urls'`` (iterable of image URLs), i.e. the
            structure returned by ``crawl_url``.
        out_path: Directory under which the per-keyword folder is created.
            The default ``''`` places the folder relative to the working directory.

    Returns:
        None. Crops are written to ``<out_path>/<keyword>/<keyword><n>.jpg``,
        where ``n`` keeps counting across all URLs of the keyword.

    A URL that cannot be downloaded, decoded or processed is reported and
    skipped; KeyboardInterrupt is no longer swallowed.
    """
    # Loop-invariant settings. NOTE(review): phi and the threshold presumably
    # have to match the values the model was built with -- confirm.
    image_sizes = (512, 640, 768, 896, 1024, 1280, 1408)
    phi = 5
    image_size = image_sizes[phi]
    score_threshold = 0.9

    keyword = key_url_dict['keyword']
    sku_path = os.path.join(out_path, keyword)
    crop_index = 0  # running file-name suffix shared by all URLs

    for url in key_url_dict['urls']:
        print(url)
        try:
            response = requests.get(url, timeout=7)

            encoded = np.asarray(bytearray(response.content), dtype="uint8")
            decoded = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
            if decoded is None:
                # imdecode signals an undecodable payload by returning None.
                print('skipped (not a decodable image): ' + url)
                continue

            # Keep an untouched BGR copy for cropping.
            src_image = decoded.copy()

            # BGR -> RGB
            image = decoded[:, :, ::-1]
            h, w = image.shape[:2]
            image, scale = preprocess_image(image, image_size=image_size)

            # run network
            boxes, scores, _ = model.predict_on_batch([np.expand_dims(image, axis=0)])
            boxes, scores = np.squeeze(boxes), np.squeeze(scores)
            boxes = postprocess_boxes(boxes=boxes, scale=scale, height=h, width=w)

            # keep only the detections whose score is above the threshold
            indices = np.where(scores > score_threshold)[0]
            boxes = boxes[indices]

            # Creates missing parent directories too and tolerates an existing folder.
            os.makedirs(sku_path, exist_ok=True)

            for bbox in boxes:
                crop = BoundingBox(x1=int(bbox[0]), y1=int(bbox[1]),
                                   x2=int(bbox[2]), y2=int(bbox[3])).extract_from_image(src_image)
                target = os.path.join(sku_path, '{}{}.jpg'.format(keyword, crop_index))
                if not cv2.imwrite(target, crop):
                    print('could not write ' + target)
                crop_index += 1
        except Exception as exc:
            # Best effort per URL: report the failure and move on to the next one.
            print('skipped %s: %r' % (url, exc))

In [None]:
# Search keywords, one per product; the driver loop passes each entry to
# crawl_url as the query and to ai_process, where it names the output folder.
# NOTE(review): '苏烟(五星红杉树）' closes with a full-width '）' while the other
# entries use an ASCII ')' -- confirm whether that is intentional.
sku = [
    '万宝路(软红)',
    '中华(硬)',
    '中华(软)',
    '中南海(8mg)',
    '云烟(软紫)',
    '长白山(777)',
    '七匹狼(锋芒)',
    '双喜(软国际)',
    '泰山(儒风细支)',
    '泰山(华贵)',
    '泰山(大鸡)',
    '泰山(好好学习)',
    '泰山(好客细支)',
    '泰山(宏图)',
    '泰山(平安)',
    '泰山(心悦)',
    '泰山(新品)',
    '黄鹤楼(硬红)',
    '泰山(白将军)',
    '泰山(硬红八喜)',
    '泰山(红将军)',
    '泰山(青秀)',
    '黄鹤楼(天下名楼)',
    '泰山(颜悦)',
    '玉溪(初心)',
    '黄金叶(金满堂)',
    '玉溪(软)',
    '玉溪(阿诗玛)',
    '黄金叶(爱尚)',
    '黄金叶(喜满堂)',
    '苏烟(五星红杉树）',
    '苏烟(软金砂)',
    '钻石(荷花)',
    '长白山(红)',
    '黄果树(长征)',
    '红旗渠(芒果)',
    '天子(金)',
    '红旗渠(雪茄)',
    '黄金叶(大M)',
    '红双喜(硬)',
    '白沙(硬精品三代)',
    '555(双冰)',
    '中南海(5mg)',
    '利群(软长嘴)',
    '金圣(硬滕王阁)',
    '黄山(硬记忆)',
    '泰山(皇家礼炮)',
    '利群(软红长嘴)',
    '555(配方555·金锐)',
    '牡丹(软)',
    '娇子(宽窄好运)',
    '娇子(蓝)',
    '娇子(时代阳光)',
    '娇子(五粮醇香)',
    '利群(蓝天)',
    '利群(新版)',
    '南京(红)',
    '南京(炫赫门)',
    '云烟(紫)',
    '555(冰炫细支)',
    '黄山(印象一品)',
    '南京(十二钗薄荷)',
    '娇子(格调细支)',
    '南京(十二钗烤烟)',
    '南京(雨花石)',
    '七匹狼(红)',
    '云烟(软珍品)',
    '云烟(细支云龙)',
    '七匹狼(豪情)',
    '真龙(经典红)',
    '黄山(红方印细支)',
    '黄山(记忆)',
    '黄山(新一品)',
    '娇子(X)',
    '红河(小熊猫世纪风)',
    '好猫(长乐)',
    '真龙(凌云)',
    '哈德门(软)',
    '哈德门(纯香)',
    '白沙(精品二代)',
    '白沙(硬)',
    '芙蓉王(硬)',
    '贵烟(跨越)',
    '红塔山(软经典)',
    '红塔山(硬经典100)',
    '黄鹤楼(软蓝)',
    '黄金叶(乐途)',
    '黄金叶(小目标)',
    '娇子(宽窄如意)',
    '七匹狼(白)',
    '长城(醇雅陈皮薄荷)',
    '石狮(软富健)',
]

# Root directory that receives one sub-folder of cropped detections per keyword.
image_path = '/ai/data/debug/'
# Make sure the output root exists before any crop is written, so a missing
# directory fails loudly here instead of producing no output at all.
os.makedirs(image_path, exist_ok=True)

# Crawl and process the keywords one after another; the status lines are
# printed once per keyword.
for keyword in sku:
    print('开始爬取：' + keyword)
    key_url_dict = crawl_url(keyword)
    ai_process(model, key_url_dict, image_path)
    print('爬取结束!')