In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os

# Training hyper-parameters used by the dataset, model and fit cells of this notebook.
num_epochs = 1000       # number of epochs handed to fit()
batch_size = 32         # samples per batch in the tf.data pipeline
learning_rate = 1e-2    # learning rate given to the Adam optimizer

In [None]:
def _decode_and_resize(filename, label):
    """Load one JPEG file and turn it into a fixed-size RGB tensor scaled by 1/255.

    Args:
        filename: path of the JPEG file; passed to ``tf.io.read_file``.
        label: label associated with the image; returned unchanged.

    Returns:
        A ``(image, label)`` pair where ``image`` has been resized to 224x224
        and divided by 255.0.
    """
    image_string = tf.io.read_file(filename)            # read the raw file bytes
    # channels=3 forces an RGB result. Without it the channel count follows the
    # file (1 for grayscale JPEGs), which breaks batching and the 3-channel
    # ResNet50 input used later in this notebook.
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    # NOTE(review): the ImageNet ResNet50 weights are normally paired with
    # tf.keras.applications.resnet50.preprocess_input rather than a plain /255
    # scaling -- confirm which preprocessing is intended before changing it.
    image_resized = tf.image.resize(image_decoded, [224, 224]) / 255.0
    return image_resized, label

# Training manifest: the `path` column holds image file names and the
# `page_num` column holds the labels.
data_text = pd.read_csv('train_data.csv')
# Build the training dataset
train_filenames = tf.constant(data_text.path.tolist())
train_labels = list(data_text.page_num.values)

train_dataset = tf.data.Dataset.from_tensor_slices((train_filenames, train_labels))
# Shuffle the (filename, label) pairs BEFORE decoding. The buffer then holds
# short strings instead of decoded 224x224 float images, and a buffer as large
# as the dataset gives a uniform shuffle on every epoch. max(..., 1) keeps the
# buffer size valid for an empty manifest.
train_dataset = train_dataset.shuffle(buffer_size=max(len(data_text), 1))
train_dataset = train_dataset.map(
    map_func=_decode_and_resize,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.batch(batch_size)
# Overlap input preparation with training.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Transfer-learning classifier: an ImageNet-initialised ResNet50 without its
# top (global average pooling output) followed by a 48-unit softmax layer.
resnet50_fine_tune = tf.keras.models.Sequential([
    tf.keras.applications.ResNet50(weights='imagenet', include_top=False, pooling='avg'),
    tf.keras.layers.Dense(48, activation='softmax'),
])

# Freeze the ResNet50 layer so that only the new Dense layer is trained.
resnet50_fine_tune.layers[0].trainable = False

# Earlier compile variant, kept for reference:
# resnet50_fine_tune.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=['accuracy'])
resnet50_fine_tune.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=[tf.keras.metrics.sparse_categorical_accuracy],
)

resnet50_fine_tune.summary()

In [None]:
# Train on the batched tf.data pipeline for num_epochs epochs.
resnet50_fine_tune.fit(x=train_dataset, epochs=num_epochs)

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchvision import models
from torchvision.models.resnet import Bottleneck, BasicBlock, ResNet
import torch.utils.model_zoo as model_zoo

import numpy as np
import cv2
import pandas as pd

from model.models import *

In [None]:
# Image feature extractor. ResidualNet is not defined in this notebook;
# presumably it comes from the `from model.models import *` star-import -- confirm.
image_net = ResidualNet()
# Put the network in evaluation mode before it is used for feature extraction
# (assumes ResidualNet follows the torch.nn.Module API -- TODO confirm).
image_net.eval()
# Move the network to the GPU; this cell therefore needs a CUDA-capable environment.
image_net = image_net.cuda()

In [None]:
# Text encoder wrapper; later cells read its `.tokenizer` and `.model` attributes.
# RoBertaChinese is not defined in this notebook (presumably from model.models -- confirm).
text_net = RoBertaChinese()

In [None]:
# Attach an image vector and a title vector to every best-seller row.
with torch.no_grad():  # the block runs without gradient tracking
    best_seller = pd.read_csv('best_seller.csv')
    image_vectors, text_vectors = [], []
    # Walk the id/title columns in lockstep; each item's picture is read from
    # data/best_seller/<id>.jpg.
    for good_id, good_title in zip(best_seller['id'], best_seller['title']):
        image_space_vector = get_image_space_vector(
            'data/best_seller/' + good_id + '.jpg', image_net, 'avg')
        text_space_vector = get_text_space_vector(
            good_title, text_net.tokenizer, text_net.model).tolist()
        image_vectors.append(image_space_vector)
        text_vectors.append(text_space_vector)
    # Store the per-row vectors as two new columns.
    best_seller['image_vectors'] = pd.Series(image_vectors)
    best_seller['text_vectors'] = pd.Series(text_vectors)

In [None]:
# Attach an image vector and a title vector to every source-goods row.
with torch.no_grad():  # the block runs without gradient tracking
    source_goods = pd.read_csv('source_goods.csv')
    image_vectors, text_vectors = [], []
    # Walk the id/title columns in lockstep; each item's picture is read from
    # data/source_goods/<id>.jpg.
    for good_id, good_title in zip(source_goods['id'], source_goods['title']):
        image_space_vector = get_image_space_vector(
            'data/source_goods/' + good_id + '.jpg', image_net, 'avg')
        text_space_vector = get_text_space_vector(
            good_title, text_net.tokenizer, text_net.model).tolist()
        image_vectors.append(image_space_vector)
        text_vectors.append(text_space_vector)
    # Store the per-row vectors as two new columns.
    source_goods['image_vectors'] = pd.Series(image_vectors)
    source_goods['text_vectors'] = pd.Series(text_vectors)

In [None]:

    

# Score every source good against the full set of best-seller vectors, once
# for the image vectors and once for the text vectors. get_good_mean_cosine is
# a project helper (by its name presumably a mean cosine similarity -- confirm).
#
# Each score column is built in a single assignment. Pre-filling the column
# with the integer 0 and then writing floats cell by cell through .loc relies
# on a silent int -> float upcast that pandas has deprecated.
list_best_seller_vectors = best_seller.image_vectors.values
source_goods['image_mean_cosine'] = [
    get_good_mean_cosine(vector, list_best_seller_vectors)
    for vector in source_goods['image_vectors']
]

list_best_seller_vectors = best_seller.text_vectors.values
source_goods['text_mean_cosine'] = [
    get_good_mean_cosine(vector, list_best_seller_vectors)
    for vector in source_goods['text_vectors']
]

In [None]:
# Rank the source goods by image score, highest first, then renumber the rows
# 0..n-1 so that the row label equals the rank.
temp = source_goods.sort_values(by='image_mean_cosine', ascending=False)
temp = temp.reset_index(drop=True)
temp

In [None]:
# Keep the leading rows of the ranking. The original label slice `.loc[:20, :]`
# is inclusive, i.e. rows 0..20 = 21 rows; with the 0..n-1 index of `temp`
# that is the same as the first 21 positions.
# NOTE(review): if a "top 20" was intended, this is one row too many -- confirm.
temp1 = temp.iloc[:21]
temp1

In [None]:
import os

from PIL import Image

# Copy the picture of every good in temp1 from data/source_goods/ to
# data/test/, converting it to RGB on the way.
os.makedirs('data/test', exist_ok=True)  # save() fails if the target folder is missing
for good_id in temp1['id']:
    # The context manager closes the source file handle after each image
    # instead of leaving one open handle per iteration.
    with Image.open('data/source_goods/' + good_id + '.jpg') as pic:
        pic.convert('RGB').save('data/test/' + good_id + '.jpg')

In [1]:
import re
import time
import pandas as pd
import configparser
import argparse
from spider.spider import *
from db.db import *

    
import os
import warnings

# Connection and profile settings. Each value can be overridden through an
# environment variable; the literals are the previous hard-coded values, kept
# as defaults so existing setups keep working.
# NOTE(review): the MySQL default still embeds a password in the notebook --
# move it to the environment / a secret store and drop the literal.
firefox_profile = os.environ.get("CRAWL_FIREFOX_PROFILE", "/home/ml/.mozilla/firefox/0k5sh6ex.default")
crawl_config = os.environ.get("CRAWL_CONFIG", "config/crawl.cfg")
# Colon-separated connection strings, split into lists for the controllers below
# (field meaning is defined by RedisCtrl / MysqlCtrl in db.db -- not visible here).
redis_config = os.environ.get("CRAWL_REDIS_CONFIG", '172.17.0.2:6379:1')
mysql_config = os.environ.get("CRAWL_MYSQL_CONFIG", '172.17.0.3:root:123456:hot')

cf = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config was not loaded.
if not cf.read(crawl_config):
    warnings.warn("crawl config %r could not be read; 'cf' is empty" % crawl_config)

redis_ctrl = RedisCtrl(redis_config.split(':'))
mysql_ctrl = MysqlCtrl(mysql_config.split(':'))

# # Are there any key words in redis yet?
# key_words = '连衣裙'

# # Second, crawl the best-seller platforms for the corresponding key word
# best_seller_data = pd.DataFrame()
# spider_sing_page = SinglePageSpider(firefox_profile)
# for platform, url in cf.items('best_seller_search_url'):
#     url = url.replace(re.findall('&q=(.*?)&',url)[0],key_words)
#     cfg = cf.items(platform + '_parse_links')
#     cfg.pop(0)
#     cfg.pop(0)
#     best_seller_data = best_seller_data.append(spider_sing_page.run(url, dict(cfg))).reset_index(drop=True)
# spider_sing_page.close()

# # Third, crawl the source-goods platforms for the corresponding key word
# source_goods = pd.DataFrame()
# spider_multi_pages = MultiPageSpider(firefox_profile)
# for platform, url in cf.items('source_goods_url'):
#     url = url.replace(re.findall('\?so=(.*?)&',url)[0],key_words)
#     cfg = cf.items(platform + '_parse_links')
#     source_goods = source_goods.append(spider_multi_pages.run(url, dict(cfg))).reset_index(drop=True)
# spider_multi_pages.close()

In [2]:
# Load 'source_goods' through the MySQL controller (presumably a table name -- confirm
# against db.db); as the last expression of the cell, the result is rendered as its output.
mysql_ctrl.load('source_goods')

Unnamed: 0,id,title,price,shop,images_url_attribute,product_url_attribute,page_num,crawl_time
0,009494d8-9473-496f-9fc6-33ce6fcbaa64,实拍大码女2020早新胖妹显瘦连衣裙内搭打底小香风针织开衫两件套,50.00,宜蔓大码,https://img.alicdn.com/bao/uploaded/i4/6744267...,https://gz.17zwd.com/item.htm?GID=116409585&sp...,2,2020-10-18 05:25:07
1,009f4c6d-223e-404e-b1c0-8e17fd1b9f01,实拍现货大码微胖遮肚裙子显瘦秋季新款女装长袖胖mm中长款连衣裙,50.00,汇美大码女装,https://img01-gms.17zwd.com/imgextra/61445306/...,https://gz.17zwd.com/item.htm?GID=117196679&sp...,3,2020-10-18 05:25:07
2,015236d7-36f0-406e-9b1e-f9745d6904ab,印花香云纱真丝连衣裙2020春夏季新款气质修身拼接桑蚕丝旗袍裙子,65.00,艾瑞衣阁,https://img.alicdn.com/bao/uploaded/i2/2532585...,https://gz.17zwd.com/item.htm?GID=114577444&sp...,2,2020-10-18 05:25:07
3,02357cae-1468-441c-9a13-e4daef970e4f,实拍2775#裙子秋女新款连衣裙碎花雪纺超仙收腰沙滩大摆超长裙,63.00,女人花旗舰店,https://img.alicdn.com/bao/uploaded/i3/8369987...,https://gz.17zwd.com/item.htm?GID=115886504&sp...,0,2020-10-18 05:25:07
4,02b19dd7-f545-460b-954b-d5e10529081b,实拍改良版旗袍连衣裙2020秋装新款高开叉中式刺绣民族风唐装日常,73.00,欧丽姿实拍服饰,https://img.alicdn.com/bao/uploaded/i4/2048402...,https://gz.17zwd.com/item.htm?GID=116964113&sp...,1,2020-10-18 05:25:07
...,...,...,...,...,...,...,...,...
395,ff67a71e-1c52-4612-8f6f-bb9e38ee96fc,实拍！秋冬新款银狐绒加绒加厚连衣裙孕妇宽松休闲荷叶边连帽裙女,52.00,孕味坊,https://static-new.17zwd.com/assets/source/sta...,https://gz.17zwd.com/item.htm?GID=117195741&sp...,2,2020-10-18 05:25:07
396,ff70325e-b164-4586-a2f8-c91fd3b2b6bb,实拍现货 针织打底衫+水貂毛背心裙连衣裙两件套装女2020秋冬季新,79.00,乐颜,https://img.alicdn.com/bao/uploaded/i1/1771533...,https://gz.17zwd.com/item.htm?GID=117196908&sp...,3,2020-10-18 05:25:07
397,ffab4597-e728-4f79-93bc-ad410eb542f5,v领雪纺碎花连衣裙女夏2020新款za法式复古显瘦显高气质高腰长裙,68.00,宜蔓服饰,https://img.alicdn.com/bao/uploaded/i2/2256175...,https://gz.17zwd.com/item.htm?GID=115461043&sp...,1,2020-10-18 05:25:07
398,ffc1a633-2e64-443d-9434-ba423e17b090,实拍现货 针织连衣裙女秋冬季2020年新款毛衣裙超仙内搭打底裙,68.00,乐颜,https://img.alicdn.com/bao/uploaded/i2/1771533...,https://gz.17zwd.com/item.htm?GID=117194786&sp...,3,2020-10-18 05:25:07
