In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os

# Hyperparameters for the TensorFlow fine-tuning cells below.
learning_rate = 1e-2  # step size handed to the Adam optimizer
batch_size = 32       # number of images per training batch
num_epochs = 1000     # full passes over the training dataset

In [None]:
def _decode_and_resize(filename, label):
    """Load one JPEG from disk and convert it to a fixed-size float image.

    Args:
        filename: Scalar string tensor holding the path of a JPEG file.
        label: Label paired with the image; passed through unchanged.

    Returns:
        A tuple ``(image, label)`` where ``image`` is the decoded picture,
        resized to 224x224 with three channels and divided by 255.0.
    """
    image_string = tf.io.read_file(filename)  # read the raw file bytes
    # Force three output channels. With the default (channels=0) a grayscale
    # JPEG decodes to a single channel, which cannot be batched together with
    # RGB images and does not match the 3-channel input of the ResNet50 backbone.
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image_resized = tf.image.resize(image_decoded, [224, 224]) / 255.0
    return image_resized, label

data_text = pd.read_csv('train_data.csv')
# Build the training dataset from the image paths and their labels.
train_filenames = tf.constant(list(data_text.path.values))
train_labels = list(data_text.page_num.values)

train_dataset = tf.data.Dataset.from_tensor_slices((train_filenames, train_labels))
# Shuffle the lightweight (path, label) pairs BEFORE decoding. The shuffle
# buffer then holds short strings instead of decoded 224x224x3 float images,
# so it can cover the whole dataset and give a uniform shuffle every epoch.
# max(..., 1) keeps buffer_size valid even when the CSV has no rows.
train_dataset = train_dataset.shuffle(buffer_size=max(len(data_text), 1))
train_dataset = train_dataset.map(
    map_func=_decode_and_resize,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.batch(batch_size)
# Prepare upcoming batches in the background while the current one is consumed.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Transfer-learning model: an ImageNet-pretrained ResNet50 without its
# classification head (global average pooling on top), followed by a
# 48-way softmax classifier.
resnet50_fine_tune = tf.keras.models.Sequential([
    tf.keras.applications.ResNet50(weights='imagenet', include_top=False, pooling='avg'),
    tf.keras.layers.Dense(units=48, activation='softmax'),
])

# Freeze the backbone so that only the new Dense layer is trained.
resnet50_fine_tune.layers[0].trainable = False

# Integer class labels -> sparse categorical loss / accuracy.
resnet50_fine_tune.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=[tf.keras.metrics.sparse_categorical_accuracy],
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
)
resnet50_fine_tune.summary()

In [None]:
# Train the classifier head on the batched dataset for the configured number of epochs.
resnet50_fine_tune.fit(x=train_dataset, epochs=num_epochs)

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchvision import models
from torchvision.models.resnet import Bottleneck, BasicBlock, ResNet
import torch.utils.model_zoo as model_zoo

import numpy as np
import cv2
import pandas as pd

from model.model import *

In [2]:
# Build the image encoder. ResidualNet is not defined in this notebook;
# presumably it is provided by the `model.model` star import — TODO confirm.
image_net = ResidualNet()
# NOTE(review): assuming ResidualNet is a torch.nn.Module, eval() puts layers
# such as dropout / batch-norm into inference behaviour.
image_net.eval()
# NOTE(review): again assuming a torch.nn.Module, this moves the model to the
# default CUDA device, so the cell needs a GPU to run.
image_net = image_net.cuda()

In [3]:
# Build the text encoder wrapper; later cells read its `.tokenizer` and
# `.model` attributes when embedding product titles.
text_net = RoBertaChinese()

In [31]:
# Embed every best-seller row: one image vector (from data/best_seller/<id>.jpg)
# and one title vector. Gradients are disabled because this is inference only.
with torch.no_grad():
    best_seller = pd.read_csv('best_seller.csv')
    image_vectors, text_vectors = [], []
    for good_id, title in zip(best_seller['id'], best_seller['title']):
        image_path = 'data/best_seller/' + good_id + '.jpg'
        image_embedding = get_image_space_vector(image_path, image_net, 'avg')
        title_embedding = get_text_space_vector(title, text_net.tokenizer, text_net.model)
        image_vectors.append(image_embedding)
        text_vectors.append(title_embedding.tolist())
    # Store the per-row embeddings as two new columns of the frame.
    best_seller['image_vectors'] = pd.Series(image_vectors)
    best_seller['text_vectors'] = pd.Series(text_vectors)

In [32]:
# Embed every source-goods row: one image vector (from data/source_goods/<id>.jpg)
# and one title vector. Gradients are disabled because this is inference only.
with torch.no_grad():
    source_goods = pd.read_csv('source_goods.csv')
    image_vectors, text_vectors = [], []
    for good_id, title in zip(source_goods['id'], source_goods['title']):
        image_path = 'data/source_goods/' + good_id + '.jpg'
        image_embedding = get_image_space_vector(image_path, image_net, 'avg')
        title_embedding = get_text_space_vector(title, text_net.tokenizer, text_net.model)
        image_vectors.append(image_embedding)
        text_vectors.append(title_embedding.tolist())
    # Store the per-row embeddings as two new columns of the frame.
    source_goods['image_vectors'] = pd.Series(image_vectors)
    source_goods['text_vectors'] = pd.Series(text_vectors)

In [46]:
def get_good_mean_cosine(good_image_vector, list_best_seller_vectors):
    """Average the cosine similarity between one vector and a set of references.

    Args:
        good_image_vector: Embedding of the item being scored.
        list_best_seller_vectors: Sized iterable of reference embeddings.

    Returns:
        The sum of ``get_cosine_similarity(good_image_vector, v)`` over every
        reference ``v``, divided by the number of references.
    """
    similarities = (
        get_cosine_similarity(good_image_vector, reference)
        for reference in list_best_seller_vectors
    )
    return sum(similarities) / len(list_best_seller_vectors)
    

# Score every source good against the best sellers, once per modality
# ('image' and 'text'). Each score column is built in a single assignment
# rather than being initialised with the integer 0 and then filled with floats
# row by row through .loc: that pattern relies on silent int -> float
# upcasting (deprecated in recent pandas) and performs one slow write per row.
for modality in ('image', 'text'):
    # Reference embeddings of all best sellers for this modality.
    list_best_seller_vectors = best_seller[modality + '_vectors'].values
    source_goods[modality + '_mean_cosine'] = [
        get_good_mean_cosine(good_vector, list_best_seller_vectors)
        for good_vector in source_goods[modality + '_vectors']
    ]