In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import sys
import os

# Make modules under ../app (relative to the notebook's working directory)
# importable. The membership check keeps the cell idempotent: re-running it
# no longer appends a duplicate entry to sys.path each time.
app_dir = os.path.abspath(os.path.join(os.getcwd(), '../app'))
if app_dir not in sys.path:
    sys.path.append(app_dir)

In [3]:
#!pip install -r ../requirements.txt

# データセットを読み込む

In [4]:
# Read only the three columns used below, forcing each of them to str.
dataset = pd.read_csv(
    '../data/dataset/items.csv',
    usecols=['アイテム名', '画像URL', 'カラー'],
    dtype={'アイテム名': str, '画像URL': str, 'カラー': str},
)
# Keep only the rows whose 'カラー' value is present.
dataset = dataset[dataset['カラー'].notnull()]

In [5]:
# Preview the first rows of the filtered dataset as a sanity check.
dataset.head()

Unnamed: 0,アイテム名,画像URL,カラー
0,パールペンダントネックレス・全2色・d69488,https://www.dzimg.com/Dahong/202107/1172021_19...,シルバー
1,ハーフネックパフTシャツ・全5色・e71078,https://www.dzimg.com/Dahong/202111/1257842_19...,グリーン
2,ストライプ配色スウェット・全2色・b71679,https://www.dzimg.com/Dahong/202201/1309245_20...,ネイビー
3,ストライプ配色スウェット・全2色・b71679,https://www.dzimg.com/Dahong/202201/1309245_20...,グレイ
4,ストライプ配色スウェット・全2色・b71679,https://www.dzimg.com/Dahong/202201/1309245_20...,複数色


# 特徴量データの作成

In [6]:
from __future__ import annotations

import abc
import subprocess
from typing import Any

from hocho.tokenizer.impl import MeCabTokenizer


class classproperty:
    """Descriptor that behaves like ``@classmethod`` stacked with ``@property``.

    The wrapped function receives the owning class and is evaluated on every
    attribute access, whether the attribute is read from the class itself or
    from one of its instances.
    """

    def __init__(self, f):
        # Wrap once so that binding to the owner class is delegated to classmethod.
        self.f = classmethod(f)

    def __get__(self, *args):
        # Bind the classmethod using the (instance, owner) arguments supplied by
        # the descriptor protocol, then call it immediately to yield the value.
        bound_getter = self.f.__get__(*args)
        return bound_getter()


class Feature(abc.ABC):
    """Abstract converter from one raw dataset value to a feature value.

    Concrete features implement :meth:`of`. The class-level properties below
    build ready-configured feature instances, e.g. ``Feature.ItemImage``.
    """

    @abc.abstractmethod
    def of(self, value: Any) -> Any:
        """Convert a single raw ``value`` into its feature representation."""
        pass

    @classproperty
    def ItemImage(cls) -> ItemImage:
        """Return a new ``ItemImage`` feature configured with height=300, width=300."""
        # Imported at call time rather than at module import
        # (presumably to avoid a circular import - TODO confirm).
        from domain.model.feature.impl import ItemImage

        return ItemImage(300, 300)

    @classproperty
    def ItemName(cls) -> ItemName:
        """Return a new ``ItemName`` feature backed by a MeCab tokenizer.

        The tokenizer is pointed at the ``mecab-ipadic-neologd`` directory
        inside the dictionary directory reported by ``mecab-config --dicdir``.

        Raises:
            FileNotFoundError: if ``mecab-config`` is not installed.
            subprocess.CalledProcessError: if ``mecab-config`` exits non-zero.
        """
        from domain.model.feature.impl import ItemName

        # Run without a shell and fail loudly: the previous getoutput() call
        # returned the shell's error text on failure, which then silently became
        # part of a bogus dictionary path.
        dicdir = subprocess.run(
            ["mecab-config", "--dicdir"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
        return ItemName(MeCabTokenizer(f"{dicdir}/mecab-ipadic-neologd"))



from hocho.cleaning import clean_text
from hocho.normalization import normalize
from hocho.tokenizer import Tokenizer


class ItemName(Feature):
    """Text feature that turns an item name into a space-separated token string."""

    def __init__(self, tokenizer: Tokenizer):
        # Tokenizer whose ``wakati`` method splits text into tokens.
        self.__tokenizer = tokenizer

    def of(self, text: str) -> str:
        """Normalize and clean ``text``, tokenize it, and join the tokens with spaces."""
        cleaned = clean_text(normalize(text))
        tokens = self.__tokenizer.wakati(cleaned)
        return ' '.join(tokens)



from dataclasses import dataclass
from typing import Union

import cv2
import numpy as np
import skimage

from domain.model.feature import Feature


@dataclass(init=False, unsafe_hash=True, frozen=True)
class ItemImage(Feature):
    """Image feature: loads an image, resizes it, and divides pixel values by 255.

    Instances are frozen; equality and hashing are derived from
    ``(height, width)`` by the dataclass machinery.
    """
    height: int  # target number of pixel rows
    width: int   # target number of pixel columns

    def __init__(self, height: int, width: int):
        # The dataclass is frozen, so plain attribute assignment would raise;
        # go through the parent's __setattr__ to initialise the fields.
        super().__setattr__('height', height)
        super().__setattr__('width', width)

    def of(self, value: Union[str, np.ndarray]) -> np.ndarray:
        """Return ``value`` resized to ``(height, width, 3)`` and divided by 255.

        Args:
            value: Either an already-loaded image array, or a string that is
                handed to ``skimage.io.imread`` to load the image.

        Returns:
            Array of shape ``(height, width, 3)``.

        Raises:
            ValueError: if the resized image does not have exactly three
                channels (e.g. grayscale or RGBA input).
        """
        if isinstance(value, str):
            value = skimage.io.imread(value)
        # cv2.resize expects dsize as (width, height), i.e. (columns, rows).
        # Passing (height, width) only worked for square targets and made every
        # non-square target fail the shape check below.
        value = cv2.resize(value, dsize=(self.width, self.height))
        expected_shape = (self.height, self.width, 3)
        if value.shape != expected_shape:
            raise ValueError(
                f"expected image of shape {expected_shape}, got {value.shape}"
            )
        return value / 255.

In [31]:
from tqdm import tqdm


class FeaturesFactory:
    """Builds a row-by-column feature matrix by applying one feature per column.

    Extraction is best effort: a row for which any feature raises is skipped,
    and the returned positional index tells the caller which rows survived.
    """

    def __init__(self, feature_dict: dict[str, Feature]):
        """
        Args:
            feature_dict: Maps a dataset column name to the object whose ``of``
                method converts that column's raw values. The dict's key order
                defines the column order of the output matrix.
        """
        self.__columns = list(feature_dict.keys())
        self.__feature_dict = feature_dict

    def make(self, dataset: pd.DataFrame) -> tuple[np.ndarray, list[int]]:
        """Extract features for every row of ``dataset``.

        Args:
            dataset: Frame containing at least the configured columns.

        Returns:
            ``(features, index)`` where ``features`` is an object array of shape
            ``(n_kept_rows, n_columns)`` and ``index`` lists the 0-based row
            positions (within ``dataset``) of the rows that were kept.
        """
        import logging

        logger = logging.getLogger(__name__)
        rows = dataset[self.__columns].values
        index: list[int] = []
        features: list[list[Any]] = []
        # Passing total lets tqdm show a real progress bar instead of a bare counter.
        for i, arr in tqdm(enumerate(rows), total=len(rows)):
            try:
                features.append([self.__feature_of(j, value) for j, value in enumerate(arr)])
            except Exception as exc:
                # Still best effort, but no longer silent: record why the row was dropped.
                logger.warning("Skipping row %d: %s: %s", i, type(exc).__name__, exc)
                continue
            index.append(i)

        # Fill a pre-shaped object array cell by cell. np.array(features, dtype=object)
        # would let numpy descend into same-shaped array values (changing the rank)
        # and would return shape (0,) when every row was skipped.
        matrix = np.empty((len(features), len(self.__columns)), dtype=object)
        for r, row in enumerate(features):
            for c, value in enumerate(row):
                matrix[r, c] = value
        return matrix, index

    def __feature_of(self, j: int, value: Any) -> Any:
        """Apply the feature registered for the ``j``-th configured column to ``value``."""
        column_name = self.__columns[j]
        return self.__feature_dict[column_name].of(value)

In [32]:
# One feature per dataset column; the key order fixes the column order of X.
features_factory = FeaturesFactory({
    'アイテム名': Feature.ItemName,
    '画像URL': Feature.ItemImage,
})

# Build features for the first 100 rows only.
X, index = features_factory.make(dataset.iloc[:100])

100it [00:21,  4.63it/s]


In [60]:
from sklearn.feature_extraction.text import CountVectorizer


text_vectorizer = CountVectorizer()

# Column 0 of X holds the tokenized item names, column 1 the image arrays.
# fit_transform learns the vocabulary and vectorizes in a single pass; the
# separate fit() call that used to precede it fitted the same data twice.
# NOTE: this rebinds X from the object matrix to [images, token counts], so the
# cell cannot be re-run without first re-running the cell that builds X.
X = [np.array(list(X[:, 1])), text_vectorizer.fit_transform(X[:, 0]).toarray()]

In [62]:
from domain.model.color import Color


# Look up the colour id for each row that survived feature extraction
# (`index` holds those rows' positions).
Y = np.array(list(map(
    lambda ja_name: Color.value_of_ja_name(ja_name).id(),
    dataset.iloc[index]['カラー'],
)))

In [65]:
# Inspect the first element of X: the array built from the image column (column 1) of the feature matrix.
X[0]

array([[[[0.84705882, 0.83529412, 0.81568627],
         [0.84705882, 0.83529412, 0.81568627],
         [0.84313725, 0.83137255, 0.80392157],
         ...,
         [0.8627451 , 0.84313725, 0.82745098],
         [0.8627451 , 0.85098039, 0.83137255],
         [0.86666667, 0.85490196, 0.83529412]],

        [[0.84313725, 0.83137255, 0.81176471],
         [0.84705882, 0.83529412, 0.81568627],
         [0.84313725, 0.83137255, 0.80392157],
         ...,
         [0.8627451 , 0.84313725, 0.82745098],
         [0.86666667, 0.85490196, 0.83529412],
         [0.8745098 , 0.8627451 , 0.84313725]],

        [[0.83529412, 0.83529412, 0.80392157],
         [0.83529412, 0.83529412, 0.80392157],
         [0.83529412, 0.83529412, 0.80392157],
         ...,
         [0.86666667, 0.84705882, 0.83529412],
         [0.87058824, 0.85882353, 0.83921569],
         [0.8745098 , 0.8627451 , 0.84313725]],

        ...,

        [[1.        , 1.        , 1.        ],
         [1.        , 1.        , 1.        ]

# 予測モデルの学習

In [117]:
from domain.model.color import Color
from domain.model.estimator.color import ColorEstimator

# Number of classes = number of members yielded by iterating over Color.
n_class = len([e for e in Color])
# NOTE(review): to_categorical, X_train, X_test, Y_train and Y_test are not
# defined in the cells visible here - confirm an earlier cell provides them,
# otherwise a fresh-kernel "Run All" fails at this point.
# NOTE(review): Y_train / Y_test are overwritten by their encoded versions
# (presumably one-hot - confirm), so re-running this cell would encode them twice.
Y_train = to_categorical(Y_train.reshape(-1, 1), n_class)
Y_test = to_categorical(Y_test.reshape(-1, 1), n_class)


# The text layer is sized by the vocabulary learned by text_vectorizer.
# The image layer's 300, 300, 3 presumably mirrors the 300x300 ItemImage
# feature with 3 channels - confirm.
estimator = ColorEstimator(
    ColorEstimator.TextLayer(len(text_vectorizer.get_feature_names_out())),
    ColorEstimator.ImageLayer(300, 300, 3),
    n_class
)
# NOTE(review): fit receives (train, test) pairs for inputs and labels; the exact
# contract is defined by ColorEstimator - confirm.
estimator.fit((X_train, X_test), (Y_train, Y_test))

KeyboardInterrupt: 

In [19]:
import pickle


# Serialize the estimator object to estimator.pkl in the working directory.
with open('estimator.pkl', mode='wb') as pkl:
    pickle.dump(estimator, file=pkl)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......concatenate
.........vars
......conv2d
.........vars
............0
............1
......conv2d_1
.........vars
............0
............1
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dense_4
.........vars
............0
............1
......flatten
.........vars
......flatten_1
.........vars
......input_layer
.........vars
......input_layer_1
.........vars
......max_pooling2d
.........vars
......max_pooling2d_1
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........19
.........2

In [20]:
# Restore the estimator from the pickle file written above.
# pickle can execute arbitrary code while loading, so only load files you created yourself.
with open('estimator.pkl', mode='rb') as pkl:
    estimator = pickle.load(file=pkl)

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2022-12-30 15:45:36         6151
metadata.json                                  2022-12-30 15:45:36           64
variables.h5                                   2022-12-30 15:45:36    131316688
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......concatenate
.........vars
......conv2d
.........vars
............0
............1
......conv2d_1
.........vars
............0
............1
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dense_4
.........vars
............0
............1
......flatten
.........vars
......flatten_1
.........vars
......input_layer
.........vars
......input_layer_1
.........vars
......max_pooling2d
.........