In [52]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv("/content/drive/MyDrive/kream_data/product_data.csv", encoding='cp949')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    6323 non-null   object 
 1   img_path      6323 non-null   object 
 2   brand         6323 non-null   object 
 3   name          6323 non-null   object 
 4   color1        6323 non-null   object 
 5   color2        6323 non-null   object 
 6   price_og      6323 non-null   object 
 7   price_resell  6323 non-null   int64  
 8   n_scrap       5507 non-null   float64
dtypes: float64(1), int64(1), object(7)
memory usage: 444.7+ KB


In [9]:
print(df['color1'].unique())
print("color1 의 고유색 개수:", df["color1"].unique().shape[0])

print(df['color2'].unique())
print("color2 의 고유색 개수:", df["color2"].unique().shape[0])

['WHITE' 'FLAX' 'LIGHTT IRON ORE' ... 'DALMATIAN' 'CHECK' 'INDIGO']
color1 의 고유색 개수: 1126
['BLACK' 'WHITE' 'GUM' ... '275500' '342000' '380000']
color2 의 고유색 개수: 2386


In [10]:
# 데이터를 살펴보다가 color가 2개가 아니라 1개인 경우를 크롤링할때 한칸씩 땡겨서 데이터가 저장됨.
# 그래서 n_scarp이 nan인 경우가 생김
# 아래는 n_scrap이 nan경우를 찾아서 컬럼값을 한칸씩 오른쪽으로 밀어주는 것이다.

nan_index = df[df.loc[:, "n_scrap"].isna()].index
print(df[df.loc[:, "n_scrap"].isna()].index)
df[df.loc[:, "n_scrap"].isna()]

Int64Index([  85,  115,  199,  205,  237,  254,  429,  455,  477,  498,
            ...
            6312, 6313, 6314, 6315, 6317, 6318, 6319, 6320, 6321, 6322],
           dtype='int64', length=816)


Unnamed: 0,product_id,img_path,brand,name,color1,color2,price_og,price_resell,n_scrap
85,78686,crawling\product_crawling\image\78686.jpg,Nike,Nike x Comme des Garcons Air Max 97 Black,BLACK,430800,245800,632,
115,78687,crawling\product_crawling\image\78687.jpg,Nike,Nike x Comme des Garcons Air Max 97 Glacier Grey,GLACIER GREY,430800,260000,681,
199,23464,crawling\product_crawling\image\23464.jpg,Nike,Nike x Comme des Garcons Homme Plus Air Max 95...,BLACK,419000,268800,547,
205,23463,crawling\product_crawling\image\23463.jpg,Nike,Nike x Comme des Garcons Homme Plus Air Max 95...,WHITE,419000,264000,504,
237,83680,crawling\product_crawling\image\83680.jpg,Nike,Nike x Supreme Air Max 98 TL Black,BLACK,206800,278600,796,
...,...,...,...,...,...,...,...,...,...
6318,26441,crawling\product_crawling\image\26441.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro OG Sole Over Dyed Lowcu...,ORANGE,324000,486000,85,
6319,26410,crawling\product_crawling\image\26410.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro x Nigel Cabourn Low Cut...,INDIGO,355300,503600,245,
6320,26224,crawling\product_crawling\image\26224.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Hank OG Sole Over Dyed ...,BLACK,334400,450000,79,
6321,25717,crawling\product_crawling\image\25717.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Peterson OG Sole Trick ...,WHITE,313500,457500,295,


In [11]:
nan_color2 = df.iloc[nan_index, 5]
nan_price_og = df.iloc[nan_index, 6]
nan_price_resell = df.iloc[nan_index, 7]
nan_n_scrap = df.iloc[nan_index, 8]

In [12]:
df.iloc[nan_index, 5] = np.nan
df.iloc[nan_index, 6] = nan_color2
df.iloc[nan_index, 7] = nan_price_og
df.iloc[nan_index, 8] = nan_price_resell

In [13]:
# price_og를 int형으로 변환키려고 함.
# 그렇게 하기 위해서는 price_og가 숫자여야하는데 아닌것들이 껴있어서 분류해줘야한다.

not_nan_price_og_index = df[df["price_og"] != "-"].index

In [14]:
df = df.iloc[not_nan_price_og_index]

In [15]:
price_og_error_list = []

for index,row in df.iterrows():
    try:
        int(row['price_og'])
    except:
        price_og_error_list.append(df[df['product_id'] ==row['product_id']].index[0])
price_og_error_list

[2794, 3011, 3889, 5027, 5289, 5349, 5769, 5790, 5794, 5804, 5823, 5824, 5836]

In [16]:
df = df.drop(price_og_error_list)

In [17]:
df['price_resell'] = df['price_resell'].astype(int)
df['price_og'] = df['price_og'].astype(int)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6103 entries, 0 to 6322
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    6103 non-null   object 
 1   img_path      6103 non-null   object 
 2   brand         6103 non-null   object 
 3   name          6103 non-null   object 
 4   color1        6103 non-null   object 
 5   color2        5380 non-null   object 
 6   price_og      6103 non-null   int64  
 7   price_resell  6103 non-null   int64  
 8   n_scrap       6103 non-null   float64
dtypes: float64(1), int64(2), object(6)
memory usage: 476.8+ KB


In [19]:
# 리셀이 얼마나 붙었는지 알 수 있도록 컬럼을 추가해준다.
df['+price'] = df['price_resell'] - df['price_og']

In [20]:
null_df = df[df['color2'].isnull()]
null_df

Unnamed: 0,product_id,img_path,brand,name,color1,color2,price_og,price_resell,n_scrap,+price
85,78686,crawling\product_crawling\image\78686.jpg,Nike,Nike x Comme des Garcons Air Max 97 Black,BLACK,,430800,245800,632.0,-185000
115,78687,crawling\product_crawling\image\78687.jpg,Nike,Nike x Comme des Garcons Air Max 97 Glacier Grey,GLACIER GREY,,430800,260000,681.0,-170800
199,23464,crawling\product_crawling\image\23464.jpg,Nike,Nike x Comme des Garcons Homme Plus Air Max 95...,BLACK,,419000,268800,547.0,-150200
205,23463,crawling\product_crawling\image\23463.jpg,Nike,Nike x Comme des Garcons Homme Plus Air Max 95...,WHITE,,419000,264000,504.0,-155000
237,83680,crawling\product_crawling\image\83680.jpg,Nike,Nike x Supreme Air Max 98 TL Black,BLACK,,206800,278600,796.0,71800
...,...,...,...,...,...,...,...,...,...,...
6318,26441,crawling\product_crawling\image\26441.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro OG Sole Over Dyed Lowcu...,ORANGE,,324000,486000,85.0,162000
6319,26410,crawling\product_crawling\image\26410.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro x Nigel Cabourn Low Cut...,INDIGO,,355300,503600,245.0,148300
6320,26224,crawling\product_crawling\image\26224.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Hank OG Sole Over Dyed ...,BLACK,,334400,450000,79.0,115600
6321,25717,crawling\product_crawling\image\25717.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Peterson OG Sole Trick ...,WHITE,,313500,457500,295.0,144000


In [21]:
# 미하라 야시히로의 color2 == nan 인 경우에는 white로 설정해주면 될듯하다. 
null_df[null_df['brand']=='Mihara Yasuhiro'].head(3)

Unnamed: 0,product_id,img_path,brand,name,color1,color2,price_og,price_resell,n_scrap,+price
6050,46178,crawling\product_crawling\image\46178.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Blakey OG Sole Canvas L...,BLACK,,313500,547400,7385.0,233900
6051,25708,crawling\product_crawling\image\25708.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Peterson OG Sole Canvas...,WHITE,,292600,459200,2992.0,166600
6052,46177,crawling\product_crawling\image\46177.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Blakey OG Sole Canvas L...,WHITE,,313500,428400,2659.0,114900


In [22]:
df.loc[null_df[null_df['brand']=='Mihara Yasuhiro'].index, ['color2']] = 'white'

In [23]:
# 미하라 마시히로를 제외한 브랜드의 color2는 그냥 nan으로 남겨놔도 될듯하다.

In [24]:
df.loc[df['color2'].isnull(), 'color2'] = 'Nan'

In [25]:
# 이제 타입도 적절하고 null 값도 없다.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6103 entries, 0 to 6322
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    6103 non-null   object 
 1   img_path      6103 non-null   object 
 2   brand         6103 non-null   object 
 3   name          6103 non-null   object 
 4   color1        6103 non-null   object 
 5   color2        6103 non-null   object 
 6   price_og      6103 non-null   int64  
 7   price_resell  6103 non-null   int64  
 8   n_scrap       6103 non-null   float64
 9   +price        6103 non-null   int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 653.5+ KB


In [29]:
df = df.set_index('product_id')

RNN 학습을 위해서 하는 단어임베딩이다.

In [53]:
bn_df = df[['brand', 'name']]
bn_df['input_data'] = "brand:" + bn_df['brand'] +"/"+"name:" + bn_df['name']
bn_df.head(5)

Unnamed: 0_level_0,brand,name,input_data
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28029,Nike,Nike Dunk Low Retro Black,brand:Nike/name:Nike Dunk Low Retro Black
12831,Nike,Nike Air Force 1 '07 Low White,brand:Nike/name:Nike Air Force 1 '07 Low White
21935,Nike,Nike Air Force 1 '07 WB Flax,brand:Nike/name:Nike Air Force 1 '07 WB Flax
44653,Nike,Nike x Supreme Air Force 1 Low Flax,brand:Nike/name:Nike x Supreme Air Force 1 Low...
89548,Nike,Nike Zoom Vomero 5 PRM Light Iron Ore and Flat...,brand:Nike/name:Nike Zoom Vomero 5 PRM Light I...


In [54]:
color_df = df[['color1', 'color2']]
color_df['input_data'] = "color1:" + color_df['color1'] +"/"+"color2:" + color_df['color2']
color_df.head(5)

Unnamed: 0_level_0,color1,color2,input_data
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28029,WHITE,BLACK,color1:WHITE/color2:BLACK
12831,WHITE,WHITE,color1:WHITE/color2:WHITE
21935,FLAX,GUM,color1:FLAX/color2:GUM
44653,FLAX,FLAX-GUM LIGHT BROWN,color1:FLAX/color2:FLAX-GUM LIGHT BROWN
89548,LIGHTT IRON ORE,METALLIC SILVER,color1:LIGHTT IRON ORE/color2:METALLIC SILVER


어떤 인코딩을 통해서 데이터프레임을 tokenize 해야 할까? 라는 생각이 들어서 chat-gpt에게 물어봤다.

The efficiency of sentence encoding for an RNN model depends on several factors, including the size of the vocabulary, the length of the sentences, and the complexity of the task. Some commonly used sentence encodings for RNNs include:

One-hot encoding: This encoding method represents each word as a one-hot vector, where each dimension corresponds to a unique word in the vocabulary. This method is simple and efficient, but can become computationally expensive for large vocabularies.

Word Embeddings: This encoding method represents words as dense, continuous-valued vectors, which can capture semantic and syntactic relationships between words. Word embeddings are typically learned from the training data and are more compact than one-hot encodings.

Sequence padding: This encoding method pads the sentences to the same length, allowing for batch processing of variable-length sequences. This method is widely used in NLP tasks, especially with RNNs and Transformers.

In general, the most efficient encoding method will depend on the specific task and available computational resources. It may be necessary to experiment with different encoding methods to determine the most effective approach for a given problem.

단어 임베딩을 사용하면 될것 같다. (요새 chat-gpt가 재밌긴 한데 대답이 사람 같지는 않다. 약간 모든 부분에서 중립인 사람같달까..? 유용하긴 한데 노잼이다).

In [111]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [114]:
# 브랜드/이름 단어임베딩
BN_sentences = bn_df['input_data'].values

BN_tokenizer = Tokenizer(num_words = 2000, oov_token="<OOV>")
BN_tokenizer.fit_on_texts(BN_sentences)
bn_word_index = BN_tokenizer.word_index

tokenized_bn = BN_tokenizer.texts_to_sequences(BN_sentences)
padded_bn = pad_sequences(tokenized_bn)

bn_df['tokenized_input'] = list(padded_bn)

print(bn_word_index)
bn_df['tokenized_input']

{'<OOV>': 1, 'brand': 2, 'name': 3, 'nike': 4, 'jordan': 5, 'new': 6, 'balance': 7, 'adidas': 8, 'x': 9, 'black': 10, 'white': 11, '1': 12, 'asics': 13, 'low': 14, 'retro': 15, 'air': 16, 'grey': 17, 'mihara': 18, 'yasuhiro': 19, 'made': 20, 'og': 21, 'gel': 22, 'in': 23, 'blue': 24, 'maison': 25, 'high': 26, 'red': 27, 'green': 28, 'usa': 29, 'force': 30, 'sneakers': 31, 'dunk': 32, 'max': 33, 'mid': 34, 'yeezy': 35, 'sole': 36, 'top': 37, 'navy': 38, '2': 39, 'se': 40, 'and': 41, "'07": 42, '5': 43, 'sp': 44, 'orange': 45, 'boost': 46, 'margiela': 47, 'canvas': 48, 'core': 49, 'sb': 50, 'uk': 51, 'silver': 52, '327': 53, 'light': 54, 'brown': 55, 'pack': 56, 'zoom': 57, '4': 58, '3': 59, 'v2': 60, 'dark': 61, 'superstar': 62, 'forum': 63, 'cloud': 64, 'triple': 65, 'gold': 66, 'yellow': 67, '574': 68, 'lyte': 69, 'pro': 70, 'off': 71, '2002r': 72, '6': 73, 'ultraboost': 74, 'kayano': 75, 'the': 76, '350': 77, 'converse': 78, 'sail': 79, 'purple': 80, 'wide': 81, 'of': 82, 'royal': 83

product_id
28029    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, ...
12831    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, ...
21935    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, ...
44653    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, 9, ...
89548    [0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, 57, 585, 43,...
                               ...                        
26441    [0, 0, 0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19...
26410    [0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 9, 562,...
26224    [0, 0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 2...
25717    [0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 103,...
25714    [0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 103,...
Name: tokenized_input, Length: 6103, dtype: object

In [118]:
# 색 단어 임베딩
color_sentences = color_df['input_data'].values

color_tokenizer = Tokenizer(num_words = 700, oov_token="<OOV>")
color_tokenizer.fit_on_texts(color_sentences)
color_word_index = color_tokenizer.word_index

tokenized_color = color_tokenizer.texts_to_sequences(color_sentences)
padded_color = pad_sequences(tokenized_color)

color_df['tokenized_input'] = list(padded_color)

print(color_word_index)
color_df['tokenized_input']

{'<OOV>': 1, 'color1': 2, 'color2': 3, 'white': 4, 'black': 5, 'grey': 6, 'red': 7, 'blue': 8, 'nan': 9, 'core': 10, 'green': 11, 'sail': 12, 'silver': 13, 'light': 14, 'metallic': 15, 'cloud': 16, 'navy': 17, 'orange': 18, 'university': 19, 'gold': 20, 'dark': 21, 'brown': 22, 'varsity': 23, 'yellow': 24, 'footwear': 25, 'multi': 26, 'royal': 27, 'purple': 28, 'gym': 29, 'off': 30, 'color': 31, 'cream': 32, 'pink': 33, 'pure': 34, 'midnight': 35, 'team': 36, 'beige': 37, 'summit': 38, 'hyper': 39, 'smoke': 40, 'wolf': 41, 'bright': 42, 'bone': 43, 'platinum': 44, 'crimson': 45, 'olive': 46, 'fire': 47, 'cool': 48, 'obsidian': 49, 'tan': 50, 'gum': 51, 'volt': 52, 'sea': 53, 'medium': 54, 'anthracite': 55, 'collegiate': 56, 'salt': 57, 'teal': 58, 'cement': 59, 'burgundy': 60, 'tint': 61, 'fog': 62, 'infrared': 63, 'ivory': 64, 'stone': 65, 'gray': 66, 'clear': 67, 'khaki': 68, 'neutral': 69, 'sand': 70, 'true': 71, 'court': 72, 'deep': 73, 'desert': 74, 'slate': 75, 'indigo': 76, '23'

product_id
28029          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 5]
12831          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4]
21935        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 82, 3, 51]
44653     [0, 0, 0, 0, 0, 0, 0, 2, 82, 3, 82, 51, 14, 22]
89548    [0, 0, 0, 0, 0, 0, 0, 2, 1, 116, 188, 3, 15, 13]
                               ...                       
26441         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 18, 3, 4]
26410         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 76, 3, 4]
26224          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 3, 4]
25717          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4]
25714          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4]
Name: tokenized_input, Length: 6103, dtype: object

In [119]:
df['tokenized_color'] = color_df['tokenized_input']
df['tokenized_bn'] = bn_df['tokenized_input']

df

Unnamed: 0_level_0,img_path,brand,name,color1,color2,price_og,price_resell,n_scrap,+price,tokenized_color,tokenized_bn
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
28029,crawling\product_crawling\image\28029.jpg,Nike,Nike Dunk Low Retro Black,WHITE,BLACK,129000,152800,123000.0,23800,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 5]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, ..."
12831,crawling\product_crawling\image\12831.jpg,Nike,Nike Air Force 1 '07 Low White,WHITE,WHITE,139000,136200,151000.0,-2800,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, ..."
21935,crawling\product_crawling\image\21935.jpg,Nike,Nike Air Force 1 '07 WB Flax,FLAX,GUM,169000,173200,69000.0,4200,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 82, 3, 51]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, ..."
44653,crawling\product_crawling\image\44653.jpg,Nike,Nike x Supreme Air Force 1 Low Flax,FLAX,FLAX-GUM LIGHT BROWN,184600,281400,27000.0,96800,"[0, 0, 0, 0, 0, 0, 0, 2, 82, 3, 82, 51, 14, 22]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, 9, ..."
89548,crawling\product_crawling\image\89548.jpg,Nike,Nike Zoom Vomero 5 PRM Light Iron Ore and Flat...,LIGHTT IRON ORE,METALLIC SILVER,209000,258000,5810.0,49000,"[0, 0, 0, 0, 0, 0, 0, 2, 1, 116, 188, 3, 15, 13]","[0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4, 57, 585, 43,..."
...,...,...,...,...,...,...,...,...,...,...,...
26441,crawling\product_crawling\image\26441.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro OG Sole Over Dyed Lowcu...,ORANGE,white,324000,486000,85.0,162000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 18, 3, 4]","[0, 0, 0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19..."
26410,crawling\product_crawling\image\26410.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro x Nigel Cabourn Low Cut...,INDIGO,white,355300,503600,245.0,148300,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 76, 3, 4]","[0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 9, 562,..."
26224,crawling\product_crawling\image\26224.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Hank OG Sole Over Dyed ...,BLACK,white,334400,450000,79.0,115600,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 3, 4]","[0, 0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 2..."
25717,crawling\product_crawling\image\25717.jpg,Mihara Yasuhiro,Maison Mihara Yasuhiro Peterson OG Sole Trick ...,WHITE,white,313500,457500,295.0,144000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 3, 4]","[0, 0, 0, 0, 0, 2, 18, 19, 3, 25, 18, 19, 103,..."


# 데이터 저장

In [120]:
df.to_csv('/content/drive/MyDrive/kream_data/product_data_dpp.csv')

내 생각에는 brand, name, color1, color2 를 rnn 모델에 학습시키면 될듯하다. (원래 색데이터들을 간단하게 표현해서 one-hot encoding을 진행하려 했는데 그게 어려울것 같아서 RNN을 사용하는 것이다).

brand/name RNN model(B/N_RNN), color1/color2 RNN model(Color_RNN) 이렇게 두개를 만들것이다. 마지막 layer 는 dense layer로 설정해 회귀를 진행할 것이다.

brand/name RNN model input: "brand:OOO/name:OOO", target: price_resell<br>
color1/color2 RNN model input: "color1:OOO/color2:OOO", target: +price
<br>이렇게 input과 target을 설정하면 될듯하다.

timestep 설정은 <br>
B/N_RNN: 6 <br>
Color_RNN: 7

정도 하면 될 것 같다. 


내가 이미지 크롤링을 할때 일부러 사진데이터를 흑백으로 받았는데 굳이??? 같기도 하다. (학습이 너무 오래 걸릴까봐 두려웠나보다).
<br>일단 RNN 모델로 색을 학습하고 썩 만족스럽지 않은 결과가 나온다면 다시 color_image 크롤링을 진행하면 될듯하다.

또한 흑백이미지 학습을 위한 image_CNN 을 만들것이다. <br>
target은 price_og 또는 price_resell 을 둘다 실험해보고 더 나은 모델을 사용할 것이다.

참고로 이미지의 크기는 256x256 이다. 


B/N_RNN, Color_RNN, image_CNN 들의 회귀값들을 받아서 또 Dense layer를 통해서 회귀를 진행하는 final 모델을 만들 계획이다. 

하위 모델의 output을 정규화해서 final 모델의 인풋값으로 넣을 것이다. <br>
역전파 과정에서 하위모델의 가중치도 수정되게 할지(이거 되는지는 모르겠다. final_model의 loss를 적당히 분배해서 학습하면 되지 않을까?)<br>
이미 적절하게 학습된 하위 모델을 만들고 나서 final 모델로 집어 넣을지는 final 모델 설계를 진행 하면서 생각해보자

final_model 은 3-2-1 순으로 Dense layer 를 쌓으면 될듯 하다.