<a href="https://colab.research.google.com/github/donghui-0126/mini-project/blob/main/shoes-project/resell_regressor/bn_dnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer



warnings.filterwarnings('ignore')

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the
# /content/drive/... paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the product table from the mounted Drive; every per-brand frame below is derived from `df`.
df = pd.read_csv('/content/drive/MyDrive/kream_data/product_data_dpp.csv')

In [None]:
df.shape

(6103, 15)

In [None]:
# Columns kept for every per-brand frame.
brand_columns = ['brand', 'color1', 'color2', 'price_og', 'price_resell']

# One frame per brand. `.loc[rows, cols]` replaces the chained `df[mask][cols]`
# lookup, and `.copy()` makes each frame independent of `df`, because later
# cells add columns (price_class, input, target) to these frames.
df_nike = df.loc[df['brand'] == 'Nike', brand_columns].copy()
df_jordan = df.loc[df['brand'] == 'Jordan', brand_columns].copy()
df_adidas = df.loc[df['brand'] == 'Adidas', brand_columns].copy()

In [None]:
print(df_nike.shape)
df_nike.head()

(1455, 5)


Unnamed: 0,brand,color1,color2,price_og,price_resell
0,Nike,WHITE,BLACK,129000,152800
1,Nike,WHITE,WHITE,139000,136200
2,Nike,FLAX,GUM,169000,173200
3,Nike,FLAX,FLAX-GUM LIGHT BROWN,184600,281400
4,Nike,LIGHTT IRON ORE,METALLIC SILVER,209000,258000


In [None]:
print(df_jordan.shape)
df_jordan.head()

(1376, 5)


Unnamed: 0,brand,color1,color2,price_og,price_resell
1455,Jordan,VARSITY RED,BLACK,209000,480800
1456,Jordan,WHITE,BLACK,139000,163600
1457,Jordan,WHITE,BLACK,249000,254400
1458,Jordan,BLACK,PHANTOM,189000,698000
1459,Jordan,SAIL,UNIVERSITY RED-RIDGEROCK,189000,1417800


In [None]:
df_nike['price_resell'].describe()

count    1.454000e+03
mean     2.509836e+05
std      2.129401e+05
min      4.560000e+04
25%      1.388500e+05
50%      1.900000e+05
75%      2.814000e+05
max      2.531000e+06
Name: price_resell, dtype: float64

In [None]:
df_jordan['price_resell'].describe()

count    1.376000e+03
mean     3.379037e+05
std      5.207507e+05
min      5.000000e+04
25%      1.740500e+05
50%      2.399000e+05
75%      3.340500e+05
max      9.676000e+06
Name: price_resell, dtype: float64

In [None]:
# Count Nike rows per 100,000-wide resell-price bucket (floor division by 100000).
(df_nike['price_resell']//100000).value_counts()

1     668
2     367
3     137
0     111
4      62
5      27
6      25
9      15
8      12
7      11
11      5
10      4
14      2
13      2
22      2
20      1
25      1
17      1
12      1
Name: price_resell, dtype: int64

In [None]:
# Remove Nike rows whose resell price is above 8,000,000.
# NOTE(review): the describe() output recorded above reports a maximum of about
# 2.53e6, so with that data no row matches this threshold - confirm the value.
df_nike = df_nike.drop(index=df_nike.index[df_nike['price_resell'] > 8000000])

In [None]:
# Stratification label for the Nike split: the 100,000-wide bucket number for
# prices below 1,000,000, and a single catch-all label (10) for everything else.
df_nike['price_class'] = np.where(
    df_nike['price_resell'].lt(1000000),
    df_nike['price_resell'].floordiv(100000),
    10,
)

-----

In [None]:
# Count Jordan rows per 100,000-wide resell-price bucket (floor division by 100000).
((df_jordan['price_resell'])//100000).value_counts()

2     468
1     428
3     199
4      78
5      52
0      51
6      27
7      13
8      12
10      6
9       5
19      5
13      4
15      2
24      2
30      2
17      2
14      2
20      2
12      2
33      1
22      1
64      1
41      1
18      1
65      1
32      1
31      1
16      1
79      1
21      1
96      1
11      1
28      1
Name: price_resell, dtype: int64

In [None]:
# Remove Jordan rows whose resell price is above 5,000,000.
# The mask and the index labels must both come from df_jordan. The previous
# version built them from df_nike, so it never removed any Jordan outlier
# (and any matching Nike label would not exist in df_jordan).
df_jordan = df_jordan.drop(df_jordan[df_jordan['price_resell'] > 5000000].index)

In [None]:
# Stratification label for the Jordan split: the 100,000-wide bucket number for
# prices below 1,100,000, and a single catch-all label (13) for everything else.
df_jordan['price_class'] = np.where(
    df_jordan['price_resell'].lt(1100000),
    df_jordan['price_resell'].floordiv(100000),
    13,
)

---------

In [None]:
def make_input_col(df_list, price_scale=10000.0):
    """Add the text feature and the scaled target to each frame, in place.

    Two columns are written on every DataFrame in ``df_list``:

    * ``input``  - ``brand``, ``color1`` and ``color2`` joined with ``" | "``.
    * ``target`` - ``price_resell`` divided by ``price_scale``.

    Args:
        df_list: Iterable of DataFrames containing the columns ``brand``,
            ``color1``, ``color2`` and ``price_resell``. Each frame is mutated.
        price_scale: Divisor applied to ``price_resell`` to build ``target``.
            Defaults to 10000.0, the value that was previously hard-coded.

    Returns:
        None. The frames are modified in place.
    """
    separator = " | "
    # `frame` rather than `df`, so the notebook-level `df` is not shadowed here.
    for frame in df_list:
        frame['input'] = frame['brand'] + separator + frame['color1'] + separator + frame['color2']
        frame['target'] = frame['price_resell'] / price_scale

make_input_col([df_nike, df_jordan, df_adidas])

In [None]:
def tokenize_input(df_list, pd_list=None, return_tokenizers=False):
    """Fit one Keras ``Tokenizer`` per frame and return the padded ``input`` sequences.

    For each DataFrame a fresh ``Tokenizer`` is fitted on its ``input`` column,
    the vocabulary size is printed, and the texts are converted to integer
    sequences and padded with ``pad_sequences`` (default arguments).

    Args:
        df_list: Iterable of DataFrames that each have an ``input`` text column.
        pd_list: Optional and never read. It is kept for backward
            compatibility: when given, frames are paired with it via ``zip``
            exactly as before, so only ``min(len(df_list), len(pd_list))``
            frames are processed. When omitted, every frame is processed.
        return_tokenizers: When True, also return the fitted tokenizers so they
            can be saved or reused to encode new text. Defaults to False,
            which keeps the original return value.

    Returns:
        A list with one padded array per processed frame, or the tuple
        ``(padded_arrays, tokenizers)`` when ``return_tokenizers`` is True.
    """
    # The loop variable used to be named `pd`, which shadowed pandas inside this function.
    if pd_list is None:
        frames = list(df_list)
    else:
        frames = [frame for frame, _ in zip(df_list, pd_list)]

    padded_inputs = []
    tokenizers = []

    for frame in frames:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(frame['input'])
        word_index = tokenizer.word_index

        print(f"word_index의 크기: {len(word_index)}")

        # Convert each sentence to a sequence of token ids, then pad to a common length.
        tokenized_name = tokenizer.texts_to_sequences(frame['input'])
        padded_name = pad_sequences(tokenized_name)

        padded_inputs.append(padded_name)
        tokenizers.append(tokenizer)

    if return_tokenizers:
        return padded_inputs, tokenizers
    return padded_inputs

# Empty placeholder frames: tokenize_input only pairs them with the brand
# frames via zip and does not read their contents.
df_nike_input_, df_jordan_input_, df_adidas_input_ = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

# One padded token-id array per brand, in the same order as the frames passed in.
df_nike_input, df_jordan_input, df_adidas_input = tokenize_input(
    [df_nike, df_jordan, df_adidas],
    [df_nike_input_, df_jordan_input_, df_adidas_input_],
)

word_index의 크기: 515
word_index의 크기: 389
word_index의 크기: 319


In [None]:
# Save the tokenizers.
# The tokenize_input() call above only yields padded arrays, so there is no
# notebook-level `tokenizer` to pickle (the old cell raised NameError on a fresh
# kernel). Fit one Tokenizer per brand on the same `input` text and store them
# together, keyed by brand.
color_tokenizers = {}
for brand_name, brand_df in (('nike', df_nike), ('jordan', df_jordan), ('adidas', df_adidas)):
    brand_tokenizer = Tokenizer()
    brand_tokenizer.fit_on_texts(brand_df['input'])
    color_tokenizers[brand_name] = brand_tokenizer

# Write the Python object to a pickle file.
with open('color_tokenizer.pickle', 'wb') as f:
    pickle.dump(color_tokenizers, f)

# Load the tokenizers back. `tokenizer` is now a dict: brand name -> Tokenizer.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open('color_tokenizer.pickle', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the splits (and therefore the reported metrics) can be reproduced.
SPLIT_SEED = 42


def split_train_valid_test(X, y, stratify=None, seed=SPLIT_SEED):
    """Split features and targets into 70% train, 15% validation and 15% test.

    Args:
        X: Feature array (one row per sample).
        y: Targets aligned with ``X``.
        stratify: Optional labels used to stratify the first (train vs. hold-out)
            split. The hold-out split into validation/test is not stratified.
        seed: Random seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_X, valid_X, test_X, train_y, valid_y, test_y)``.
    """
    train_X, holdout_X, train_y, holdout_y = train_test_split(
        X, y, test_size=0.3, shuffle=True, stratify=stratify, random_state=seed
    )
    valid_X, test_X, valid_y, test_y = train_test_split(
        holdout_X, holdout_y, test_size=0.5, random_state=seed
    )
    return train_X, valid_X, test_X, train_y, valid_y, test_y


(nike_train_X, nike_valid_X, nike_test_X,
 nike_train_y, nike_valid_y, nike_test_y) = split_train_valid_test(
    df_nike_input, df_nike['target'], stratify=df_nike['price_class'])

(jordan_train_X, jordan_valid_X, jordan_test_X,
 jordan_train_y, jordan_valid_y, jordan_test_y) = split_train_valid_test(
    df_jordan_input, df_jordan['target'], stratify=df_jordan['price_class'])

# Adidas has no price_class column, so its split is not stratified.
(adidas_train_X, adidas_valid_X, adidas_test_X,
 adidas_train_y, adidas_valid_y, adidas_test_y) = split_train_valid_test(
    df_adidas_input, df_adidas['target'])

In [None]:
%cd /content/drive/MyDrive/kream_data/model

/content/drive/MyDrive/kream_data/model


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once val_loss has gone 30 epochs without improving.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=30,
)

# Keep only the best-val_loss weights of each brand's model on disk.
mc_nike = ModelCheckpoint(
    'nike_color_best_model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)
mc_jordan = ModelCheckpoint(
    'jordan_color_best_model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Number of words fed to the embedding layer (used as both its input and output dimension).
# NOTE(review): the tokenizer output recorded above reports a Nike vocabulary of
# 515, so 1216 + 1 looks like a leftover value - confirm before shrinking it.
word_size = 1216 + 1

# Fail fast if any token id would fall outside the embedding table.
assert int(df_nike_input.max()) < word_size, "word_size is smaller than the largest Nike token id"

# Take the padded sequence length from the data instead of hard-coding 13.
nike_seq_len = nike_train_X.shape[1]

nike_layers = [
    # Embedding layer
    keras.layers.Embedding(word_size, word_size, input_length=nike_seq_len),
    # Flatten the embedding output into a 1-D array for the regression head.
    keras.layers.Flatten(),
]
# Five Dense + Dropout(0.5) pairs with shrinking width.
# NOTE(review): no activation is passed to any Dense layer - confirm that is intended.
for units in (64, 32, 16, 8, 4):
    nike_layers += [keras.layers.Dense(units), keras.layers.Dropout(0.5)]
nike_layers += [keras.layers.Dense(2), keras.layers.Dense(1)]

nike_bn_model = keras.models.Sequential(nike_layers)

nike_bn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.005),
    loss=tf.keras.losses.Huber(delta=3),
    metrics=["mean_absolute_percentage_error"],
)

nike_bn_model.fit(
    nike_train_X, nike_train_y,
    batch_size=16, epochs=300,
    validation_data=(nike_valid_X, nike_valid_y),
    callbacks=[es, mc_nike],
)

Epoch 1/300
Epoch 1: val_loss improved from inf to 49.50441, saving model to nike_color_best_model.h5
Epoch 2/300
Epoch 2: val_loss improved from 49.50441 to 36.68742, saving model to nike_color_best_model.h5
Epoch 3/300
Epoch 3: val_loss improved from 36.68742 to 34.82493, saving model to nike_color_best_model.h5
Epoch 4/300
Epoch 4: val_loss improved from 34.82493 to 30.80205, saving model to nike_color_best_model.h5
Epoch 5/300
Epoch 5: val_loss did not improve from 30.80205
Epoch 6/300
Epoch 6: val_loss did not improve from 30.80205
Epoch 7/300
Epoch 7: val_loss did not improve from 30.80205
Epoch 8/300
Epoch 8: val_loss did not improve from 30.80205
Epoch 9/300
Epoch 9: val_loss did not improve from 30.80205
Epoch 10/300
Epoch 10: val_loss did not improve from 30.80205
Epoch 11/300
Epoch 11: val_loss did not improve from 30.80205
Epoch 12/300
Epoch 12: val_loss did not improve from 30.80205
Epoch 13/300
Epoch 13: val_loss did not improve from 30.80205
Epoch 14/300
Epoch 14: val_lo

<keras.callbacks.History at 0x7f0abc592080>

In [None]:
# Reload the checkpoint written by mc_nike (best val_loss) and score it on the
# held-out Nike test split. evaluate() returns the loss followed by the compiled metric.
model = keras.models.load_model('nike_color_best_model.h5')

model.evaluate(nike_test_X, nike_test_y)



[28.724876403808594, 38.65422439575195]

In [None]:
# Number of words fed to the embedding layer (used as both its input and output dimension).
# NOTE(review): the tokenizer output recorded above reports a Jordan vocabulary of
# 389, so 967 + 1 looks like a leftover value - confirm before shrinking it.
word_size = 967 + 1

# Fail fast if any token id would fall outside the embedding table.
assert int(df_jordan_input.max()) < word_size, "word_size is smaller than the largest Jordan token id"

# Take the padded sequence length from the data instead of hard-coding 13.
jordan_seq_len = jordan_train_X.shape[1]

jordan_layers = [
    # Embedding layer
    keras.layers.Embedding(word_size, word_size, input_length=jordan_seq_len),
    # Flatten the embedding output into a 1-D array for the regression head.
    keras.layers.Flatten(),
]
# Five Dense + Dropout(0.5) pairs with shrinking width.
# NOTE(review): no activation is passed to any Dense layer - confirm that is intended.
for units in (64, 32, 16, 8, 4):
    jordan_layers += [keras.layers.Dense(units), keras.layers.Dropout(0.5)]
jordan_layers += [keras.layers.Dense(2), keras.layers.Dense(1)]

# Named jordan_bn_model: this cell used to reuse the name nike_bn_model, which
# replaced the trained Nike model with the Jordan one.
jordan_bn_model = keras.models.Sequential(jordan_layers)

jordan_bn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.005),
    loss=tf.keras.losses.Huber(delta=3),
    metrics=["mean_absolute_percentage_error"],
)

jordan_bn_model.fit(
    jordan_train_X, jordan_train_y,
    batch_size=16, epochs=300,
    validation_data=(jordan_valid_X, jordan_valid_y),
    callbacks=[es, mc_jordan],
)

Epoch 1/300
Epoch 1: val_loss improved from inf to 59.66909, saving model to jordan_color_best_model.h5
Epoch 2/300
Epoch 2: val_loss improved from 59.66909 to 50.24213, saving model to jordan_color_best_model.h5
Epoch 3/300
Epoch 3: val_loss did not improve from 50.24213
Epoch 4/300
Epoch 4: val_loss did not improve from 50.24213
Epoch 5/300
Epoch 5: val_loss did not improve from 50.24213
Epoch 6/300
Epoch 6: val_loss did not improve from 50.24213
Epoch 7/300
Epoch 7: val_loss did not improve from 50.24213
Epoch 8/300
Epoch 8: val_loss did not improve from 50.24213
Epoch 9/300
Epoch 9: val_loss did not improve from 50.24213
Epoch 10/300
Epoch 10: val_loss did not improve from 50.24213
Epoch 11/300
Epoch 11: val_loss did not improve from 50.24213
Epoch 12/300
Epoch 12: val_loss did not improve from 50.24213
Epoch 13/300
Epoch 13: val_loss did not improve from 50.24213
Epoch 14/300
Epoch 14: val_loss did not improve from 50.24213
Epoch 15/300
Epoch 15: val_loss did not improve from 50.2

<keras.callbacks.History at 0x7f0aef2ee500>

In [None]:
# Reload the checkpoint written by mc_jordan (best val_loss) and score it on the
# held-out Jordan test split. evaluate() returns the loss followed by the compiled metric.
model = keras.models.load_model('jordan_color_best_model.h5')

model.evaluate(jordan_test_X, jordan_test_y)



[44.321311950683594, 39.72031784057617]