In [9]:
import numpy as np
import pandas as pd


csv_file = 'csv/lenskart_products_categorical.csv'

df = pd.read_csv(csv_file,
                 usecols=['product_id','parent_category',
                          'frame_shape','encodings'],
                 dtype={"product_id": np.int32,
                        'parent_category':"category",
                        'frame_shape':"category",
                        'encodings': "object"})

In [10]:
pc_value_count = df["parent_category"].value_counts(sort=False)
fs_value_count = df["frame_shape"].value_counts(sort=False)
print(pc_value_count)
print(fs_value_count)

pc_value_count = pc_value_count.to_numpy()
fs_value_count = fs_value_count.to_numpy()
frequencies = np.concatenate((pc_value_count,fs_value_count))
print(pc_value_count)
print(fs_value_count)
print(frequencies)
sum = np.sum(frequencies)
class_weights = (sum - frequencies)/sum
print(class_weights)

0    1637
1    2183
2    1750
Name: parent_category, dtype: int64
0     986
1     466
2    3395
3     723
Name: frame_shape, dtype: int64
[1637 2183 1750]
[ 986  466 3395  723]
[1637 2183 1750  986  466 3395  723]
[0.85305206 0.8040395  0.84290844 0.91149013 0.95816876 0.69524237
 0.93509874]


In [11]:
class_weights_dict = dict(enumerate(class_weights, 0))
print(class_weights_dict)

{0: 0.8530520646319569, 1: 0.8040394973070017, 2: 0.8429084380610413, 3: 0.9114901256732495, 4: 0.9581687612208258, 5: 0.6952423698384201, 6: 0.9350987432675045}


In [12]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

num_category = 3
num_frame_shape = 4

train_df, val_df = train_test_split(df, test_size=0.25)

train_category_labels = np.array(train_df.pop('parent_category'))
train_category_labels = tf.one_hot(train_category_labels,num_category)

train_frame_labels = np.array(train_df.pop('frame_shape'))
train_frame_labels = tf.one_hot(train_frame_labels,num_frame_shape)

train_labels = np.concatenate((train_category_labels,train_frame_labels),axis=1)

val_category_labels = np.array(val_df.pop('parent_category'))
val_category_labels = tf.one_hot(val_category_labels,num_category)

val_frame_labels = np.array(val_df.pop('frame_shape'))
val_frame_labels = tf.one_hot(val_frame_labels,num_frame_shape)

val_labels = np.concatenate((val_category_labels,val_frame_labels),axis=1)

encoding_train_dataset = []
# encoding_dataset.append(encoding_arr)
for ind in train_df.index:
    encoding_str = train_df["encodings"][ind]
    if(type(train_df["encodings"][ind]) is not str):
        encoding_train_dataset.append(np.ones(128)*9999999)
        print(ind," is not string")
        continue
    encoding_arr = np.fromstring(encoding_str[1:-1], dtype=np.float32, sep=' ')
    #encoding = np.fromstring((df["encodings"][ind])[1:-1], dtype=np.float32, sep=' ')
    encoding_train_dataset.append(encoding_arr)

encoding_val_dataset = []
# encoding_dataset.append(encoding_arr)
for ind in val_df.index:
    encoding_str = val_df["encodings"][ind]
    if(type(val_df["encodings"][ind]) is not str):
        encoding_val_dataset.append(np.ones(128)*9999999)
        print(ind," is not string")
        continue
    encoding_arr = np.fromstring(encoding_str[1:-1], dtype=np.float32, sep=' ')
    #encoding = np.fromstring((df["encodings"][ind])[1:-1], dtype=np.float32, sep=' ')
    encoding_val_dataset.append(encoding_arr)

#train_features = np.array(train_df.pop('encodings'))
#val_features = np.array(val_df.pop('encodings'))

print(train_category_labels.shape,train_frame_labels.shape)
print(val_category_labels.shape,val_frame_labels.shape)
print(train_labels.shape,val_labels.shape)
print(np.array(encoding_train_dataset).shape, np.array(encoding_val_dataset).shape)

3883  is not string
3877  is not string
2631  is not string
3565  is not string
3863  is not string
3879  is not string
3861  is not string
2794  is not string
3873  is not string
3878  is not string
(4177, 3) (4177, 4)
(1393, 3) (1393, 4)
(4177, 7) (1393, 7)
(4177, 128) (1393, 128)


In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(encoding_train_dataset)

train_features = scaler.transform(encoding_train_dataset)
val_features = scaler.transform(encoding_val_dataset)

In [14]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

# metrics = [
#     keras.metrics.BinaryAccuracy(name='accuracy'),
#     keras.metrics.Precision(name='precision'),
#     keras.metrics.Recall(name='recall')
# ]

model = keras.Sequential([
    keras.layers.Dense(16, activation='relu',
          input_shape=(train_features.shape[-1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(train_labels.shape[-1], activation='sigmoid')
])
model.compile(optimizer=keras.optimizers.Adam(),
              loss=tfa.metrics.hamming.hamming_loss_fn(mode='multi-label',threshold=0.5),
              metrics=['accuracy'])

print(model.summary())

TypeError: hamming_loss_fn() missing 2 required positional arguments: 'y_true' and 'y_pred'

In [None]:
EPOCHS = 5000
BATCH_SIZE = 128

# early_stopping = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss',
#     verbose=1,
#     patience=20,
#     mode='min',
#     restore_best_weights=True)

In [None]:
history = model.fit(train_features,
          train_labels,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
                    validation_data=(val_features,val_labels),
                    class_weight=class_weights_dict)

model.save('saved_models_classifier/basic_attempt1')