## Machine Learning Programming Workshop Assessment

- **Dataset:** Mobile dataset, derived from Shopee National Data Science Challenge 2019
- **Goal:** Build a neural network in Keras that classifies product titles into their corresponding phone brands

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [53]:
import sklearn as sk
import tensorflow as tf
import tensorflow.keras as K

<br>

<h3>Load Data</h3>

In [54]:
# Load the raw product table; index_col=0 makes the first CSV column the row index.
data = pd.read_csv('./files/shopee_mobile_data.csv', index_col=0)

<h4>Load Label Names from JSON File</h4>

In [55]:
import json

# Read the attribute -> {label name: id} mapping shipped with the dataset.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('./files/mobile_profile_train.json', encoding='utf-8') as f:
    mobile_profiles = json.load(f)

In [56]:
# Brand names ordered by their numeric id, so a brand id can be used as a list position.
# NOTE(review): this only lines up if the ids run 0..N-1 without gaps — confirm against the JSON.
class_names = sorted(mobile_profiles['Brand'], key=mobile_profiles['Brand'].get)

In [57]:
# One output class per known brand; show the count and the names on separate lines.
num_classes = len(class_names)
print(num_classes, class_names, sep='\n')

56
['google', 'htc', 'apple', 'wiko', 'polytron', 'gionee', 'leagoo', 'brandcode', 'luna', 'acer', 'sharp', 'blackview', 'prince', 'lg', 'spc', 'coolpad', 'smartfren', 'infinix', 'blaupunkt', 'lava', 'aldo', 'huawei', 'advan', 'leeco', 'nexcom', 'zyrex', 'axioo', 'elephone', 'himax', 'hp', 'nokia', 'nuu mobile', 'icherry', 'xiaomi', 'pixcom', 'mito', 'huang mi', 'maxtron', 'sony', 'indosat', 'philips', 'lenovo', 'alcatel', 'samsung', 'zyo', 'doogee', 'vivo', 'evercoss', 'strawberry', 'ifone', 'fujitsu', 'blackberry', 'asus', 'oneplus', 'honor', 'oppo']


In [58]:
# Keep only the text feature and the target, then drop rows missing either one.
data = (
    data
    .loc[:, ['title', 'Brand']]
    .dropna()
)

In [59]:
# Quick look at the first rows of the title / Brand table.
data.head()

Unnamed: 0_level_0,title,Brand
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1
2346660,apple iphone 4s back glass spare part original...,2.0
2816338,iphone 4s 64gb white,2.0
2847602,samsung sm b310e piton dual sim,43.0
3116949,samsung caramel gt e1272 dual sim 32 mb putih,43.0
3794648,garskin sony experia z z1 z2 ultra,38.0


In [60]:
# Features are the raw title strings; targets are the numeric brand ids.
X, Y = data['title'], data['Brand']

<br>

<h3>Process Text</h3>

In [61]:
# NOTE(review): num_words only limits Tokenizer methods such as texts_to_sequences.
# The later cells read tokenizer.word_index directly, so the 1000-word cap is presumably
# never applied — the encoded sample printed further down contains ids well above 1000.
# Confirm whether a 1000-word vocabulary was actually intended.
tokenizer = K.preprocessing.text.Tokenizer(num_words=1000)

In [62]:
# Fit the tokenizer's vocabulary on every title. This runs before the train/validation
# split, so the validation titles contribute to the vocabulary too.
tokenizer.fit_on_texts(X)

In [63]:
# Shift every tokenizer id up by 2 so the lowest ids stay free for the
# special tokens registered in the next cell.
word_index = {
    word: rank + 2
    for word, rank in tokenizer.word_index.items()
}

In [64]:
# Register the special tokens on the three lowest ids.
word_index.update({
    "<PAD>": 0,    # filler that brings every sequence to the same length
    "<START>": 1,  # marks the beginning of a sequence
    "UNK": 2,      # stands in for words that are not in the vocabulary
})

In [65]:
# Encode each title as the <START> id (1) followed by one id per whitespace-separated word.
# Titles are lower-cased before the lookup so training data is encoded the same way
# `predictor` encodes new text; without this, any mixed-case title would silently map to UNK.
# Words missing from word_index fall back to the UNK id (2).
int_data = data['title'].apply(
    lambda x: [1] + [word_index.get(xi, 2) for xi in x.lower().split()]
)

In [66]:
# Pad with 0 (the <PAD> id) at the end, or cut, so that every row holds exactly 30 ids.
# NOTE(review): `truncating` is left at its Keras default ('pre'), so a title longer than
# 30 ids would lose its leading ids (including <START>) rather than its tail — confirm
# this is intended.
padded_data = K.preprocessing.sequence.pad_sequences(int_data, value=0, padding='post', maxlen=30)

In [67]:
# Show the encoded matrix (abbreviated by NumPy): one row of ids per title.
print(padded_data)

[[  1  56   6 ...   0   0   0]
 [  1   6 243 ...   0   0   0]
 [  1   3 203 ...   0   0   0]
 ...
 [  1  22 494 ...   0   0   0]
 [  1  72  43 ...   0   0   0]
 [  1  11 197 ...   0   0   0]]


In [68]:
# Inspect one fully encoded title: the <START> id, the word ids, then zero padding.
print(padded_data[0])

[   1   56    6  243  251  127 4531 2438   16  904  139    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


<br>

<h3>Split Data</h3>

In [73]:
# (number of titles, ids per title)
padded_data.shape

(155038, 30)

In [74]:
# Hold out the first 20% of rows for validation and train on the remainder.
# NOTE(review): the rows are not shuffled, so the split follows the file order.
split_ratio = 0.2
split_idx = int(split_ratio * len(padded_data))

# padded_data is a NumPy array, so plain slicing is positional. Y is a pandas Series
# indexed by integer item ids, so .iloc is used to make the positional intent explicit
# (the same way val_text is sliced later in the notebook).
X_train = padded_data[split_idx:]
Y_train = Y.iloc[split_idx:]

X_val = padded_data[:split_idx]
Y_val = Y.iloc[:split_idx]

# Features and labels must stay row-aligned on both sides of the split.
assert len(X_train) == len(Y_train) and len(X_val) == len(Y_val)

<br>

<h3>Build Model</h3>

In [40]:
# Embedding -> GRU (final state only) -> two small dense layers -> softmax over the brands.
gru_model = K.Sequential()
gru_model.add(K.layers.Embedding(input_dim=len(word_index), output_dim=8))
gru_model.add(K.layers.GRU(units=4, return_sequences=False))
gru_model.add(K.layers.Dense(units=32, activation='relu'))
gru_model.add(K.layers.Dense(units=16, activation='relu'))
gru_model.add(K.layers.Dense(units=num_classes, activation='softmax'))

In [41]:
# The targets are class ids rather than one-hot vectors, hence the sparse cross-entropy loss.
gru_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

<br>

<h3>Train Model</h3>

In [42]:
# Train on the training portion only; the held-out rows are scored in the evaluation section.
gru_model.fit(x=X_train, y=Y_train, batch_size=64, epochs=5)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2ad9a02d8c8>

<br>

<h3>Evaluate Model</h3>

In [43]:
# Score the held-out rows; the result is [loss, accuracy], matching the compile() settings.
gru_model.evaluate(x=X_val, y=Y_val)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3


[0.08483380079269409, 0.9821653366088867]

In [44]:
# Show the brand list; a brand's position in it is the id used to decode predictions below.
class_names

['google',
 'htc',
 'apple',
 'wiko',
 'polytron',
 'gionee',
 'leagoo',
 'brandcode',
 'luna',
 'acer',
 'sharp',
 'blackview',
 'prince',
 'lg',
 'spc',
 'coolpad',
 'smartfren',
 'infinix',
 'blaupunkt',
 'lava',
 'aldo',
 'huawei',
 'advan',
 'leeco',
 'nexcom',
 'zyrex',
 'axioo',
 'elephone',
 'himax',
 'hp',
 'nokia',
 'nuu mobile',
 'icherry',
 'xiaomi',
 'pixcom',
 'mito',
 'huang mi',
 'maxtron',
 'sony',
 'indosat',
 'philips',
 'lenovo',
 'alcatel',
 'samsung',
 'zyo',
 'doogee',
 'vivo',
 'evercoss',
 'strawberry',
 'ifone',
 'fujitsu',
 'blackberry',
 'asus',
 'oneplus',
 'honor',
 'oppo']

In [45]:
# Per-class scores for every validation row, then the best-scoring class index for each row.
preds = gru_model.predict(X_val)
class_preds = preds.argmax(axis=1)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3


In [46]:
# Raw titles of the validation rows (the first split_idx rows), kept for readable output.
val_text = data['title'].head(split_idx)

In [47]:
# Print the first 20 validation titles next to their true and predicted brand names.
for row in range(20):
    print(val_text.iloc[row])
    true_brand = class_names[int(Y_val.iloc[row])]
    predicted_brand = class_names[class_preds[row]]
    print('True Value: {} | Predicted: {}'.format(true_brand, predicted_brand))
    print()

apple iphone 4s back glass spare part original replacement putih
True Value: apple | Predicted: apple

iphone 4s 64gb white
True Value: apple | Predicted: apple

samsung sm b310e piton dual sim
True Value: samsung | Predicted: samsung

samsung caramel gt e1272 dual sim 32 mb putih
True Value: samsung | Predicted: samsung

garskin sony experia z z1 z2 ultra
True Value: sony | Predicted: sony

lcd xiaomi redmi 4+touchscreen
True Value: xiaomi | Predicted: xiaomi

samsung caramel gt e1272 dual sim 32mb black
True Value: samsung | Predicted: samsung

iphone 4g 8gb
True Value: apple | Predicted: apple

blackberry torch 1 9800 gsm garansi distributor 2 tahun white
True Value: blackberry | Predicted: blackberry

samsung keystone 3 sm b109e
True Value: samsung | Predicted: samsung

samsung galaxy j5 j 500g 8 gb hitam
True Value: samsung | Predicted: samsung

samsung galaxy j1 mini sm j105 8gb white
True Value: samsung | Predicted: samsung

iphone 5 white 16gb fullset mulus
True Value: apple | 

<br>

<h3>Predictor Function</h3>

In [48]:
def predictor(text):
    """Return the brand name the trained model predicts for one product title.

    The title is lower-cased, split on whitespace and encoded with ``word_index``
    (id 1 in front as the start marker, id 2 for unknown words), padded to 30 ids
    with zeros at the end, and passed to ``gru_model``. The highest-scoring class
    is looked up in ``class_names``.

    Args:
        text: Product title as a string.

    Returns:
        The entry of ``class_names`` with the highest predicted score.
    """
    encoded = [1]
    encoded.extend(word_index.get(token, 2) for token in text.lower().split())
    # The model expects a batch, so wrap the single sequence in a list before padding.
    batch = K.preprocessing.sequence.pad_sequences([encoded], value=0, padding='post', maxlen=30)
    scores = gru_model.predict(batch)
    # With a single row in the batch, the flattened argmax is the class index.
    return class_names[np.argmax(scores)]

In [49]:
# List the brands that predictor() can return.
print(class_names)

['google', 'htc', 'apple', 'wiko', 'polytron', 'gionee', 'leagoo', 'brandcode', 'luna', 'acer', 'sharp', 'blackview', 'prince', 'lg', 'spc', 'coolpad', 'smartfren', 'infinix', 'blaupunkt', 'lava', 'aldo', 'huawei', 'advan', 'leeco', 'nexcom', 'zyrex', 'axioo', 'elephone', 'himax', 'hp', 'nokia', 'nuu mobile', 'icherry', 'xiaomi', 'pixcom', 'mito', 'huang mi', 'maxtron', 'sony', 'indosat', 'philips', 'lenovo', 'alcatel', 'samsung', 'zyo', 'doogee', 'vivo', 'evercoss', 'strawberry', 'ifone', 'fujitsu', 'blackberry', 'asus', 'oneplus', 'honor', 'oppo']


In [51]:
# Try the predictor on a title that is not a phone (a milk frother). The softmax output
# always selects one of the known brands, so a brand name comes back regardless.
text = "Philips EP2220/10 LatteGo 3000 Series Classic Milk Frother Black"
predictor(text)

'philips'