In [32]:
import numpy as np
import pandas as pd

In [33]:
# Load the SIM price table from a CSV file in the current working directory.
df = pd.read_csv("SIMPrice.csv")

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sim_number,price_vnd
count,200000.0,200000.0
mean,730479700.0,13950270.0
std,240647900.0,591299700.0
min,325009800.0,99000.0
25%,392260500.0,500000.0
50%,833428700.0,1000000.0
75%,918087400.0,5000000.0
max,997979700.0,168000000000.0


In [35]:
# How often each distinct price occurs among rows priced strictly below 10,000,000.
df.loc[df["price_vnd"] < 10000000, "price_vnd"].value_counts()

450000     45119
1000000    38472
500000     30619
3000000    29199
5000000    21383
399000       138
299000        91
99000         58
250000        47
119000        32
350000        22
199000        18
290000         9
390000         3
280000         3
400000         2
220000         1
Name: price_vnd, dtype: int64

In [36]:
# How often each distinct price occurs among rows priced strictly above 10,000,000.
# The comparison is strict, so rows priced exactly 10,000,000 are not listed here.
df.loc[df["price_vnd"] > 10000000, "price_vnd"].value_counts()

12000000     5124
11325000     2682
11000000     1760
13000000     1720
15000000     1417
             ... 
130500000       1
12150000        1
80100000        1
424000000       1
439000000       1
Name: price_vnd, Length: 921, dtype: int64

In [37]:
def get_sim_category(x):
    """Map a row's ``price_vnd`` value to a price-tier index in 0..5.

    Args:
        x: Any mapping-like row (e.g. a DataFrame row) exposing ``x["price_vnd"]``.

    Returns:
        0 for prices up to and including 450000; 1, 2, 3, 4 for prices exactly
        equal to 500000, 1000000, 3000000 and 5000000 respectively; 5 for every
        other price.
    """
    price = x["price_vnd"]

    # Everything at or below 450000 shares the lowest tier.
    if price <= 450000:
        return 0

    # The middle tiers match on exact price points only; any price that is not
    # one of these falls through to the catch-all tier 5.
    exact_price_tiers = {
        500000: 1,
        1000000: 2,
        3000000: 3,
        5000000: 4,
    }
    return exact_price_tiers.get(price, 5)
    

In [38]:
# Label every row with its price tier by calling get_sim_category once per row
# (axis=1 passes each row to the function).
df["sim_price_cat"] = df.apply(get_sim_category, axis = 1)

In [39]:
# Class balance: number of rows in each price tier.
df["sim_price_cat"].value_counts()

0    45543
2    38472
5    34784
1    30619
3    29199
4    21383
Name: sim_price_cat, dtype: int64

In [40]:
# Features: each SIM number becomes the sequence of its decimal digits.
# Target: the price tier computed for that row.
# Iterating the column directly avoids DataFrame.iterrows(), which builds a
# Series per row (slow on large frames) and upcasts mixed dtypes within a row.
X = np.array([[int(c) for c in str(number)] for number in df["sim_number"]])
y = np.array(df["sim_price_cat"])

# All numbers must have the same digit count, otherwise X is not a 2-D matrix
# and cannot be fed to the model.
assert X.ndim == 2, "sim_number values do not all have the same number of digits"
assert len(X) == len(y)

In [41]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Integer-encode the tier labels so the classes are contiguous indices.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)

# One-hot encode the class indices into a dense matrix (one column per class),
# the target format expected by a categorical cross-entropy loss.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed `sparse`
# in 1.4; try the current name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# OneHotEncoder expects a 2-D (n_samples, n_features) input.
integer_encoded = integer_encoded.reshape(-1, 1)
y_one_hot = onehot_encoder.fit_transform(integer_encoded)
print(y_one_hot)

[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 ...
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [42]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_one_hot,
    test_size=0.1,
    random_state=42,
)

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam

In [44]:
# Stacked-LSTM classifier over the digit sequence: one timestep per digit and a
# single feature per timestep, ending in a 6-way softmax over the price tiers.
model = Sequential([
    # Three sequence-returning LSTM layers, each followed by dropout.
    LSTM(units=512, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=512, return_sequences=True),
    Dropout(0.2),
    LSTM(units=512, return_sequences=True),
    Dropout(0.2),
    # Final LSTM collapses the sequence to a single vector.
    LSTM(units=128, return_sequences=False),
    Dropout(0.2),
    # Dense head; the first Dense layer has no activation argument (linear).
    Dense(units=512),
    Dense(units=6, activation="softmax"),
])

# Adam with its default settings; the one-hot targets pair with
# categorical cross-entropy.
optimizer = Adam()
model.compile(
    loss="categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=optimizer,
)



In [45]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 9, 512)            1052672   
                                                                 
 dropout_4 (Dropout)         (None, 9, 512)            0         
                                                                 
 lstm_5 (LSTM)               (None, 9, 512)            2099200   
                                                                 
 dropout_5 (Dropout)         (None, 9, 512)            0         
                                                                 
 lstm_6 (LSTM)               (None, 9, 512)            2099200   
                                                                 
 dropout_6 (Dropout)         (None, 9, 512)            0         
                                                                 
 lstm_7 (LSTM)               (None, 128)              

In [47]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Keep only the weights of the epoch with the best validation accuracy.
# NOTE(review): this writes to "model_checkpoint.hdf5", while the later
# load_weights cell reads "ckpt_best.hdf5" - confirm which file is intended.
model_checkpoint_callback = ModelCheckpoint(
    filepath="model_checkpoint.hdf5",
    save_weights_only=True,
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# NOTE(review): the held-out split (X_test, y_test) doubles as validation data,
# so checkpoint selection is based on the same samples used for the spot checks.
history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback],
)


Epoch 1/30
   1/2813 [..............................] - ETA: 19:15 - loss: 1.8852 - accuracy: 0.1875

KeyboardInterrupt: 

In [48]:
# Restore previously saved weights into the model defined above.
# NOTE(review): this path differs from the ModelCheckpoint filepath
# ("model_checkpoint.hdf5") used in the training cell - confirm which
# checkpoint file is intended before relying on a fresh Restart & Run All.
model.load_weights("ckpt_best.hdf5")

In [49]:
import random

# Spot-check the model on five randomly drawn held-out samples.
for i in range(5):
    # Bound the draw by the actual test-set size instead of a hard-coded 20000,
    # so the cell keeps working if the dataset or the split ratio changes.
    idx = np.random.randint(0, len(X_test))
    # predict() expects a batch dimension, so wrap the single sample.
    result = model.predict(np.expand_dims(X_test[idx], axis = 0))
    predicted_index = np.argmax(result)
    print("Số sim", X_test[idx])
    print("Kết quả dự đoán", result)
    print("Index dự đoán", predicted_index)
    print("Giá trị thật: ", y_test[idx])
    # y_test rows are one-hot, so argmax recovers the true class index.
    print("Đúng/sai: ", predicted_index == np.argmax(y_test[idx]))



Số sim [8 6 8 1 9 3 3 5 9]
Kết quả dự đoán [[1.9270249e-03 9.9452823e-01 3.1455269e-03 3.8218169e-04 1.1894244e-05
  5.0635599e-06]]
Index dự đoán 1
Giá trị thật:  [0. 1. 0. 0. 0. 0.]
Đúng/sai:  True
Số sim [8 4 8 8 8 8 7 7 7]
Kết quả dự đoán [[3.8633409e-13 6.3803971e-12 3.3819449e-09 1.5870469e-08 7.6848352e-07
  9.9999928e-01]]
Index dự đoán 5
Giá trị thật:  [0. 0. 0. 0. 0. 1.]
Đúng/sai:  True
Số sim [7 9 9 0 5 0 7 6 7]
Kết quả dự đoán [[0.01214182 0.35184714 0.6324782  0.0014127  0.00148448 0.00063559]]
Index dự đoán 2
Giá trị thật:  [0. 1. 0. 0. 0. 0.]
Đúng/sai:  False
Số sim [9 6 5 7 2 9 1 9 9]
Kết quả dự đoán [[1.1976969e-04 5.9601734e-04 4.3453205e-02 9.5088708e-01 4.9319328e-03
  1.2083708e-05]]
Index dự đoán 3
Giá trị thật:  [0. 0. 0. 1. 0. 0.]
Đúng/sai:  True
Số sim [7 0 5 2 4 4 2 4 8]
Kết quả dự đoán [[7.7919701e-08 5.7874223e-07 9.9999905e-01 4.2063398e-08 4.9720633e-08
  1.5611258e-07]]
Index dự đoán 2
Giá trị thật:  [0. 0. 1. 0. 0. 0.]
Đúng/sai:  True


In [58]:
def return_price_sim(x):
    """Print the human-readable price label for a price-tier index.

    The labels mirror the tiers assigned by ``get_sim_category``: tier 0 covers
    prices up to and including 450.000, tiers 1-4 are the exact price points
    500.000 / 1.000.000 / 3.000.000 / 5.000.000, and any other index is
    reported as the catch-all ">5.000.000" tier.

    Args:
        x: Tier index (e.g. the result of ``np.argmax`` on a prediction).

    Returns:
        None. The label is written to standard output.
    """
    # Tier 0 includes 450.000 itself (the tiering uses `<= 450000`), so the
    # label is "<=" rather than the original, misleading "<".
    tier_labels = (
        "<= 450.000",
        "500.000",
        "1.000.000",
        "3.000.000",
        "5.000.000",
    )
    for tier, label in enumerate(tier_labels):
        if x == tier:
            print(label)
            return
    print(">5.000.000")

In [66]:
# Predict the price tier for one hand-picked number, given as its nine digits
# without the leading 0 (the 0 is only added back when printing).
a = str(877997799)
# A batch containing a single digit sequence.
test = np.array([[int(digit) for digit in a]])

result = model.predict(test)
predicted_index = np.argmax(result)
print("Số sim:" , "0"+a)
print("Kết quả dự đoán", result)
print("Index dự đoán", predicted_index)
return_price_sim(predicted_index)


    

Số sim: 0877997799
Kết quả dự đoán [[2.5287825e-06 1.3565948e-03 5.0797418e-02 1.4570585e-01 5.4647422e-01
  2.5566339e-01]]
Index dự đoán 4
5.000.000
