In [1]:
import re
import pickle
import numpy as np
import pandas as pd

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Locations of the saved artifacts and the raw dataset, relative to this notebook.
TOKENIZER_PATH = '../model/tokenizer.pkl'
MODEL_PATH = '../model/latest'
CSV_PATH = '../dataset/sms-raw.csv'

# load tokenizer
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# TOKENIZER_PATH must only ever point at a file from a trusted source.
with open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = pickle.load(f)

# load model (used by predict_text through model.predict)
model = load_model(MODEL_PATH)

def basic_cleaning(text):
    """Reduce a raw message to its ASCII-alphabetic words longer than two characters.

    Parameters
    ----------
    text : any
        The raw message. Non-string values are converted with ``str`` first.
        ``None`` and float NaN (what pandas yields for an empty CSV cell) are
        treated as an empty message.

    Returns
    -------
    str
        The kept words joined by single spaces, in their original order and
        casing. Digits and punctuation act as word separators, so
        ``"m-dapatkan"`` contributes only ``"dapatkan"``.
    """
    # Without this guard a missing value would be stringified to "None"/"nan",
    # survive the length filter, and reach the tokenizer as a bogus word.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # get all words (ignore numbers)
    words = re.findall(r"[a-zA-Z]+", str(text))

    # keep only words with length > 2
    return ' '.join(word for word in words if len(word) > 2)

def predict_text(text, maxlen=50):
    """Classify a single message with the module-level tokenizer and model.

    Parameters
    ----------
    text : any
        Raw message; it is passed through ``basic_cleaning`` before tokenizing.
    maxlen : int, optional
        Length the token sequence is padded/truncated to. Defaults to 50, the
        value that used to be hard-coded here.
        NOTE(review): presumably this must match the input length the model
        was trained with -- confirm before passing a different value.

    Returns
    -------
    tuple
        ``(class_index, score)``: the argmax position of the model output and
        the output value at that position.
    """
    cleaned_text = basic_cleaning(text)
    tokenized_text = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequences = pad_sequences(tokenized_text, maxlen=maxlen)
    pred = model.predict(padded_sequences)

    # Only one sample is predicted, so the flat argmax over `pred` is the
    # winning column of row 0. Compute it once and reuse it.
    best_index = np.argmax(pred)
    return best_index, pred[0, best_index]

# predict all data
# Read the raw SMS export; for repeated messages keep only the first row
# (drop_duplicates default).
csv_file = pd.read_csv(CSV_PATH).drop_duplicates(subset=['message'])

# Score every message with ONE batched model.predict call instead of one call
# per row. The preprocessing mirrors predict_text step by step:
# basic_cleaning -> tokenizer.texts_to_sequences -> pad_sequences to length 50.
cleaned_messages = csv_file['message'].map(basic_cleaning)
padded_sequences = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_messages.tolist()), maxlen=50
)
scores = model.predict(padded_sequences)

# Per row: index of the highest output and the output value at that index.
best_class = scores.argmax(axis=1)
best_score = scores[np.arange(len(best_class)), best_class]

# Keep the (class_index, score) tuple layout that predict_text returns, so
# code that unpacks the 'pred' column keeps working. The explicit index is
# needed because drop_duplicates leaves gaps in the row labels.
csv_file['pred'] = pd.Series(list(zip(best_class, best_score)), index=csv_file.index)
csv_file

Unnamed: 0,type,message,sender,pred
0,Penipuan,10.10 festival selamat anda m-dapatkan hadiah ...,+6285245595958,"(0, 0.9947349)"
1,Penipuan,surat keputusan dari pt.shopee slamat anda m-d...,+6285283531407,"(0, 0.9984232)"
2,Penipuan,info pemenang slamat!!! no.anda t-pilih m-dapa...,+6285249229917,"(0, 0.99955255)"
3,Penipuan,no.and4 terpilih mndptkn rp.175jt program thun...,+6285245062487,"(0, 0.99729234)"
4,Penipuan,oktober untung !! kartu super bagus dan jackpo...,+6282116488844,"(1, 0.9968161)"
...,...,...,...,...
20824,Lain-lain,"terimakasih, pulsa rp7700/14hr terpakai utk pe...",171862,"(0, 0.42827252)"
20825,Lain-lain,"terima kasih, anda sudah tidak terdaftar di la...",93344,"(0, 0.6678783)"
20826,Lain-lain,paket bola1 3sms/mggu rp.2rb/sms telah berhasi...,868,"(0, 0.43825942)"
20827,Lain-lain,paket dangdut 3sms/minggu rp2200 telah berhasi...,868,"(0, 0.6550628)"


In [2]:
# Split the (class_index, score) tuples stored in 'pred' into two flat
# columns, then discard the tuple column.
csv_file = csv_file.assign(
    type_pred=csv_file['pred'].map(lambda pair: pair[0]),
    proba=csv_file['pred'].map(lambda pair: pair[1]),
).drop(columns=['pred'])
csv_file

Unnamed: 0,type,message,sender,type_pred,proba
0,Penipuan,10.10 festival selamat anda m-dapatkan hadiah ...,+6285245595958,0,0.994735
1,Penipuan,surat keputusan dari pt.shopee slamat anda m-d...,+6285283531407,0,0.998423
2,Penipuan,info pemenang slamat!!! no.anda t-pilih m-dapa...,+6285249229917,0,0.999553
3,Penipuan,no.and4 terpilih mndptkn rp.175jt program thun...,+6285245062487,0,0.997292
4,Penipuan,oktober untung !! kartu super bagus dan jackpo...,+6282116488844,1,0.996816
...,...,...,...,...,...
20824,Lain-lain,"terimakasih, pulsa rp7700/14hr terpakai utk pe...",171862,0,0.428273
20825,Lain-lain,"terima kasih, anda sudah tidak terdaftar di la...",93344,0,0.667878
20826,Lain-lain,paket bola1 3sms/mggu rp.2rb/sms telah berhasi...,868,0,0.438259
20827,Lain-lain,paket dangdut 3sms/minggu rp2200 telah berhasi...,868,0,0.655063


In [3]:
# Export the labelled messages: only the listed columns, in this order, and
# without the DataFrame index.
csv_file.to_csv(
    '../dataset/sms-labeled-all.csv',
    columns=['type', 'type_pred', 'proba', 'message', 'sender'],
    index=False,
)