In [27]:
import json
import pickle
import re
import string
from functools import lru_cache
from itertools import chain

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm

In [2]:
# Load the trained classifier from disk.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open('../model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [3]:
# Load the fitted text vectorizer that was saved alongside the classifier.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../model/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [8]:
# Patterns are compiled once here rather than on every call to cleaning().
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


@lru_cache(maxsize=1)
def _get_stop_words():
    """Return the combined Indonesian + English NLTK stop-word set, built once."""
    return frozenset(chain(stopwords.words('indonesian'), stopwords.words('english')))


@lru_cache(maxsize=1)
def _get_stemmer():
    """Return a Sastrawi stemmer that is created once and reused across calls."""
    return StemmerFactory().create_stemmer()


def cleaning(text, remove_stop_words=True, lemmatize_words=True):
    """Normalize a raw text string for vectorization.

    Steps, in order: lowercase, drop ``<...>`` tags, replace ASCII punctuation
    with spaces, drop any remaining non-word characters, replace digits with
    spaces, collapse runs of whitespace and trim both ends. Optionally remove
    stop words and stem the remaining words.

    Args:
        text: The raw string to clean.
        remove_stop_words: If True, drop tokens found in NLTK's Indonesian or
            English stop-word lists.
        lemmatize_words: If True, run the text through the Sastrawi stemmer.

    Returns:
        The cleaned text as a single space-separated string with no leading
        or trailing whitespace (possibly empty).
    """
    text = text.lower()  # Case folding
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    # Catches symbols outside string.punctuation (e.g. typographic quotes).
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Collapse whitespace last and trim, so the result is normalized even
    # when both optional steps below are disabled.
    text = _WHITESPACE_RE.sub(' ', text).strip()

    if remove_stop_words:
        stop_words = _get_stop_words()
        text = " ".join(w for w in text.split() if w not in stop_words)

    # Skip the stemmer when nothing is left to stem.
    if lemmatize_words and text:
        text = _get_stemmer().stem(text)

    return text

In [9]:
# Sample Indonesian news headlines used to try out the classifier by hand.
# Each entry is the headline text only; the trailing ",inet" that one sample
# carried has been removed, since it matches one of the class labels and
# would leak that label into the input.
texts = [
    "Wishnutama Mau Sewa Influencer Asing, Pakai BTS?",
    "7 Manfaat Konsumsi Kurma Bagi Tubuh Saat Berbuka Puasa",
    "Mobil Terbakar karena Simpan Power Bank",
    "Video: Huawei Watch GT2e Resmi Hadir di Indonesia",
    "MA Batalkan Kenaikan Iuran, BPJS Kesehatan Siap Kembalikan Selisih Pembayaran",
    "Kylian Mbappe akan segera meninggalkan Real Madrid pada bursa transfer musim dingin",
    "Charles Leclerc masih kesulitan dalam mengangkat performa Ferrari pada musim ini",
    "Diumumkan Siang Ini, Ekonom Perkirakan Bank Indonesia Tahan Suku Bunga Acuan",
    "Mau Liburan 5 Hari  ke Jepang Lihat Sakura? Yuk Simak Itinerary-nya",
]

In [18]:
# Class names, matched by position against the model's probability columns.
# NOTE(review): presumably the same order as model.classes_ — confirm.
LABEL = [
    'finance',
    'food',
    'health',
    'inet',
    'oto',
    'sport',
    'travel',
]

In [35]:
# Take the last sample headline, clean it, and vectorize it as a one-item batch.
text_raw = texts[-1]
text = vectorizer.transform([cleaning(text_raw)])
text

<1x6379 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [36]:
# Predicted class for the vectorized headline in `text`.
sample_predict = model.predict(text)
output = sample_predict[0]

# Per-class probabilities; row 0 is the headline being classified.
probability = model.predict_proba(text)
# Take the maximum of row 0 directly. Calling argmax() on the whole 2-D array
# yields an index into the flattened array, which is only a valid column index
# when the batch has exactly one row.
output_probability = "{:.5f}".format(float(probability[0].max()))
probability[0]

array([1.27317151e-03, 3.48803812e-04, 8.97906605e-05, 1.76985147e-04,
       9.59337007e-04, 1.87942393e-04, 9.96963969e-01])

In [41]:
# Pair every label with its probability from row 0, rounded to three decimals.
prob = probability[0]
result = [
    {'label': name, 'score': float("{:.3f}".format(prob[idx]))}
    for idx, name in enumerate(LABEL)
]
result

[{'label': 'finance', 'score': 0.001},
 {'label': 'food', 'score': 0.0},
 {'label': 'health', 'score': 0.0},
 {'label': 'inet', 'score': 0.0},
 {'label': 'oto', 'score': 0.001},
 {'label': 'sport', 'score': 0.0},
 {'label': 'travel', 'score': 0.997}]

In [42]:
# Bundle the raw headline with its per-label scores and serialize to a JSON string.
output = dict(text=text_raw, result=result)
json.dumps(output)

'{"text": "Mau Liburan 5 Hari  ke Jepang Lihat Sakura? Yuk Simak Itinerary-nya", "result": [{"label": "finance", "score": 0.001}, {"label": "food", "score": 0.0}, {"label": "health", "score": 0.0}, {"label": "inet", "score": 0.0}, {"label": "oto", "score": 0.001}, {"label": "sport", "score": 0.0}, {"label": "travel", "score": 0.997}]}'