In [1]:
import pickle
import pandas as pd
import re
import string
import unicodedata
import nltk
from emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Load the fitted artifacts produced by the earlier notebooks.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project's own notebooks.

# tf-idf object from 03_feature_engineering
tfidf_path = 'C:/Users/ASUS/TA01/03_feature_engineering/03_pickle/03_tfidf.pickle'
# Use a separate name for the handle so `tfidf_path` stays a path string
# instead of being rebound to a (closed) file object after the `with` block.
with open(tfidf_path, 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

# trained model (nearest centroid classifier) from 04_model_training
model_path = 'C:/Users/ASUS/TA01/04_model_training/04_pickle/04_model_ncc.pickle'
with open(model_path, 'rb') as model_file:
    ncc = pickle.load(model_file)

In [3]:
# Unit names listed in label-code order: a name's position in this tuple is
# the integer code stored in `category_codes`.
# NOTE(review): these codes presumably have to match the label encoding used
# when the model was trained -- confirm before reordering or inserting names.
_CATEGORY_NAMES = (
    'AKUNTANSI',
    'BAA',
    'BAGIAN ASRAMA',
    'CAE',
    'CELOE',
    'LAA',
    'LABORAN',
    'LOGISTIK DAN ASET',
    'OPEN LIBRARY',
    'PENGELOLAAN KEGIATAN DAN KESEJAHTERAAN MAHASISWA',
    'PPDU',
    'PUSAT BAHASA',
    'RISET DAN LAYANAN TEKNOLOGI INFORMASI',
)

# Mapping of unit name -> integer code (0..12).
category_codes = {name: code for code, name in enumerate(_CATEGORY_NAMES)}

In [7]:
# CSV with the abbreviation ('singkat') -> normal form ('hasil') table.
_KEY_NORM_PATH = 'C:/Users/ASUS/TA01/00_data/key_norm.csv'

# Domain-specific stopwords added on top of NLTK's Indonesian list.
_EXTRA_STOPWORDS = [
    'assalamualaikum', 'wr', 'wb', 'pak', 'bu', 'selamat',
    'siang', 'pagi', 'sore', 'malam', 'saya', 'terimakasih',
    'terima', 'kasih', 'kepada', 'bpk', 'ibu', 'mohon', 'tolong',
    'maaf', 'dear', 'wassalamualaikum', 'regards', 'nbsp', 'amp', 'lg',
    'lgi', 'kak', 'bapakibu', 'bapak', 'admin', 'pakbu', 'bupak', 'wrwb',
    'ya', 'min', 'nim', 'jurus',
]

# Words that must survive stopword filtering even though the corpus lists them.
_KEPT_WORDS = {'tak', 'akhir'}

# Filled on first use by _preprocessing_resources(); re-running this cell
# resets it, which forces the CSV and corpora to be read again.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Build once, then return, the heavy objects text_preprocessing needs."""
    if _PREPROCESSING_CACHE:
        return _PREPROCESSING_CACHE

    # Abbreviation lookup. setdefault keeps the FIRST row for a repeated
    # 'singkat' value, matching a "first matching row wins" table scan.
    key_norm = pd.read_csv(_KEY_NORM_PATH)
    normalization = {}
    for short_form, normal_form in zip(key_norm['singkat'], key_norm['hasil']):
        normalization.setdefault(short_form, normal_form)

    # EMOTICONS keys are joined unescaped, i.e. they are used as regex source.
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # box drawing, dingbats, misc symbols & arrows (not CJK)
                               "]+", flags=re.UNICODE)

    # A set gives O(1) membership tests; set difference does not raise when
    # a kept word is absent from the corpus (list.remove would).
    stop_words = (set(stopwords.words('indonesian')) | set(_EXTRA_STOPWORDS)) - _KEPT_WORDS

    stemmer = StemmerFactory().create_stemmer()

    # Publish everything at the end so a failure above leaves the cache empty
    # and the next call retries from scratch.
    _PREPROCESSING_CACHE.update(
        normalization=normalization,
        emoticon_pattern=emoticon_pattern,
        emoji_pattern=emoji_pattern,
        stop_words=stop_words,
        stemmer=stemmer,
    )
    return _PREPROCESSING_CACHE


def text_preprocessing(text):
    """Clean one raw message into a space-separated string of stemmed tokens.

    Steps, in order: strip + lowercase; remove URLs, e-mail-like tokens,
    [bracketed] spans, numbers, emoji, emoticons and punctuation; expand
    abbreviations via the key_norm table; stem; tokenize; drop stopwords.

    NOTE(review): the cleaned text is fed to the pickled TF-IDF vectorizer, so
    the step order and patterns presumably have to mirror the training-time
    preprocessing -- confirm before changing any of them.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).

    Raises
    ------
    AttributeError
        If `text` has no ``strip`` method (e.g. it is not a string).
    """
    resources = _preprocessing_resources()

    text = text.strip().lower()
    # Raw strings throughout: '\S' and '\[' are invalid escapes in a normal
    # string literal and trigger a warning on recent Python versions.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove url
    text = re.sub(r'\S*@\S*\s?', '', text)  # remove email
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text between square brackets []
    text = re.sub(r'[-+]?[0-9]+', '', text)  # remove number
    text = resources['emoji_pattern'].sub(r'', text)  # remove emoji
    text = resources['emoticon_pattern'].sub(r'', text)  # remove emoticon
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation

    # Replace known abbreviations; unknown words pass through unchanged.
    normalization = resources['normalization']
    text = ' '.join([normalization.get(word, word) for word in text.split()])

    text = resources['stemmer'].stem(text)  # stemming

    stop_words = resources['stop_words']
    kept_tokens = [token for token in nltk.word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)

def create_features(text):
    """Turn one raw text into a dense TF-IDF feature array.

    The text is placed in a one-row frame, cleaned with
    `text_preprocessing`, and vectorized with the loaded `tfidf` object.

    Returns the result of ``tfidf.transform(...).toarray()``.
    """
    frame = pd.DataFrame(columns=['text'])
    frame.loc[0] = text
    frame['text'] = frame['text'].apply(text_preprocessing)

    sparse_features = tfidf.transform(frame['text'])
    return sparse_features.toarray()

def get_category_name(category_id):
    """Reverse lookup in `category_codes`: return the name whose code equals
    `category_id`, or None when no entry matches."""
    matches = (name for name, code in category_codes.items() if code == category_id)
    return next(matches, None)

def predict_from_text(text):
    """Classify `text` with the loaded `ncc` model and print the target unit.

    Nothing is returned; the result is only printed.
    """
    # Vectorize, then take the single prediction for our one input row.
    features = create_features(text)
    predicted_code = ncc.predict(features)[0]

    # Map the numeric label back to its unit name.
    unit_name = get_category_name(predicted_code)

    print("Unit bagian yang dituju adalah %s." % unit_name)

In [12]:
# Sample query to classify.
text = 'apa saja syarat kelulusan sidang FTE'

In [13]:
# Run the full pipeline on the sample query; the predicted unit is printed.
predict_from_text(text)

Unit bagian yang dituju adalah LAA.
