In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import pandas as pd
import re
import string
import unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [32]:
# Load the training data (a semicolon-separated CSV) into a DataFrame.
# NOTE(review): data_path is an absolute path on one specific machine — adjust
# it before running this notebook anywhere else.
data_path = 'C:/Users/ASUS/TA01/00_data/data_perhitunganbab3.csv'
data = pd.read_csv(data_path, sep=';')

# Display the loaded frame.
data

Unnamed: 0,keluhan,bagian
0,Saya sudah membayar EPRT tapi status di igraci...,PUSAT BAHASA
1,tolong adakan EPRT online agar mempermudah dal...,PUSAT BAHASA
2,Saya sudah mencoba membayar bpp melalui atm ma...,AKUNTANSI
3,"Selamat siang, saya mau menanyakan bagaimana p...",AKUNTANSI
4,Koleksi buku seni yang sangat sedikit tolong d...,OPEN LIBRARY
5,Kurangnya stop kontak untuk pengunjung dan sto...,OPEN LIBRARY
6,saya mengalami kesulitan dalam membayar Eprt d...,PUSAT BAHASA


In [33]:
class TextPreprocessing:
    """Chainable text-cleaning pipeline for Indonesian complaint text.

    Every step mutates the instance (``self.text`` or ``self.words``) and
    returns ``self`` so that steps can be chained. ``do_all`` runs the whole
    pipeline on one string and returns the cleaned text.

    The slang table, the Sastrawi stemmer and the stopword set are built on
    first use and cached on the instance, so re-using one instance for many
    texts (e.g. via ``Series.apply``) does not reload them for every text.
    """

    # CSV holding the slang-normalisation table; it must provide the columns
    # "singkat" (slang form) and "hasil" (normalised form).
    NORMALIZATION_PATH = "C:/Users/ASUS/TA01/00_data/key_norm.csv"

    # Extra stopwords added on top of NLTK's Indonesian stopword list.
    EXTRA_STOPWORDS = (
        "assalamualaikum", "wr", "wb", "pak",
        "bu", "selamat", "siang", "pagi",
        "sore", "malam", "saya",
        "terimakasih", "terima",
        "kasih", "kepada", "bpk",
        "ibu", "mohon", "tolong",
        "maaf", "dear", "wassalamualaikum",
        "regards", "nbsp", "amp", "lg", "lgi", "kak",
        "bapakibu", "bapak", "admin", "pakbu", "bupak", "wrwb", "ya", "min",
    )

    # Words that are excluded from the stopword set, i.e. kept in the text.
    KEPT_WORDS = ("tak", "akhir")

    # Lazily-built caches; filled in by the first call that needs them.
    _norm_map = None
    _stemmer = None
    _stopword_set = None

    def __init__(self, text="test"):
        """Store the initial text to work on."""
        self.text = text

    def lowercase(self):
        """Convert the text to a lowercase string and strip surrounding whitespace."""
        self.text = str(self.text).lower().strip()
        return self

    def remove_url(self):
        """Remove URLs starting with http://, https:// or www."""
        self.text = re.sub(r"https?://\S+|www\.\S+", "", self.text)
        return self

    def remove_email(self):
        """Remove every whitespace-delimited token containing '@', plus one trailing whitespace character."""
        self.text = re.sub(r"\S*@\S*\s?", "", self.text)
        return self

    def remove_between_square_brackets(self):
        """Remove square-bracketed spans, brackets included."""
        self.text = re.sub(r"\[[^]]*\]", "", self.text)
        return self

    def remove_numbers(self):
        """Remove runs of digits together with an optional leading sign."""
        self.text = re.sub(r"[-+]?[0-9]+", "", self.text)
        return self

    def remove_punctuation(self):
        """Remove every character that is neither a word character nor whitespace.

        Characters are deleted, not replaced by a space, so "m-banking"
        becomes "mbanking".
        """
        self.text = re.sub(r"[^\w\s]", "", self.text)
        return self

    def normalize_word(self):
        """Replace slang words using the "singkat" -> "hasil" table.

        The table is read once and cached as a dict. When a slang form is
        listed more than once, the first row wins.
        """
        if self._norm_map is None:
            table = pd.read_csv(self.NORMALIZATION_PATH)
            table = table.drop_duplicates(subset="singkat", keep="first")
            self._norm_map = dict(zip(table["singkat"], table["hasil"]))

        lookup = self._norm_map
        self.text = " ".join(lookup.get(word, word) for word in self.text.split())
        return self

    def stemming(self):
        """Stem the text with the Sastrawi stemmer (created once, then reused)."""
        if self._stemmer is None:
            self._stemmer = StemmerFactory().create_stemmer()

        self.text = self._stemmer.stem(self.text)
        return self

    def tokenize(self):
        """Split the text into a list of tokens stored in ``self.words``."""
        self.words = nltk.word_tokenize(self.text)
        return self

    def stopwords_removal(self):
        """Drop stopwords from ``self.words``.

        The stopword set is NLTK's Indonesian list plus ``EXTRA_STOPWORDS``,
        minus ``KEPT_WORDS``. A set gives constant-time membership checks, and
        excluding a word that is not in the list is a no-op instead of an error.
        """
        if self._stopword_set is None:
            stopword_set = set(stopwords.words("indonesian"))
            stopword_set.update(self.EXTRA_STOPWORDS)
            stopword_set.difference_update(self.KEPT_WORDS)
            self._stopword_set = stopword_set

        blocked = self._stopword_set
        self.words = [word for word in self.words if word not in blocked]
        return self

    def join_words(self):
        """Join the tokens with single spaces; ``self.words`` becomes a string."""
        self.words = " ".join(self.words)
        return self

    def do_all(self, text):
        """Run the full pipeline on ``text`` and return the cleaned string.

        Order: lowercase, remove URLs, e-mails, bracketed spans, numbers and
        punctuation, normalise slang, stem, tokenize, remove stopwords, join.
        """
        self.text = text
        (
            self.lowercase()
            .remove_url()
            .remove_email()
            .remove_between_square_brackets()
            .remove_numbers()
            .remove_punctuation()
            .normalize_word()
            .stemming()
            .tokenize()
            .stopwords_removal()
            .join_words()
        )
        return self.words

In [34]:
tp = TextPreprocessing() # create the text-preprocessing pipeline object

# Run the full cleaning pipeline on every complaint and store the result in a new column.
data['clean_keluhan'] = data['keluhan'].apply(tp.do_all)


In [36]:
# Keep only the cleaned text and its target department. The explicit .copy()
# makes the result an independent frame, so adding a column to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
columns = ['clean_keluhan', 'bagian']
data = data[columns].copy()
data

Unnamed: 0,clean_keluhan,bagian
0,bayar eprt status igracias update bantu,PUSAT BAHASA
1,adakan eprt online mudah proses ketes eprt,PUSAT BAHASA
2,coba bayar bpp atm teller ganggu rek telkomnya...,AKUNTANSI
3,perihal alih uang bpp semester semester,AKUNTANSI
4,koleksi buku seni,OPEN LIBRARY
5,stop kontak unjung stop kontak fungsi,OPEN LIBRARY
6,alami sulit bayar eprt mbanking mandiri bantu,PUSAT BAHASA


In [8]:
# Encode each department name in `bagian` as an integer label. The fitted
# encoder is kept (instead of being thrown away) so the code -> name mapping
# remains available through label_encoder.classes_.
label_encoder = LabelEncoder()
data['bagian_label'] = label_encoder.fit_transform(data['bagian'])
data

Unnamed: 0,clean_keluhan,bagian,bagian_label
0,bayar eprt status igracias update bantu,PUSAT BAHASA,2
1,adakan eprt online mudah proses ketes eprt,PUSAT BAHASA,2
2,coba bayar bpp atm teller ganggu rek telkomnya...,AKUNTANSI,0
3,perihal alih uang bpp semester semester,AKUNTANSI,0
4,koleksi buku seni,OPEN LIBRARY,1
5,stop kontak unjung stop kontak fungsi,OPEN LIBRARY,1
6,alami sulit bayar eprt mbanking mandiri bantu,PUSAT BAHASA,2


In [9]:
# Select the input text and the target labels. No train/test split is done
# here: the whole dataset is used as training data.
x_train=data['clean_keluhan']
y_train=data['bagian_label']

In [10]:
# Text representation: TF-IDF
# TF-IDF parameter
ngram_range = (1,1) # (1, 1) means unigrams only; no bigrams are generated
min_df = 1
max_df = 1.0 # min_df and max_df bound the document frequency of the terms TfidfVectorizer keeps
max_features = 11 # keep at most the 11 terms with the highest term frequency across the corpus

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True) # build the TfidfVectorizer and store it in `tfidf`

In [11]:
features_train = tfidf.fit_transform(x_train) # fit the vectorizer on x_train and transform x_train into its TF-IDF matrix
labels_train = y_train
print(features_train)

  (0, 0)	0.637414434663118
  (0, 3)	0.5448407283248461
  (0, 1)	0.5448407283248461
  (1, 3)	1.0
  (2, 5)	0.34622197445231473
  (2, 6)	0.41709175289853345
  (2, 7)	0.7061977254549573
  (2, 2)	0.34622197445231473
  (2, 1)	0.2959390664291466
  (3, 8)	0.8978972949445114
  (3, 2)	0.4402050064814449
  (5, 4)	0.7071067811865475
  (5, 9)	0.7071067811865475
  (6, 10)	0.543530401770053
  (6, 5)	0.45117691147795724
  (6, 0)	0.45117691147795724
  (6, 3)	0.38565106731999843
  (6, 1)	0.38565106731999843


In [12]:
# Vocabulary kept by the fitted vectorizer. get_feature_names() is gone from
# current scikit-learn; get_feature_names_out() replaces it, and .tolist()
# keeps the displayed result a plain list of strings.
tfidf.get_feature_names_out().tolist()

['bantu',
 'bayar',
 'bpp',
 'eprt',
 'kontak',
 'mandiri',
 'pakai',
 'rek',
 'semester',
 'stop',
 'sulit']

In [13]:
import pandas as pd

# Term-by-document TF-IDF table: one row per vocabulary term, one column per
# training document (D0, D1, ...).
df = pd.DataFrame(features_train.toarray().T,
                  index=tfidf.get_feature_names_out(),
                  columns=[f'D{i}' for i in range(len(x_train))])

# Use fully-qualified option names: the short patterns "max_columns" and
# "max_rows" can match more than one pandas option, which raises OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
df

Unnamed: 0,D0,D1,D2,D3,D4,D5,D6
bantu,0.637414,0.0,0.0,0.0,0.0,0.0,0.451177
bayar,0.544841,0.0,0.295939,0.0,0.0,0.0,0.385651
bpp,0.0,0.0,0.346222,0.440205,0.0,0.0,0.0
eprt,0.544841,1.0,0.0,0.0,0.0,0.0,0.385651
kontak,0.0,0.0,0.0,0.0,0.0,0.707107,0.0
mandiri,0.0,0.0,0.346222,0.0,0.0,0.0,0.451177
pakai,0.0,0.0,0.417092,0.0,0.0,0.0,0.0
rek,0.0,0.0,0.706198,0.0,0.0,0.0,0.0
semester,0.0,0.0,0.0,0.897897,0.0,0.0,0.0
stop,0.0,0.0,0.0,0.0,0.0,0.707107,0.0


In [14]:
from sklearn.neighbors import NearestCentroid # nearest centroid classifier
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [15]:
# Fit a nearest-centroid classifier on the TF-IDF features.
ncc = NearestCentroid()
ncc.fit(features_train, labels_train) # training
ncc_predict = ncc.predict(features_train) # prediction on the same data the model was fitted on

In [16]:
# Training accuracy: the model is scored on the same samples it was fitted on,
# so this is not an estimate of performance on unseen data.
print("The training accuracy is: ")
print(accuracy_score(labels_train, ncc.predict(features_train)))

The training accuracy is: 
1.0


In [17]:
# Classification report (per-class precision / recall / F1) for the
# predictions made on the training data.
print("Classification report")
print(classification_report(labels_train, ncc_predict))

Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         3

    accuracy                           1.00         7
   macro avg       1.00      1.00      1.00         7
weighted avg       1.00      1.00      1.00         7



In [18]:
# Label Encoding
# Display name for each integer label, listed in code order (0, 1, 2).
# NOTE(review): these names are maintained by hand and differ from the raw
# `bagian` values (AKUNTANSI, OPEN LIBRARY, PUSAT BAHASA); the codes must stay
# in sync with the values in the `bagian_label` column.
category_codes = {
    name: code
    for code, name in enumerate(('Akuntansi', 'Oplib', 'Pusat Bahasa'))
}

In [19]:
tp = TextPreprocessing() # preprocessing object used by create_features below

def create_features(text):
    """Turn one raw complaint into its TF-IDF feature array.

    The text is cleaned with ``tp.do_all`` and then vectorised with the
    already-fitted ``tfidf``. The cleaned string is passed to the vectorizer
    directly, as a one-element list, instead of going through a temporary
    one-row DataFrame.

    Returns the dense array produced by ``tfidf.transform(...).toarray()``
    for that single document.
    """
    clean_text = tp.do_all(text)
    return tfidf.transform([clean_text]).toarray()

def get_category_name(category_id):
    """Return the first name in ``category_codes`` whose code equals ``category_id``.

    Returns None when no entry has that code.
    """
    matching_names = (
        name for name, code in category_codes.items() if code == category_id
    )
    return next(matching_names, None)

def predict_from_text(text):
    """Classify one raw complaint and print the predicted category name.

    The text is converted to features with ``create_features``, classified
    with ``ncc``, and the predicted label is mapped to a name with
    ``get_category_name``. Nothing is returned; the result is only printed.
    """
    features = create_features(text)
    predicted_label = ncc.predict(features)[0]  # single input -> first (only) prediction
    category_name = get_category_name(predicted_label)

    print("The predicted category using nearest centroid classifier model is %s." % category_name)

In [20]:
# Sample complaint used to try out the classifier.
text = 'saya mengalami kesulitan dalam membayar Eprt di m-banking mandiri, mohon bantuannya'

In [21]:
# Classify the sample complaint and print the predicted category.
predict_from_text(text)

The predicted category using nearest centroid classifier model is Pusat Bahasa.
