In [1]:
# Install the Sastrawi package, which the import cell of this notebook needs.
# pip is invoked through sys.executable (the interpreter running this kernel)
# so the package is installed for the same Python that will import it, rather
# than for whichever `pip` happens to be first on the shell PATH.
# `--user` asks pip for a per-user install instead of a system-wide one.
import sys
!{sys.executable} -m pip install sastrawi --user



In [2]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score
from Sastrawi.Stemmer.Stemmer import Stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary

In [3]:
# Load the headline dataset. The file is pipe-delimited ('|'), not
# comma-delimited. Later cells read the `titles` and `labels` columns.
df = pd.read_csv(
    '../../data/kompas-titles.csv',
    sep='|',
)

In [4]:
class CustomTokenizer(object):
    """Callable tokenizer: word tokens plus selected punctuation, optionally stemmed.

    An instance is called with a document string and returns a list of tokens.
    A token is either a run of two or more word characters or one of the
    explicitly allowed punctuation strings. When stemming is enabled, every
    token that is not an allowed punctuation string is passed through a
    Sastrawi ``Stemmer``.
    """

    # A word is at least two consecutive word characters bounded by word breaks.
    _WORD_PATTERN = r"(?u)\b\w\w+\b"

    def __init__(self, allowed_punctuations=None, should_stem=True, extra_stemmer_words=None):
        """Configure the tokenizer.

        Args:
            allowed_punctuations: Iterable of punctuation strings to keep as
                their own tokens. ``None`` (the default) keeps none.
            should_stem: When true, build a Sastrawi stemmer and stem every
                non-punctuation token.
            extra_stemmer_words: Extra words added to the stemmer dictionary.
                Only used when ``should_stem`` is true.
        """
        self.should_stem = should_stem
        if should_stem:
            factory = StemmerFactory()
            dictionary = ArrayDictionary(factory.get_words())
            dictionary.add_words(list(extra_stemmer_words or []))
            self.stemmer = Stemmer(dictionary)
        # Copy so the tokenizer never shares (or depends on later mutation of)
        # the caller's list or a shared default.
        self.allowed_punctuations = list(allowed_punctuations or [])

    def _token_pattern(self):
        """Return the regex matching one token (a word or an allowed punctuation)."""
        alternatives = [self._WORD_PATTERN]
        # re.escape makes each punctuation string match literally, whatever its
        # length. Empty strings are skipped because an empty alternative would
        # match at every position of the document.
        alternatives.extend(re.escape(p) for p in self.allowed_punctuations if p)
        return "|".join(alternatives)

    def __call__(self, doc):
        """Tokenize ``doc`` and return the list of (optionally stemmed) tokens."""
        tokens = re.findall(self._token_pattern(), doc)
        if not self.should_stem:
            return tokens
        # Punctuation tokens are kept verbatim; only words go through the stemmer.
        return [
            t if t in self.allowed_punctuations else self.stemmer.stem(t)
            for t in tokens
        ]

In [5]:
# Punctuation marks passed to the tokenizer as allowed standalone tokens.
allowed_punctuations = [
    '!',
    '?',
    '"',
    "'",
    ".",
]

# Extra words passed to the tokenizer for its stemmer dictionary.
extra_stemmer_words = [
    'asian',
    'games',
]

# Words handed to CountVectorizer as stop words.
stop_words = [
    # common Indonesian function words
    'yang', 'dan', 'atau', 'di', 'ke', 'dari', 'itu',
    # Indonesian number words
    'satu', 'dua', 'tiga', 'empat', 'lima',
    'enam', 'tujuh', 'delapan', 'sembilan', 'sepuluh',
    'sebelas', 'belas',
]

In [6]:
def create_pipeline(classifier):
    """Build a bag-of-words pipeline that feeds token counts into ``classifier``.

    Args:
        classifier: Estimator used as the final pipeline step.

    Returns:
        A ``Pipeline`` with two steps: ``'count_vectorizer'`` (a
        ``CountVectorizer`` using ``CustomTokenizer`` and the notebook's
        ``stop_words``) followed by ``'classifier'``.
    """
    # Stemming is switched off here, so CustomTokenizer never builds a stemmer
    # and `extra_stemmer_words` currently has no effect on the tokens.
    tokenizer = CustomTokenizer(
        allowed_punctuations=allowed_punctuations,
        should_stem=False,
        extra_stemmer_words=extra_stemmer_words,
    )
    count_vectorizer = CountVectorizer(
        tokenizer=tokenizer,
        analyzer='word',
        stop_words=stop_words,
    )
    # Pipeline documents `steps` as a list of (name, estimator) tuples.
    return Pipeline([
        ('count_vectorizer', count_vectorizer),
        ('classifier', classifier),
    ])

In [7]:
def evaluate(name, clf, x, y, cv=5):
    """Cross-validate ``clf`` inside the text pipeline and print its metrics.

    Args:
        name: Label printed in the progress line.
        clf: Classifier handed to ``create_pipeline``.
        x: Input documents.
        y: Target labels.
        cv: Cross-validation argument forwarded to ``cross_val_predict``.

    Prints the accuracy and the confusion matrix of the cross-validated
    predictions against ``y``. Returns nothing.
    """
    print('Evaluating %s...' % name)
    # cross_val_predict returns one prediction per sample; both metrics below
    # are computed from those predictions.
    cv_predictions = cross_val_predict(create_pipeline(clf), x, y, cv=cv)
    accuracy = accuracy_score(y, cv_predictions)
    matrix = confusion_matrix(y, cv_predictions)
    for report_line in ('Accuracy: %f' % accuracy, 'Confusion matrix:', matrix):
        print(report_line)

In [8]:
# Pin the forest's random seed so that re-running the notebook reproduces the
# same accuracy and confusion matrix. Without it every run trains a different
# forest and the reported numbers drift.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
evaluate('Random Forest', clf, df.titles, df.labels)

Evaluating Random Forest...
Accuracy: 0.716222
Confusion matrix:
[[822 483]
 [229 975]]
