# Model Creation Notebook

This Notebook is created to train machine learning model to extract Qur'an verse from indonesian text document. There are a few processes that included in this notebook start from create helper function, load and preprocess dataset, until evaluate the models

In [65]:
import numpy as np
import pandas as pd
import pickle
import warnings
import nltk
import json

from sklearn.metrics import classification_report, accuracy_score, hamming_loss
from owlready2 import *

from app.lib.dict import load_dict
from app.lib.datasets import load_labels
from app.lib.word_similarity import WordSimilarityClassifier
from app.lib.preprocess import IndoTextCleaner, StopWordsEliminator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## 1. Helper Function (Preprocess, Filter, Evaluation Report)

In [66]:
warnings.simplefilter('ignore')

stemmer = StemmerFactory().create_stemmer()
text_cleaner = IndoTextCleaner()
sw_elim = StopWordsEliminator()

def print_evaluation_report(y_pred, clf_name):
    print("Classifier: ", clf_name)
    print(classification_report(y_test,y_pred))
    print("accuracy: ", accuracy_score(y_test,y_pred))
    print("hamming loss: ", hamming_loss(y_test, y_pred))

## 2. Load & Preprocess Dataset

In [67]:
df_processed = pd.read_csv("processed_datasets.csv")

X = df_processed['text']
Y = df_processed.drop(columns=['text'])

In [68]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, random_state=42, shuffle=False)

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3))

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

# vectorizer.fit(df_processed['text'])
tfidf.fit(vectorizer.fit_transform(df_processed['text']))

from sklearn.model_selection import train_test_split
train, test = train_test_split(df_processed, random_state=42, test_size=0.30, shuffle=True)

train_text = train['text']
test_text = test['text']

x_train = tfidf.transform(vectorizer.transform(train_text))
y_train = train.drop(labels = ['text'], axis=1)
x_test = tfidf.transform(vectorizer.transform(test_text))
y_test = test.drop(labels = ['text'], axis=1)

In [70]:
# X = vectorizer.transform(X)
x_train

<344x218339 sparse matrix of type '<class 'numpy.float64'>'
	with 214592 stored elements in Compressed Sparse Row format>

In [71]:
# X.shape
# Y.shape

## 3. Training Models

In [72]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

bnb_clf = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
gnb_clf = GaussianNB()
svm_clf = LinearSVC(random_state=0)
rf_clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
tree_clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)

### 3a. Support-Vector Machine

In [73]:
svm = OneVsRestClassifier(svm_clf)
svm.fit(x_train,y_train)

pred = svm.predict(x_test)
print_evaluation_report(pred, svm_clf.__class__.__name__)

pickle.dump(svm, open("app/pkl/svm.pkl","wb"))

Classifier:  LinearSVC
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.78      0.79      0.78        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
    

In [74]:
# from sklearn import metrics

# scores = []

# for train_index, test_index in cv.split(X):
# #     print("Train Index: ", train_index, "\n")
# #     print("Test Index: ", test_index)
#     print(train_index)
    
#     X_train, X_test, Y_train, Y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]
#     svm.fit(X_train, Y_train)
#     Y_pred = svm.predict(X_test)
    
#     scores.append(metrics.f1_score(Y_test, Y_pred, average='micro'))
    
# scores

### 3b. Random Forest

In [75]:
rf = OneVsRestClassifier(rf_clf)
rf.fit(x_train,y_train)

pred = rf.predict(x_test)

print_evaluation_report(pred, rf_clf.__class__.__name__)

Classifier:  RandomForestClassifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.68      0.42      0.52        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00  

### 3c. Decision Tree

In [76]:
tree = OneVsRestClassifier(tree_clf)
tree.fit(x_train,y_train)

pred = tree.predict(x_test)

print_evaluation_report(pred, tree_clf.__class__.__name__)

pickle.dump(tree, open("app/pkl/tree.pkl","wb"))

Classifier:  DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.50      0.33      0.40         3
           2       0.00      0.00      0.00         1
           3       1.00      0.25      0.40         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.77      0.83      0.80        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.25      0.08      0.12        12
          11       0.00      0.00      0.00         0
          12       0.59      0.68      0.63        71
          13       0.33      0.20      0.25        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00  

### 3d. Bernoulli Naive-Bayes

In [77]:
bnb = OneVsRestClassifier(bnb_clf)
bnb.fit(x_train,y_train)

pred = bnb.predict(x_test)

print_evaluation_report(pred, bnb_clf.__class__.__name__)

Classifier:  BernoulliNB
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.58      0.10      0.17        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
  

### 3e. Gaussian Naive-Bayes

In [78]:
gnb = OneVsRestClassifier(gnb_clf)
gnb.fit(x_train.toarray(),y_train)

pred = gnb.predict(x_test.toarray())

print_evaluation_report(pred, gnb_clf.__class__.__name__)

Classifier:  GaussianNB
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.75      0.25      0.38        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.33      0.08      0.13        12
          11       0.00      0.00      0.00         0
          12       0.73      0.66      0.70        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
   

### 3f. Word Similarity (Manual)

In [79]:
all_labels = load_labels()
wordsim_clf = WordSimilarityClassifier(all_labels)
clf_name = wordsim_clf.__class__.__name__

wordsim_pred = np.array(wordsim_clf.predict(test['text'].tolist()))
print_evaluation_report(wordsim_pred, clf_name)

pickle.dump(wordsim_clf, open("app/pkl/wordsim.pkl","wb"))

Classifier:  WordSimilarity
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.43      1.00      0.60         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.67      1.00      0.80        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.45      0.75      0.56        12
          11       0.00      0.00      0.00         0
          12       0.50      0.03      0.05        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00         2

# 4. Testing

In [33]:
pickle.dump(vectorizer, open("app/pkl/vectorizer.pkl","wb"))

txt = "Seorang pemulung menyerahkan lima benda yang diduga bom aktif ke Polres Kota Cirebon. Benda itu ditempatkan di sebuah tempat khusus di halaman Mapolres Kota Cirebon, Jawa Barat. Seperti ditayangkan Liputan6 SCTV, Minggu (16/6/2019), kelima bom ini ditemukan di sebuah tempat sampah di kawasan Sukalila, Cirebon, dalam keadaan aktif dan kotor dipenuhi sampah. Tim Jibom dari Satbrimob Polda Jawa Barat yang datang ke lokasi langsung melakukan identifikasi kelima bom aktif berbentuk bulat kaleng tersebut dan mengamankannya dengan kantong khusus agar tidak membahayakan. Bahan peledak ini masih diidentifikasi, bentuknya bulat, ada dua jenis, jadi satu di dalam kaleng dan satu lagi seperti tabung, ucap Kapolres Kota Cirebon AKBP Roland Ronaldy. Usai dilakukan identifikasi dan pengamanan, kelima bom aktif yang terdiri dari dua jenis tersebut dibawa Tim Jibom ke Mapolda Jawa Barat untuk dilakukan penyelidikan lebih lanjut. Polisi masih memeriksa penemu bom aktif tersebut agar bisa melacak siapa pemiliknya."
new = "manusia dan jin serta malaikat"
input_text = pd.Series([new])

input_text = input_text.apply(lambda x: text_cleaner.transform(x))
input_text = input_text.apply(lambda x: sw_elim.transform(x))
input_text = input_text.apply(lambda x: stemmer.stem(x))

print(input_text[0])

test = vectorizer.transform(input_text)

results = np.array(svm.predict(test))

results

manusia jin malaikat


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [34]:
answers = []
verse_results = []

for result in results:
    idx = 0
    for label in result:
        if label == 1:
            for name, key in target_dict.items():
                if key == idx:
                    answers.append(name)
        idx = idx + 1

for answer in answers:
    temp = quran_dict[answer]
    verse_results.append(temp)

answers

['manusia']

In [59]:
id_quran = pd.read_csv("quran/Indonesian.csv")
ar_quran = pd.read_csv("quran/Arabic.csv")
en_quran = pd.read_csv("quran/English.csv")

In [60]:
id_results = []
ar_results = []
en_results = []

for i in range(0,len(verse_results)):
    id_temp = []
    ar_temp = []
    en_temp = []
    for verse in verse_results[i]:
        surah, ayah, unused = verse.split('|')
        for id_text in id_quran['surah|ayah|text']:
            if id_text.find(verse) != -1:
                surah_temp, ayah_temp, txt_temp = id_text.split('|')
                id_temp.append(txt_temp)
                break
        for ar_text in ar_quran['surah|ayah|text']:
            if ar_text.find(verse) != -1:
                surah_temp, ayah_temp, txt_temp = ar_text.split('|')
                ar_temp.append(txt_temp)
                break
        for en_text in en_quran[['Surah','Ayah','Text']].values:
            if ((en_text[0] == int(surah)) and (en_text[1] == int(ayah))):
                en_temp.append(en_text[2])
                break
    id_results.append(id_temp)
    ar_results.append(ar_temp)
    en_results.append(en_temp)
        
id_results[0][0]

'Hai jama´ah jin dan manusia jika kamu sanggup menembus (melintasi) penjuru langit dan bumi maka lintasilah kamu tidak dapat menembusnya kecuali dengan kekuatan.'

In [61]:
verse_results[0]

['55|33|', '91|7|']