# Model Creation Notebook

This Notebook is created to train machine learning model to extract Qur'an verse from indonesian text document. There are a few processes that included in this notebook start from create helper function, load and preprocess dataset, until evaluate the models

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings
import nltk
import json

from sklearn.metrics import classification_report, accuracy_score
from owlready2 import *

from app.lib.dict import load_dict
from app.lib.datasets import load_labels
from app.lib.word_similarity import WordSimilarityClassifier
from app.lib.preprocess import IndoTextCleaner, StopWordsEliminator

## 1. Helper Function (Preprocess, Filter, Evaluation Report)

In [2]:
warnings.simplefilter('ignore')

text_cleaner = IndoTextCleaner()
sw_elim = StopWordsEliminator()

def print_evaluation_report(y_pred, clf_name):
    print("Classifier: ", clf_name)
    print(classification_report(y_test,y_pred))
    print("accuracy: ", accuracy_score(y_test,y_pred))

## 2. Load & Preprocess Dataset

In [3]:
with open('datasets.json', encoding="utf-8") as f:
    dataset = json.load(f)

In [4]:
target_dict, quran_dict = load_dict()
rawlist = []

for data in dataset:
    text = [data['content']]
    target = [0]*len(target_dict)
    
    if data['annotation'] is not None:
        labels = data['annotation']['labels']
        for label in labels:
            pos = target_dict.get(label)
            target[pos] = 1

        rawlist.append(text+target)

In [5]:
df = pd.DataFrame(rawlist,columns=['text','fenomena cuaca','bahasa','atribut fisik','zat fisik','artefak','ciptaan yang hidup','kitab suci','allah','singgasana allah','tuhan palsu','agama','benda astronomi','peristiwa','lokasi','petir','guntur','hujan','awan','arab','warna','hijau','logam','mineral','minyak','karang','tanah','mutiara','kaca','debu','sutra','tanah liat','besi','emas','perak','kuningan','permata','tempat ibadah','persenjataan','koin','tinta','pena','tabut','perahu','kapal','lampu','kunci','tangga','bahtera','masjid','gereja','biara','sinagog','masjidil haram','masjidil aqsa','kabah','pisau','panah','baju besi','objek organik','makhluk hidup','bagian tubuh','penyakit','makanan','organisme biologis','embrio','darah','tulang','telinga','mata','jari','dahi','gombak','jantung','tumit','usus','bibir','hidung','lidah','sayap','tulang rusuk','kusta','daging','madu','susu','garam','roti','wine','gandum','daging babi','bangkai','serangga','burung','tanaman','binatang','ikan','belalang','lebah','laba-laba','semut','nyamuk','lalat','gagak','puyuh','hud-hud','timun','kurma','ara','bawang putih','jahe','anggur','herba','daun','kacang','zaitun','bawang','lentil','pohon','delima','unta','sapi','babi','kambing','kuda','singa','keledai','kera','serigala','katak','domba','ular','anjing','malaikat','jin','manusia','daabbah','jibril','malaikat maut','harut','marut','malik','mikail','setan','iblis','raja','anak adam','orang bersejarah','orang-orang bersejarah','nabi','firaun','uzair','luqman','jalut','dzulkifli','samiri','talut','qarun','dzulkarnain','haman','aad','tsamud','madyan','quraisy','romawi','anshar','arab badui','tubba','bani israil','yajuj dan majuj','pemuda kahfi','tentara bergajah','pembuat parit','penduduk rass','penduduk aikah','penduduk al-hijr','abu lahab','ummu jamil','rasul','zakaria','yahya','harun','idris','ilyasa','ayyub','adam','daud','sulaiman','yusuf','yaqub','ishaq','habil','qabil','israil','azar','muhammad','isa','ibrahim','ismail','salih','hud','syuaib','yunus','musa','nuh','luth','ilyas','ahmad','zaid','maryam','messiah','quran','injil','zabur','taurat','islam','kristen','yahudi','shabiin','majusi','bulan','bumi','matahari','bintang','sirius','gugus bintang','uzza','manat','latta','suwa','baal','nasr','wadd','yaghuts','yauq','sesembahan','anak lembu emas','peristiwa bersejarah','peristiwa kalender','peristiwa fisik','peristiwa akhirat','masa jahiliyah','hari jumat','hari sabtu','haji','umrah','lailatul qadar','bulan ramadhan','fajar','hari kebangkitan','hari kiamat','lokasi di akhirat','lokasi geografis','surga','neraka','firdaus','adn','pohon bidara','salsabil','sijjin','saqar','zaqqum','ladha','ufuk','kiblat','gurun','kota','gunung','tempat bersejarah','badar','mekkah','madinah','babilonia','hunain','iram','yastrib','shafa','marwah','arafat','sinai','judiy','mesir','saba','al-ahqaf'])

df['text'] = df['text'].apply(lambda x: text_cleaner.transform(x))
df['text'] = df['text'].apply(lambda x: sw_elim.transform(x))

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3))

vectorizer.fit(df['text'])

from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state=42, test_size=0.30, shuffle=True)

train_text = train['text']
test_text = test['text']

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['text'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['text'], axis=1)

## 3. Training Models

In [6]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

bnb_clf = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
gnb_clf = GaussianNB()
svm_clf = LinearSVC(random_state=0)
rf_clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
tree_clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)

### 3a. Support-Vector Machine

In [7]:
svm = OneVsRestClassifier(svm_clf)
svm.fit(x_train,y_train)

pred = svm.predict(x_test)
print_evaluation_report(pred, svm_clf.__class__.__name__)

pickle.dump(svm, open("app/pkl/svm.pkl","wb"))

Classifier:  LinearSVC
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.83      0.42      0.56        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.60      0.25      0.35        12
          11       0.00      0.00      0.00         0
          12       0.71      0.82      0.76        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
    

### 3b. Random Forest

In [8]:
rf = OneVsRestClassifier(rf_clf)
rf.fit(x_train,y_train)

pred = rf.predict(x_test)

print_evaluation_report(pred, rf_clf.__class__.__name__)

Classifier:  RandomForestClassifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.69      0.44      0.53        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00  

### 3c. Decision Tree

In [9]:
tree = OneVsRestClassifier(tree_clf)
tree.fit(x_train,y_train)

pred = tree.predict(x_test)

print_evaluation_report(pred, tree_clf.__class__.__name__)

Classifier:  DecisionTreeClassifier
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.50      0.67      0.57         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.67      0.67      0.67        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.75      0.25      0.38        12
          11       0.00      0.00      0.00         0
          12       0.60      0.62      0.61        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00  

### 3d. Bernoulli Naive-Bayes

In [10]:
bnb = OneVsRestClassifier(bnb_clf)
bnb.fit(x_train,y_train)

pred = bnb.predict(x_test)

print_evaluation_report(pred, bnb_clf.__class__.__name__)

Classifier:  BernoulliNB
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.60      0.08      0.15        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
  

### 3e. Gaussian Naive-Bayes

In [11]:
# gnb = OneVsRestClassifier(gnb_clf)
# gnb.fit(x_train.toarray(),y_train)

# pred = gnb.predict(x_test.toarray())

# print_evaluation_report(pred, gnb_clf.__class__.__name__)

### 3f. Word Similarity (Manual)

In [12]:
all_labels = load_labels()
wordsim_clf = WordSimilarityClassifier(all_labels)
clf_name = wordsim_clf.__class__.__name__

wordsim_pred = np.array(wordsim_clf.predict(test['text'].tolist()))
print_evaluation_report(wordsim_pred, clf_name)

pickle.dump(svm, open("app/pkl/wordsim.pkl","wb"))

Classifier:  WordSimilarity
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.33      1.00      0.50         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.67      1.00      0.80        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.43      0.75      0.55        12
          11       0.00      0.00      0.00         0
          12       0.50      0.03      0.05        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00         2

# 4. Testing

In [45]:
pickle.dump(vectorizer, open("app/pkl/vectorizer.pkl","wb"))

txt = "Seorang pemulung menyerahkan lima benda yang diduga bom aktif ke Polres Kota Cirebon. Benda itu ditempatkan di sebuah tempat khusus di halaman Mapolres Kota Cirebon, Jawa Barat. Seperti ditayangkan Liputan6 SCTV, Minggu (16/6/2019), kelima bom ini ditemukan di sebuah tempat sampah di kawasan Sukalila, Cirebon, dalam keadaan aktif dan kotor dipenuhi sampah. Tim Jibom dari Satbrimob Polda Jawa Barat yang datang ke lokasi langsung melakukan identifikasi kelima bom aktif berbentuk bulat kaleng tersebut dan mengamankannya dengan kantong khusus agar tidak membahayakan. Bahan peledak ini masih diidentifikasi, bentuknya bulat, ada dua jenis, jadi satu di dalam kaleng dan satu lagi seperti tabung, ucap Kapolres Kota Cirebon AKBP Roland Ronaldy. Usai dilakukan identifikasi dan pengamanan, kelima bom aktif yang terdiri dari dua jenis tersebut dibawa Tim Jibom ke Mapolda Jawa Barat untuk dilakukan penyelidikan lebih lanjut. Polisi masih memeriksa penemu bom aktif tersebut agar bisa melacak siapa pemiliknya."
new = "manusia dan jin serta malaikat"
input_text = pd.Series([new])

input_text = input_text.apply(lambda x: text_cleaner.transform(x))
input_text = input_text.apply(lambda x: sw_elim.transform(x))

print(input_text[0])

test = vectorizer.transform(input_text)

results = np.array(svm.predict(test))

results

manusia jin malaikat


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [49]:
answers = []
verse_results = []

for result in results:
    idx = 0
    for label in result:
        if label == 1:
            for name, key in target_dict.items():
                if key == idx:
                    answers.append(name)
        idx = idx + 1

for answer in answers:
    temp = quran_dict[answer]
    verse_results.append(temp)

verse_results

[['55|33|', '91|7|']]

In [47]:
id_quran = pd.read_csv("quran/Indonesian.csv")
ar_quran = pd.read_csv("quran/Arabic.csv")
en_quran = pd.read_csv("quran/English.csv")

In [53]:
id_results = []
ar_results = []
en_results = []

for i in range(0,len(verse_results)):
    id_temp = []
    ar_temp = []
    en_temp = []
    for verse in verse_results[i]:
        surah, ayah, unused = verse.split('|')
        for id_text in id_quran['surah|ayah|text']:
            if id_text.find(verse) != -1:
                surah_temp, ayah_temp, txt_temp = id_text.split('|')
                id_temp.append(txt_temp)
                break
        for ar_text in ar_quran['surah|ayah|text']:
            if ar_text.find(verse) != -1:
                surah_temp, ayah_temp, txt_temp = ar_text.split('|')
                ar_temp.append(txt_temp)
                break
        for en_text in en_quran[['Surah','Ayah','Text']].values:
            if ((en_text[0] == int(surah)) and (en_text[1] == int(ayah))):
                en_temp.append(en_text[2])
                break
    id_results.append(id_temp)
    ar_results.append(ar_temp)
    en_results.append(en_temp)
        
id_results[0][0]

'Hai jama´ah jin dan manusia jika kamu sanggup menembus (melintasi) penjuru langit dan bumi maka lintasilah kamu tidak dapat menembusnya kecuali dengan kekuatan.'

In [52]:
verse_results[0]

['55|33|', '91|7|']