# Model Creation Notebook
This Notebook is created to train machine learning model to extract Qur'an verse from indonesian text document. There are a few processes that included in this notebook start from create helper function, load and preprocess dataset, until evaluate the models

In [26]:
import numpy as np
import pandas as pd
import pickle
import warnings
import nltk
import json

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.metrics import classification_report, accuracy_score
from owlready2 import *
from dict import load_dict
from datasets import load_labels
from word_similarity import WordSimilarityClassifier

## 1. Helper Function (Preprocess, Filter, Evaluation Report)

In [27]:
warnings.simplefilter('ignore')

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\'", ' ', text)
#     text = re.sub('\W', ' ', text) 
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def stopwords_elim(text):
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    
    text = stopword.remove(text)
    return text

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens

def print_evaluation_report(y_pred, clf_name):
    print("Classifier: ", clf_name)
    print(classification_report(y_test,y_pred))
    print("accuracy: ", accuracy_score(y_test,y_pred))

## 2. Load & Preprocess Dataset

In [28]:
with open('datasets.json', encoding="utf-8") as f:
    dataset = json.load(f)

In [29]:
target_dict = load_dict()
rawlist = []

for data in dataset:
    text = [data['content']]
    target = [0]*len(target_dict)
    
    if data['annotation'] is not None:
        labels = data['annotation']['labels']
        for label in labels:
            pos = target_dict.get(label)
            target[pos] = 1

        rawlist.append(text+target)

In [30]:
df = pd.DataFrame(rawlist,columns=['text','fenomena cuaca','bahasa','atribut fisik','zat fisik','artefak','ciptaan yang hidup','kitab suci','allah','singgasana allah','tuhan palsu','agama','benda astronomi','peristiwa','lokasi','petir','guntur','hujan','awan','arab','warna','hijau','logam','mineral','minyak','karang','tanah','mutiara','kaca','debu','sutra','tanah liat','besi','emas','perak','kuningan','permata','tempat ibadah','persenjataan','koin','tinta','pena','tabut','perahu','kapal','lampu','kunci','tangga','bahtera','masjid','gereja','biara','sinagog','masjidil haram','masjidil aqsa','kabah','pisau','panah','baju besi','objek organik','makhluk hidup','bagian tubuh','penyakit','makanan','organisme biologis','embrio','darah','tulang','telinga','mata','jari','dahi','gombak','jantung','tumit','usus','bibir','hidung','lidah','sayap','tulang rusuk','kusta','daging','madu','susu','garam','roti','wine','gandum','daging babi','bangkai','serangga','burung','tanaman','binatang','ikan','belalang','lebah','laba-laba','semut','nyamuk','lalat','gagak','puyuh','hud-hud','timun','kurma','ara','bawang putih','jahe','anggur','herba','daun','kacang','zaitun','bawang','lentil','pohon','delima','unta','sapi','babi','kambing','kuda','singa','keledai','kera','serigala','katak','domba','ular','anjing','malaikat','jin','manusia','daabbah','jibril','malaikat maut','harut','marut','malik','mikail','setan','iblis','raja','anak adam','orang bersejarah','orang-orang bersejarah','nabi','firaun','uzair','luqman','jalut','dzulkifli','samiri','talut','qarun','dzulkarnain','haman','aad','tsamud','madyan','quraisy','romawi','anshar','arab badui','tubba','bani israil','yajuj dan majuj','pemuda kahfi','tentara bergajah','pembuat parit','penduduk rass','penduduk aikah','penduduk al-hijr','abu lahab','ummu jamil','rasul','zakaria','yahya','harun','idris','ilyasa','ayyub','adam','daud','sulaiman','yusuf','yaqub','ishaq','habil','qabil','israil','azar','muhammad','isa','ibrahim','ismail','salih','hud','syuaib','yunus','musa','nuh','luth','ilyas','ahmad','zaid','maryam','messiah','quran','injil','zabur','taurat','islam','kristen','yahudi','shabiin','majusi','bulan','bumi','matahari','bintang','sirius','gugus bintang','uzza','manat','latta','suwa','baal','nasr','wadd','yaghuts','yauq','sesembahan','anak lembu emas','peristiwa bersejarah','peristiwa kalender','peristiwa fisik','peristiwa akhirat','masa jahiliyah','hari jumat','hari sabtu','haji','umrah','lailatul qadar','bulan ramadhan','fajar','hari kebangkitan','hari kiamat','lokasi di akhirat','lokasi geografis','surga','neraka','firdaus','adn','pohon bidara','salsabil','sijjin','saqar','zaqqum','ladha','ufuk','kiblat','gurun','kota','gunung','tempat bersejarah','badar','mekkah','madinah','babilonia','hunain','iram','yastrib','shafa','marwah','arafat','sinai','judiy','mesir','saba','al-ahqaf'])

df['text'] = df['text'].map(lambda com : clean_text(com))
df['text'] = df['text'].map(lambda com : stopwords_elim(com))

## 3. Vectorize Dataset

In [31]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=42, test_size=0.30, shuffle=True)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,4))
# vectorizer.fit(df['text'])
vectorizer.fit(train['text'])
vectorizer.fit(test['text'])

# x = vectorizer.transform(df['text'])
# y = df.drop(labels = ['text'], axis=1)
x_train = vectorizer.transform(train['text'])
y_train = train.drop(labels = ['text'], axis=1)
x_test = vectorizer.transform(test['text'])
y_test = test.drop(labels = ['text'], axis=1)

## 4. Training Models

In [32]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

bnb_clf = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
gnb_clf = GaussianNB()
svm_clf = LinearSVC(random_state=0)
rf_clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
tree_clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)

### 4a. Support-Vector Machine

In [39]:
svm = OneVsRestClassifier(svm_clf)
svm.fit(x_train,y_train)

pred = svm.predict(x_test)
print_evaluation_report(pred, svm_clf.__class__.__name__)

pickle.dump(svm, open("app/pkl/svm.pkl","wb"))

Classifier:  LinearSVC
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.50      0.25      0.33         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.80      0.67      0.73        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.38      0.25      0.30        12
          11       0.00      0.00      0.00         0
          12       0.68      0.82      0.74        71
          13       0.11      0.10      0.11        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
    

### 4b. Random Forest

In [34]:
rf = OneVsRestClassifier(rf_clf)
rf.fit(x_train,y_train)

pred = rf.predict(x_test)

print_evaluation_report(pred, rf_clf.__class__.__name__)

Classifier:  RandomForestClassifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       1.00      0.08      0.15        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.60      0.55      0.57        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00  

### 4c. Decision Tree

In [35]:
tree = OneVsRestClassifier(tree_clf)
tree.fit(x_train,y_train)

pred = tree.predict(x_test)

print_evaluation_report(pred, tree_clf.__class__.__name__)

Classifier:  DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         4
           1       0.50      0.67      0.57         3
           2       0.00      0.00      0.00         1
           3       0.14      0.25      0.18         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.67      0.67      0.67        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.40      0.33      0.36        12
          11       0.00      0.00      0.00         0
          12       0.61      0.65      0.63        71
          13       0.07      0.10      0.08        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00  

### 4d. Bernoulli Naive Bayes

In [36]:
bnb = OneVsRestClassifier(bnb_clf)
bnb.fit(x_train,y_train)

pred = bnb.predict(x_test)

print_evaluation_report(pred, bnb_clf.__class__.__name__)

Classifier:  BernoulliNB
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.17      0.08      0.11        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.50      0.17      0.25        12
          11       0.00      0.00      0.00         0
          12       0.62      0.79      0.69        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
  

### 4e. Gaussian Naive Bayes

In [37]:
gnb = OneVsRestClassifier(gnb_clf)
gnb.fit(x_train.toarray(),y_train)

pred = gnb.predict(x_test.toarray())

print_evaluation_report(pred, gnb_clf.__class__.__name__)

Classifier:  GaussianNB
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.67      0.17      0.27        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        12
          11       0.00      0.00      0.00         0
          12       0.70      0.66      0.68        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         2
   

### 4f. Word Similarity (Manual)

In [40]:
all_labels = load_labels()
wordsim_clf = WordSimilarityClassifier(all_labels)
clf_name = wordsim_clf.__class__.__name__

wordsim_pred = np.array(wordsim_clf.predict(test['text'].tolist()))
print_evaluation_report(wordsim_pred, clf_name)

pickle.dump(svm, open("app/pkl/wordsim.pkl","wb"))

Classifier:  WordSimilarity
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.33      1.00      0.50         3
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.67      1.00      0.80        12
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.43      0.75      0.55        12
          11       0.00      0.00      0.00         0
          12       0.50      0.03      0.05        71
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00         2