# Model Creation Notebook

This Notebook is created to train machine learning model to extract Qur'an verse from indonesian text document. There are a few processes that included in this notebook start from create helper function, load and preprocess dataset, until evaluate the models

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings
import nltk
import json

from sklearn.metrics import classification_report, accuracy_score, hamming_loss

from app.lib.dict import load_dict
from app.lib.datasets import load_labels, load_old_labels
from app.lib.word_similarity import WordSimilarityClassifier
from app.lib.preprocess import IndoTextCleaner, StopWordsEliminator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## 1. Helper Function (Preprocess, Filter, Evaluation Report)

In [2]:
warnings.simplefilter('ignore')

stemmer = StemmerFactory().create_stemmer()
text_cleaner = IndoTextCleaner()
sw_elim = StopWordsEliminator()

def print_evaluation_report(y_pred, clf_name):
    print("Classifier: ", clf_name)
    print(classification_report(y_test,y_pred, digits=5))
    print("accuracy: ", accuracy_score(y_test,y_pred))
    print("hamming loss: ", hamming_loss(y_test, y_pred))

## 2. Load & Preprocess Dataset

In [20]:
df_processed = pd.read_csv("new_processed_datasets.csv")
# df_processed = pd.read_csv("processed_datasets.csv")

X = df_processed['text']
Y = df_processed.drop(columns=['text'])

In [21]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, random_state=42, shuffle=False)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3))

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

# vectorizer.fit(df_processed['text'])
tfidf.fit(vectorizer.fit_transform(df_processed['text']))

from sklearn.model_selection import train_test_split
train, test = train_test_split(df_processed, random_state=42, test_size=0.30, shuffle=True)

train_text = train['text']
test_text = test['text']

# x_train = vectorizer.transform(train_text)
x_train = tfidf.transform(vectorizer.transform(train_text))
y_train = train.drop(labels = ['text'], axis=1)
# x_test = vectorizer.transform(test_text)
x_test = tfidf.transform(vectorizer.transform(test_text))
y_test = test.drop(labels = ['text'], axis=1)

In [23]:
# X = vectorizer.transform(X)
x_cv_train = train_text.reset_index()
y_cv_train = y_train.reset_index()

x_cv_train = x_cv_train['text']
y_cv_train = y_cv_train.drop(labels = ['index'], axis=1)

In [24]:
train

Unnamed: 0,text,text_unused,bahasa,kitab,allah,arsy,agama,petir,guntur,hujan,...,sedekah,negara,bangsa,hukum,adil,kisah,hikmah,ilmu,perang,zalim
752,republikacoid bogor wakil ketua tim kampanye n...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
327,republikacoid jakarta badan amil zakat nasiona...,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
793,republikacoid palembang menteri agama lukman h...,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
928,cuaca dki jakarta kira pagi sabtu 2072019 awan...,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
543,republikacoid jakarta pt usaha listrik negara ...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
527,republikacoid gaza takbir menang gema hari ray...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
578,republikacoid kuala lumpur moody s investor se...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
902,republikacoid alwi shahab nabi muhammad saw ka...,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
439,republikacoid jakarta kepala pusat registrasi ...,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
868,republikacoid jakarta diri darul quran ustaz y...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
df_shape = []
count_labels = 0

for column in test:
    temp = test.loc[test[column] == 1]
    df_shape.append(temp.shape)
    count_labels += temp.shape[0]
    

df_shape
df_processed.shape
(count_labels / test.shape[0]) / 283
# count_labels / test.shape[0]

0.019003273573868135

## 3. Training Models

In [33]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB

svm = OneVsRestClassifier(LinearSVC(dual=False, fit_intercept=False, loss='squared_hinge', multi_class='ovr', penalty='l1', random_state=0)) 
rf = OneVsRestClassifier(RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=None, n_estimators=100, max_features='auto', oob_score=False, warm_start=True, random_state=0)) 
tree = OneVsRestClassifier(DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, presort=False, splitter='best', random_state=0)) 
bnb = OneVsRestClassifier(BernoulliNB(alpha=0.0, binarize=0.0, fit_prior=True, class_prior=None)) 

### 3a. Support-Vector Machine

In [34]:
svm.fit(x_train,y_train)

pred = svm.predict(x_test)
print_evaluation_report(pred, svm.__class__.__name__)

Classifier:  OneVsRestClassifier
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.21429   0.35294        14
           2    0.00000   0.00000   0.00000        13
           3    0.85714   0.15000   0.25532        40
           4    0.00000   0.00000   0.00000         2
           5    1.00000   0.29787   0.45902        47
           6    0.00000   0.00000   0.00000         1
           7    0.00000   0.00000   0.00000         2
           8    0.00000   0.00000   0.00000         7
           9    0.00000   0.00000   0.00000         4
          10    0.00000   0.00000   0.00000         1
          11    0.57143   0.25000   0.34783        16
          12    0.00000   0.00000   0.00000         0
          13    0.00000   0.00000   0.00000         1
          14    0.00000   0.00000   0.00000         1
          15    0.00000   0.00000   0.00000         0
          16    0.00000   0.00000   0.00000     

In [10]:
#Grid Search CV

from sklearn.model_selection import GridSearchCV

parameters = {
    'estimator__multi_class':['ovr', 'crammer_singer'],
    'estimator__penalty':['l1', 'l2'],
    'estimator__dual':[True, False],
    'estimator__fit_intercept':[True, False],
    'estimator__loss':['squared_hinge', 'hinge'],
    'estimator__random_state':[0, 5, 10]
}

# tree_cls = DecisionTreeClassifier(min_samples_split=2, random_state=0)
# tree_clf_gridsearch = GridSearchCV(tree_cls, parameters, cv=5)
svm_tuning = GridSearchCV(svm, parameters, cv=5, scoring='f1_micro', error_score=0.0)

svm_tuning.fit(x_train, y_train)

print(svm_tuning.best_score_)
print(svm_tuning.best_params_)

0.5233469876943887
{'estimator__dual': False, 'estimator__fit_intercept': False, 'estimator__loss': 'squared_hinge', 'estimator__multi_class': 'ovr', 'estimator__penalty': 'l1', 'estimator__random_state': 0}


In [35]:
#Cross Validation

from sklearn import metrics

scores_accuracy = []
scores_precision = []
scores_recall = []
scores_f1 = []
scores_hloss = []
i = 1

for train_index, test_index in cv.split(x_cv_train):

    X_train = x_cv_train[train_index]
    X_test = x_cv_train[test_index]
    
#     X_train_bow = vectorizer.transform(X_train)
#     X_test_bow = vectorizer.transform(X_test)
    X_train_tfidf = tfidf.transform(vectorizer.transform(X_train))
    X_test_tfidf = tfidf.transform(vectorizer.transform(X_test))
    
    Y_train = y_cv_train.iloc[train_index] 
    Y_test = y_cv_train.iloc[test_index]
    svm.fit(X_train_tfidf, Y_train)
    Y_pred = svm.predict(X_test_tfidf)
    
    accuracy = metrics.accuracy_score(Y_test, Y_pred)
    f1_micro = metrics.f1_score(Y_test, Y_pred, average='micro')
    precision_micro = metrics.precision_score(Y_test, Y_pred, average='micro')
    recall_micro = metrics.recall_score(Y_test, Y_pred, average='micro')
    h_loss = metrics.hamming_loss(Y_test, Y_pred)
    
#     print('accuracy-', i, ' : ', accuracy)
#     print('precision-', i, ' : ', precision_micro)
#     print('recall-', i, ' : ', recall_micro)
#     print('f1_score-', i, ' : ', f1_micro)
#     print('h_loss-', i, ' : ', h_loss)
    
    scores_accuracy.append(accuracy)
    scores_precision.append(precision_micro)
    scores_recall.append(recall_micro)
    scores_f1.append(f1_micro)
    scores_hloss.append(h_loss)
    
    i += 1

# print('accuracy : ', scores_accuracy)
# print('precision : ', scores_precision)
# print('recall : ', scores_recall)
# print('f1_score : ', scores_f1)
# print('hloss : ', scores_hloss, '\n')

print('mean accuracy', sum(scores_accuracy)/len(scores_accuracy))
print('mean precision : ', sum(scores_precision)/len(scores_precision))
print('mean recall : ', sum(scores_recall)/len(scores_recall))
print('mean f1_score : ', sum(scores_f1)/len(scores_f1))
print('mean hloss : ', sum(scores_hloss)/len(scores_hloss))


mean accuracy 0.0158592132505176
mean precision :  0.9036171743344624
mean recall :  0.15663827187298907
mean f1_score :  0.2665419327300482
mean hloss :  0.014534094918548993


### 3b. Random Forest

In [36]:
rf.fit(x_train,y_train)

pred = rf.predict(x_test)

print_evaluation_report(pred, rf.__class__.__name__)

Classifier:  OneVsRestClassifier
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.07143   0.13333        14
           2    0.00000   0.00000   0.00000        13
           3    0.75000   0.07500   0.13636        40
           4    0.00000   0.00000   0.00000         2
           5    0.37500   0.06383   0.10909        47
           6    0.00000   0.00000   0.00000         1
           7    0.00000   0.00000   0.00000         2
           8    0.00000   0.00000   0.00000         7
           9    1.00000   0.25000   0.40000         4
          10    0.00000   0.00000   0.00000         1
          11    0.75000   0.18750   0.30000        16
          12    0.00000   0.00000   0.00000         0
          13    0.00000   0.00000   0.00000         1
          14    0.00000   0.00000   0.00000         1
          15    0.00000   0.00000   0.00000         0
          16    0.00000   0.00000   0.00000     

In [15]:
#Grid Search CV

from sklearn.model_selection import GridSearchCV

parameters = {
    'estimator__criterion':['gini', 'entropy'],
    'estimator__n_estimators':[10, 100],
    'estimator__bootstrap':[True, False],
    'estimator__oob_score':[True, False],
    'estimator__warm_start':[True, False],
    'estimator__max_depth':[None, 10, 100],
#     'estimator__min_samples_split':[2,3,4],
#     'estimator__min_samples_leaf':[1,2,3,4]
    'estimator__random_state':[None, 0, 5, 10]
}

# tree_cls = DecisionTreeClassifier(min_samples_split=2, random_state=0)
# tree_clf_gridsearch = GridSearchCV(tree_cls, parameters, cv=5)
rf_tuning = GridSearchCV(rf, parameters, cv=5, scoring='f1_micro', error_score=0.0)

rf_tuning.fit(x_train, y_train)

print(rf_tuning.best_score_)
print(rf_tuning.best_params_)

0.37957325834001415
{'estimator__bootstrap': False, 'estimator__criterion': 'entropy', 'estimator__max_depth': None, 'estimator__n_estimators': 100, 'estimator__oob_score': False, 'estimator__random_state': 10, 'estimator__warm_start': True}


In [37]:
#Cross Validation

from sklearn import metrics

scores_accuracy = []
scores_precision = []
scores_recall = []
scores_f1 = []
scores_hloss = []
i = 1

for train_index, test_index in cv.split(x_cv_train):

    X_train = x_cv_train[train_index]
    X_test = x_cv_train[test_index]
    
#     X_train_bow = vectorizer.transform(X_train)
#     X_test_bow = vectorizer.transform(X_test)
    X_train_tfidf = tfidf.transform(vectorizer.transform(X_train))
    X_test_tfidf = tfidf.transform(vectorizer.transform(X_test))
    
    Y_train = y_cv_train.iloc[train_index] 
    Y_test = y_cv_train.iloc[test_index]
    rf.fit(X_train_tfidf, Y_train)
    Y_pred = rf.predict(X_test_tfidf)
    
    accuracy = metrics.accuracy_score(Y_test, Y_pred)
    f1_micro = metrics.f1_score(Y_test, Y_pred, average='micro')
    precision_micro = metrics.precision_score(Y_test, Y_pred, average='micro')
    recall_micro = metrics.recall_score(Y_test, Y_pred, average='micro')
    h_loss = metrics.hamming_loss(Y_test, Y_pred)
    
#     print('accuracy-', i, ' : ', accuracy)
#     print('precision-', i, ' : ', precision_micro)
#     print('recall-', i, ' : ', recall_micro)
#     print('f1_score-', i, ' : ', f1_micro)
#     print('h_loss-', i, ' : ', h_loss)
    
    scores_accuracy.append(accuracy)
    scores_precision.append(precision_micro)
    scores_recall.append(recall_micro)
    scores_f1.append(f1_micro)
    scores_hloss.append(h_loss)
    
    i += 1

# print('accuracy : ', scores_accuracy)
# print('precision : ', scores_precision)
# print('recall : ', scores_recall)
# print('f1_score : ', scores_f1)
# print('hloss : ', scores_hloss, '\n')

print('mean accuracy', sum(scores_accuracy)/len(scores_accuracy))
print('mean precision : ', sum(scores_precision)/len(scores_precision))
print('mean recall : ', sum(scores_recall)/len(scores_recall))
print('mean f1_score : ', sum(scores_f1)/len(scores_f1))
print('mean hloss : ', sum(scores_hloss)/len(scores_hloss))


mean accuracy 0.025900621118012425
mean precision :  0.793645238687944
mean recall :  0.1521939030100729
mean f1_score :  0.2541299949468415
mean hloss :  0.015030057318607576


### 3c. Decision Tree

In [38]:
tree.fit(x_train,y_train)

pred = tree.predict(x_test)

print_evaluation_report(pred, tree.__class__.__name__)

# pickle.dump(tree, open("./app/pkl/tree.pkl","wb"))

Classifier:  OneVsRestClassifier
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    0.75000   0.42857   0.54545        14
           2    0.37500   0.23077   0.28571        13
           3    0.60526   0.57500   0.58974        40
           4    1.00000   1.00000   1.00000         2
           5    0.58491   0.65957   0.62000        47
           6    0.00000   0.00000   0.00000         1
           7    0.00000   0.00000   0.00000         2
           8    0.75000   0.85714   0.80000         7
           9    1.00000   1.00000   1.00000         4
          10    0.00000   0.00000   0.00000         1
          11    0.52381   0.68750   0.59459        16
          12    0.00000   0.00000   0.00000         0
          13    0.50000   1.00000   0.66667         1
          14    0.00000   0.00000   0.00000         1
          15    0.00000   0.00000   0.00000         0
          16    0.00000   0.00000   0.00000     

In [18]:
#Grid Search CV

from sklearn.model_selection import GridSearchCV

parameters = {
    'estimator__criterion':['gini', 'entropy'],
    'estimator__splitter':['best','random'],
    'estimator__presort':[True, False],
    'estimator__max_features':[None, 'auto', 'log2'],
    'estimator__max_depth':[None, 10, 100],
    'estimator__min_samples_split':[2,3,4],
    'estimator__min_samples_leaf':[1,2,3,4]
#     'estimator__random_state':[None, 0, 5, 10]
}

# tree_cls = DecisionTreeClassifier(min_samples_split=2, random_state=0)
# tree_clf_gridsearch = GridSearchCV(tree_cls, parameters, cv=5)
tree_tuning = GridSearchCV(tree, parameters, cv=5, scoring='f1_micro', error_score=0.0)

tree_tuning.fit(x_train, y_train)

print(tree_tuning.best_score_)
print(tree_tuning.best_params_)

0.5578800164628228
{'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__presort': False, 'estimator__splitter': 'best'}


In [39]:
#Cross Validation

from sklearn import metrics

scores_accuracy = []
scores_precision = []
scores_recall = []
scores_f1 = []
scores_hloss = []
i = 1

for train_index, test_index in cv.split(x_cv_train):

    X_train = x_cv_train[train_index]
    X_test = x_cv_train[test_index]
    
#     X_train_bow = vectorizer.transform(X_train)
#     X_test_bow = vectorizer.transform(X_test)
    X_train_tfidf = tfidf.transform(vectorizer.transform(X_train))
    X_test_tfidf = tfidf.transform(vectorizer.transform(X_test))
    
    Y_train = y_cv_train.iloc[train_index] 
    Y_test = y_cv_train.iloc[test_index]
    tree.fit(X_train_tfidf, Y_train)
    Y_pred = tree.predict(X_test_tfidf)
    
    accuracy = metrics.accuracy_score(Y_test, Y_pred)
    f1_micro = metrics.f1_score(Y_test, Y_pred, average='micro')
    precision_micro = metrics.precision_score(Y_test, Y_pred, average='micro')
    recall_micro = metrics.recall_score(Y_test, Y_pred, average='micro')
    h_loss = metrics.hamming_loss(Y_test, Y_pred)
    
#     print('accuracy-', i, ' : ', accuracy)
#     print('precision-', i, ' : ', precision_micro)
#     print('recall-', i, ' : ', recall_micro)
#     print('f1_score-', i, ' : ', f1_micro)
#     print('h_loss-', i, ' : ', h_loss)
    
    scores_accuracy.append(accuracy)
    scores_precision.append(precision_micro)
    scores_recall.append(recall_micro)
    scores_f1.append(f1_micro)
    scores_hloss.append(h_loss)
    
    i += 1

# print('accuracy : ', scores_accuracy)
# print('precision : ', scores_precision)
# print('recall : ', scores_recall)
# print('f1_score : ', scores_f1)
# print('hloss : ', scores_hloss, '\n')

print('mean accuracy', sum(scores_accuracy)/len(scores_accuracy))
print('mean precision : ', sum(scores_precision)/len(scores_precision))
print('mean recall : ', sum(scores_recall)/len(scores_recall))
print('mean f1_score : ', sum(scores_f1)/len(scores_f1))
print('mean hloss : ', sum(scores_hloss)/len(scores_hloss))


mean accuracy 0.028778467908902695
mean precision :  0.5943124326794863
mean recall :  0.4973953114993649
mean f1_score :  0.5414150846681272
mean hloss :  0.014248766751213277


### 3d. Bernoulli Naive-Bayes

In [40]:
bnb.fit(x_train,y_train)

pred = bnb.predict(x_test)

print_evaluation_report(pred, bnb.__class__.__name__)

Classifier:  OneVsRestClassifier
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    0.54545   0.42857   0.48000        14
           2    0.22222   0.15385   0.18182        13
           3    0.48276   0.70000   0.57143        40
           4    0.33333   1.00000   0.50000         2
           5    0.62069   0.38298   0.47368        47
           6    0.00000   0.00000   0.00000         1
           7    0.00000   0.00000   0.00000         2
           8    0.60000   0.42857   0.50000         7
           9    0.75000   0.75000   0.75000         4
          10    0.00000   0.00000   0.00000         1
          11    0.33333   0.31250   0.32258        16
          12    0.00000   0.00000   0.00000         0
          13    0.00000   0.00000   0.00000         1
          14    0.00000   0.00000   0.00000         1
          15    0.00000   0.00000   0.00000         0
          16    0.00000   0.00000   0.00000     

In [9]:
#Grid Search CV

from sklearn.model_selection import GridSearchCV

parameters = {
    'estimator__alpha':[0.0, 1.0, 2.0],
    'estimator__binarize':[None, 0.0, 1.0],
    'estimator__fit_prior':[True, False]
}

# tree_cls = DecisionTreeClassifier(min_samples_split=2, random_state=0)
# tree_clf_gridsearch = GridSearchCV(tree_cls, parameters, cv=5)
bnb_tuning = GridSearchCV(bnb, parameters, cv=5, scoring='f1_micro', error_score=0.0)

bnb_tuning.fit(x_train, y_train)

print(bnb_tuning.best_score_)
print(bnb_tuning.best_params_)

0.5350074216621447
{'estimator__alpha': 0.0, 'estimator__binarize': 0.0, 'estimator__fit_prior': True}


In [41]:
#Cross Validation

from sklearn import metrics

scores_accuracy = []
scores_precision = []
scores_recall = []
scores_f1 = []
scores_hloss = []
i = 1

for train_index, test_index in cv.split(x_cv_train):

    X_train = x_cv_train[train_index]
    X_test = x_cv_train[test_index]
    
#     X_train_bow = vectorizer.transform(X_train)
#     X_test_bow = vectorizer.transform(X_test)
    X_train_tfidf = tfidf.transform(vectorizer.transform(X_train))
    X_test_tfidf = tfidf.transform(vectorizer.transform(X_test))
    
    Y_train = y_cv_train.iloc[train_index] 
    Y_test = y_cv_train.iloc[test_index]
    bnb.fit(X_train_tfidf, Y_train)
    Y_pred = bnb.predict(X_test_tfidf)
    
    accuracy = metrics.accuracy_score(Y_test, Y_pred)
    f1_micro = metrics.f1_score(Y_test, Y_pred, average='micro')
    precision_micro = metrics.precision_score(Y_test, Y_pred, average='micro')
    recall_micro = metrics.recall_score(Y_test, Y_pred, average='micro')
    h_loss = metrics.hamming_loss(Y_test, Y_pred)
    
#     print('accuracy-', i, ' : ', accuracy)
#     print('precision-', i, ' : ', precision_micro)
#     print('recall-', i, ' : ', recall_micro)
#     print('f1_score-', i, ' : ', f1_micro)
#     print('h_loss-', i, ' : ', h_loss)
    
    scores_accuracy.append(accuracy)
    scores_precision.append(precision_micro)
    scores_recall.append(recall_micro)
    scores_f1.append(f1_micro)
    scores_hloss.append(h_loss)
    
    i += 1

# print('accuracy : ', scores_accuracy)
# print('precision : ', scores_precision)
# print('recall : ', scores_recall)
# print('f1_score : ', scores_f1)
# print('hloss : ', scores_hloss, '\n')

print('mean accuracy', sum(scores_accuracy)/len(scores_accuracy))
print('mean precision : ', sum(scores_precision)/len(scores_precision))
print('mean recall : ', sum(scores_recall)/len(scores_recall))
print('mean f1_score : ', sum(scores_f1)/len(scores_f1))
print('mean hloss : ', sum(scores_hloss)/len(scores_hloss))


mean accuracy 0.03171842650103519
mean precision :  0.45376227150621934
mean recall :  0.47322956145333184
mean f1_score :  0.4571985493411016
mean hloss :  0.019439129769061265


### 3f. Word Similarity (Manual)

In [188]:
all_labels = load_labels()
old_labels = load_old_labels()
wordsim_clf = WordSimilarityClassifier(all_labels)
clf_name = wordsim_clf.__class__.__name__

wordsim_pred = np.array(wordsim_clf.predict(test['text'].tolist()))
print_evaluation_report(wordsim_pred, clf_name)

# pickle.dump(wordsim_clf, open("app/pkl/wordsim.pkl","wb"))

Classifier:  WordSimilarity
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    0.34211   0.92857   0.50000        14
           2    0.33333   0.76923   0.46512        13
           3    0.33898   1.00000   0.50633        40
           4    1.00000   1.00000   1.00000         2
           5    0.53425   0.82979   0.65000        47
           6    1.00000   1.00000   1.00000         1
           7    0.66667   1.00000   0.80000         2
           8    0.63636   1.00000   0.77778         7
           9    0.05405   1.00000   0.10256         4
          10    0.25000   1.00000   0.40000         1
          11    0.38095   1.00000   0.55172        16
          12    0.00000   0.00000   0.00000         0
          13    0.25000   1.00000   0.40000         1
          14    0.14286   1.00000   0.25000         1
          15    0.00000   0.00000   0.00000         0
          16    0.12500   1.00000   0.22222         1

In [189]:
#Cross Validation

from sklearn import metrics

scores_accuracy = []
scores_precision = []
scores_recall = []
scores_f1 = []
scores_hloss = []
i = 1

for train_index, test_index in cv.split(X):

    X_train = X[train_index]
    X_test = X[test_index]
    
#     X_train_bow = vectorizer.transform(X_train)
#     X_test_bow = vectorizer.transform(X_test)
#     X_train_tfidf = tfidf.transform(vectorizer.transform(X_train))
#     X_test_tfidf = tfidf.transform(vectorizer.transform(X_test))
    
    Y_train = Y.iloc[train_index] 
    Y_test = Y.iloc[test_index]
#     svm.fit(X_train_tfidf, Y_train)
    Y_pred = wordsim_clf.predict(X_test.tolist())
    
    accuracy = metrics.accuracy_score(Y_test, np.array(Y_pred))
    f1_micro = metrics.f1_score(Y_test, np.array(Y_pred), average='micro')
    precision_micro = metrics.precision_score(Y_test, np.array(Y_pred), average='micro')
    recall_micro = metrics.recall_score(Y_test, np.array(Y_pred), average='micro')
    h_loss = metrics.hamming_loss(Y_test, np.array(Y_pred))
    
    print('accuracy-', i, ' : ', accuracy)
    print('precision-', i, ' : ', precision_micro)
    print('recall-', i, ' : ', recall_micro)
    print('f1_score-', i, ' : ', f1_micro)
    print('h_loss-', i, ' : ', h_loss)
    
    scores_accuracy.append(accuracy)
    scores_precision.append(precision_micro)
    scores_recall.append(recall_micro)
    scores_f1.append(f1_micro)
    scores_hloss.append(h_loss)
    
    i += 1

print('accuracy : ', scores_accuracy)
print('precision : ', scores_precision)
print('recall : ', scores_recall)
print('f1_score : ', scores_f1)
print('hloss : ', scores_hloss, '\n')

print('mean accuracy', sum(scores_accuracy)/len(scores_accuracy))
print('mean precision : ', sum(scores_precision)/len(scores_precision))
print('mean recall : ', sum(scores_recall)/len(scores_recall))
print('mean f1_score : ', sum(scores_f1)/len(scores_f1))
print('mean hloss : ', sum(scores_hloss)/len(scores_hloss))


accuracy- 1  :  0.0
precision- 1  :  0.44575471698113206
recall- 1  :  0.7354085603112841
f1_score- 1  :  0.5550660792951542
h_loss- 1  :  0.02922829581993569
accuracy- 2  :  0.0
precision- 2  :  0.44731610337972166
recall- 2  :  0.6787330316742082
f1_score- 2  :  0.5392450569203115
h_loss- 2  :  0.02472668810289389
accuracy- 3  :  0.0
precision- 3  :  0.26693877551020406
recall- 3  :  0.5934664246823956
f1_score- 3  :  0.3682432432432432
h_loss- 3  :  0.036077170418006434
accuracy- 4  :  0.0
precision- 4  :  0.2507530120481928
recall- 4  :  0.6283018867924528
f1_score- 4  :  0.3584499461786868
h_loss- 4  :  0.03832797427652733
accuracy- 5  :  0.0
precision- 5  :  0.24755989352262645
recall- 5  :  0.5849056603773585
f1_score- 5  :  0.34788029925187036
h_loss- 5  :  0.03397317223683783
accuracy- 6  :  0.0
precision- 6  :  0.1979301423027167
recall- 6  :  0.4101876675603217
f1_score- 6  :  0.2670157068062827
h_loss- 6  :  0.02728247101237455
accuracy- 7  :  0.0
precision- 7  :  0.2750621

# 4. Testing

In [None]:
# pickle.dump(vectorizer, open("app/pkl/vectorizer.pkl","wb"))

txt = "Seorang pemulung menyerahkan lima benda yang diduga bom aktif ke Polres Kota Cirebon. Benda itu ditempatkan di sebuah tempat khusus di halaman Mapolres Kota Cirebon, Jawa Barat. Seperti ditayangkan Liputan6 SCTV, Minggu (16/6/2019), kelima bom ini ditemukan di sebuah tempat sampah di kawasan Sukalila, Cirebon, dalam keadaan aktif dan kotor dipenuhi sampah. Tim Jibom dari Satbrimob Polda Jawa Barat yang datang ke lokasi langsung melakukan identifikasi kelima bom aktif berbentuk bulat kaleng tersebut dan mengamankannya dengan kantong khusus agar tidak membahayakan. Bahan peledak ini masih diidentifikasi, bentuknya bulat, ada dua jenis, jadi satu di dalam kaleng dan satu lagi seperti tabung, ucap Kapolres Kota Cirebon AKBP Roland Ronaldy. Usai dilakukan identifikasi dan pengamanan, kelima bom aktif yang terdiri dari dua jenis tersebut dibawa Tim Jibom ke Mapolda Jawa Barat untuk dilakukan penyelidikan lebih lanjut. Polisi masih memeriksa penemu bom aktif tersebut agar bisa melacak siapa pemiliknya."
new = "manusia dan jin serta malaikat"
input_text = pd.Series([txt])

input_text = input_text.apply(lambda x: text_cleaner.transform(x))
input_text = input_text.apply(lambda x: sw_elim.transform(x))
input_text = input_text.apply(lambda x: stemmer.stem(x))

print(input_text[0])

test = vectorizer.transform(input_text)

results = np.array(tree.predict(test))

results

In [None]:
answers = []
verse_results = []

for result in results:
    idx = 0
    for label in result:
        if label == 1:
            for name, key in target_dict.items():
                if key == idx:
                    answers.append(name)
        idx = idx + 1

for answer in answers:
    temp = quran_dict[answer]
    verse_results.append(temp)

answers

In [None]:
id_quran = pd.read_csv("quran/Indonesian.csv")
ar_quran = pd.read_csv("quran/Arabic.csv")
en_quran = pd.read_csv("quran/English.csv")

In [None]:
id_results = []
ar_results = []
en_results = []

for i in range(0,len(verse_results)):
    id_temp = []
    ar_temp = []
    en_temp = []
    for verse in verse_results[i]:
        surah, ayah, unused = verse.split('|')
        for id_text in id_quran['surah|ayah|text']:
            if id_text.find(verse) != -1:
                surah_temp, ayah_temp, txt_temp = id_text.split('|')
                id_temp.append(txt_temp)
                break
        for ar_text in ar_quran['surah|ayah|text']:
            if ar_text.find(verse) != -1:
                surah_temp, ayah_temp, txt_temp = ar_text.split('|')
                ar_temp.append(txt_temp)
                break
        for en_text in en_quran[['Surah','Ayah','Text']].values:
            if ((en_text[0] == int(surah)) and (en_text[1] == int(ayah))):
                en_temp.append(en_text[2])
                break
    id_results.append(id_temp)
    ar_results.append(ar_temp)
    en_results.append(en_temp)
        
id_results[0][0]

In [None]:
verse_results[0]