In [1]:
import string
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.pipeline import Pipeline
from seqeval.metrics import classification_report as seq_met
from sklearn.metrics import classification_report as skl_met
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

## Read Test and Train Data

In [2]:
df_train = pd.read_excel("Datasets/200_Hadits.xlsx", sheet_name="train")
df_test = pd.read_excel("Datasets/200_Hadits.xlsx", sheet_name="test")

# df_train = pd.read_csv('100_dataset - 100.csv', encoding = "ISO-8859-1")
# df_test = pd.read_csv('100_test_2.csv', encoding = "ISO-8859-1")

In [3]:
# Count the tokens for each entity
df_test.groupby('entity').size().reset_index(name='counts')

Unnamed: 0,entity,counts
0,B-LOC,6
1,B-PER,674
2,I-LOC,2
3,I-PER,1391
4,O,7793


In [4]:
# Count the tokens for each entity
df_train.groupby('entity').size().reset_index(name='counts')

Unnamed: 0,entity,counts
0,B-LOC,24
1,B-PER,1564
2,I-LOC,22
3,I-PER,3067
4,O,16466


## Grouping Tokens by Hadith

In [5]:
def sencentes_getter(data):
    agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['token'].values.tolist(), 
                                                           s['pos'].values.tolist(), 
                                                           s['entity'].values.tolist())]
    grouped = data.groupby('title').apply(agg_func)
    sentences = [s for s in grouped]
    
    return sentences

In [6]:
train_sents = sencentes_getter(df_train)
test_sents = sencentes_getter(df_test)

## Feature Extraction

In [7]:
def word2features(sent, i):
        word = sent[i][1]
        postag = sent[i][2]

        features = {
            'bias': 1.0, 
            'word': word,
            'word.istitle()': word.istitle(),
            'postag': postag,
            
            'word_pls_1': '',
            'word_pls_1.istitle()': False,
            'postag_pls_1': '',
            
            'word_min_1': '',
            'word_min_1.istitle()': False,
            'postag_min_1': '',
            
            'BOS': True if i == 0 else False,
            'EOS': True if i == len(sent)-1 else False,
        }
        
        
        if i < len(sent)-1:
            word_pls_1 = sent[i+1][1]
            postag_pls_1 = sent[i+1][2]
            features.update({
                'word_pls_1': word_pls_1,
                'word_pls_1.istitle()': word_pls_1.istitle(),
                'postag_pls_1': postag_pls_1
            })
        if i > 0:
            word_min_1 = sent[i-1][1]
            postag_min_1 = sent[i-1][2]
            features.update({
                'word_min_1': word_min_1,
                'word_min_1.istitle()': word_min_1.istitle(),
                'postag_min_1': postag_min_1
            })
        
        return features

def sent2features(sent):  
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
#     print(sent)
    return [label for title, token, postag, label in sent]

def sent2labelspred(sent):
    return [pred for title, token, postag, label, pred in sent]

In [8]:
# pd.DataFrame(X_train)

In [9]:
# Training
X_train = [x for s in train_sents for x in sent2features(s)]
y_train = [y for s in train_sents for y in sent2labels(s)]

# Test
X_test = [x for s in test_sents for x in sent2features(s)]
y_test = [y for s in test_sents for y in sent2labels(s)]

In [10]:
classes = ['B-LOC', 'B-PER', 'I-LOC', 'I-PER', 'O']
new_classes = classes.copy()
new_classes.pop()

'O'

## Training Model

In [11]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))
])

clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [12]:
print("Accuracy of Training:")
print(clf.score(X_train, y_train))
print("Accuracy of Validation:")
print(clf.score(X_test, y_test))

print(skl_met(y_test, clf.predict(X_test), labels=new_classes))

Accuracy of Training:
0.9756893534503145
Accuracy of Validation:
0.9754713156294345
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         6
       B-PER       0.91      0.92      0.91       674
       I-LOC       0.00      0.00      0.00         2
       I-PER       0.92      0.98      0.95      1391

   micro avg       0.92      0.95      0.94      2073
   macro avg       0.46      0.47      0.47      2073
weighted avg       0.92      0.95      0.93      2073



  'precision', 'predicted', average, warn_for)


In [13]:
df_result = df_test.copy()
df_result['predicted'] = clf.predict(X_test)
df_result_grouped = df_result.groupby('title')

y_pred = df_result_grouped['predicted']
y_true = df_result_grouped['entity']

y_grouped_pred = [p[1].tolist() for p in y_pred]
y_grouped_true = [p[1].tolist() for p in y_true]



print(seq_met(y_grouped_true, y_grouped_pred))
# ne_per = 2251
# ne_loc = 41
print("Training Data")
print("Jumlah Hadits: ", 140)
print("Jumlah Entitas PER: ", 1572)
print("Jumlah Entitas LOC: ", 33)
print("\n")
print("Test Data")
print("Jumlah Hadits: ", 60)
print("Jumlah Entitas PER: ", 679)
print("Jumlah Entitas LOC: ", 8)


             precision    recall  f1-score   support

        PER       0.81      0.87      0.84       679
        LOC       0.00      0.00      0.00         8

avg / total       0.80      0.86      0.83       687

Training Data
Jumlah Hadits:  140
Jumlah Entitas PER:  1572
Jumlah Entitas LOC:  33


Test Data
Jumlah Hadits:  60
Jumlah Entitas PER:  679
Jumlah Entitas LOC:  8


In [14]:
def get_predicted_sent(data):
    func = lambda s: [(s.name, t, p) for t, p in zip(s['token'].values.tolist(),
                                                     s['predicted'].values.tolist())]
    groupedx = data.groupby('title').apply(func)
    sents = [g for g in groupedx]
    
    return sents

In [15]:
pred_sents = get_predicted_sent(df_result)

def names_by_hadith(data):
    titl = {}
    sent_nes = []

    for sent in data:
        for i in range(len(sent)):
            if sent[i][-1] == 'B-PER':
                ne = []
                while sent[i][-1] != 'O':
                    ne.append(sent[i][1])
                    if i<len(sent)-1:
                        i+=1
                    else:
                        break
                str_name = "".join([i+" " if not i.startswith("'") and i not in string.punctuation else i for i in ne]).strip()
                if str_name not in sent_nes:
                    sent_nes.append(str_name)
        titl[sent[0][0]] = sent_nes.copy()
        sent_nes.clear()
    return titl

# pprint(names_by_hadith(pred_sents))

In [16]:
def hadith_by_name(data):
    nms = {}
    for sent in data:
        for i in range(len(sent)):
            if sent[i][-1] == 'B-PER':
                ne = []
                while sent[i][-1] != 'O':
                    ne.append(sent[i][1])
                    if i<len(sent)-1:
                        i+=1
                    else:
                        break
                str_name = "".join([i+" " 
                                    if not i.startswith("'") and i not in string.punctuation 
                                    else i for i in ne]).strip().title()
                if str_name not in nms:
                    nms[str_name] = [sent[0][0]]
                else:
                    if sent[0][0] not in nms[str_name]:
                        nms[str_name].append(sent[0][0])
    return nms

pprint(hadith_by_name(pred_sents))

{"'Abbad Ibnu Mansur": ['Hadits Darimi Nomor 11'],
 "'Amr Bin Abu Qais": ['Hadits Darimi Nomor 10'],
 "'Amru Bin Huraits": ['Hadits Ahmad Nomor 12'],
 "'Amru Bin Yahya Al Mazini": ['Hadits Malik Nomor 13'],
 "'Atha Bin Yazid": ['Hadits Nasai Nomor 21'],
 "'Atha `": ['Hadits Darimi Nomor 10'],
 "'Aun Bin Abdullah": ['Hadits Ibnu Majah Nomor 19'],
 "'Dulukus Syamsi": ['Hadits Malik Nomor 16'],
 "'Ubaidah Bin Humaid": ['Hadits Nasai Nomor 13'],
 "'Utbah Bin 'Abd As Sulami Bahwa": ['Hadits Darimi Nomor 13'],
 'Aban Bin Shalih': ['Hadits Abu Daud Nomor 12'],
 'Aban Bin Utsman': ['Hadits Ahmad Nomor 378'],
 'Abbad Ibnu Mansur': ['Hadits Darimi Nomor 11'],
 'Abdah Bin Sulaiman': ['Hadits Tirmidzi Nomor 11',
                        'Hadits Tirmidzi Nomor 22',
                        'Hadits Tirmidzi Nomor 23'],
 "Abdul 'Aziz Bin Shuhaib": ['Hadits Bukhari Nomor 14'],
 "Abdul A'La": ['Hadits Abu Daud Nomor 16'],
 'Abdul Karim': ['Hadits Tirmidzi Nomor 12'],
 'Abdul Karim Bin Abul Mukhariq': ['H

In [17]:
"asdasd asd".title()

'Asdasd Asd'

In [18]:
# print(df_test['title'].unique())

In [19]:
# df_train_cp = df_train.copy()
# df_train_cp['predicted'] = clf.predict(X_train)

# y_pred_tr = df_train_cp.groupby('title')['predicted']
# y_true_tr = df_train_cp.groupby('title')['entity']

# y_grouped_pred_tr = [p[1].tolist() for p in y_pred_tr]
# y_grouped_true_tr = [p[1].tolist() for p in y_true_tr]

# print(seq_met(y_grouped_true_tr, y_grouped_pred_tr))