In [1]:
import string
import pandas as pd
import numpy as np
from pprint import pprint
from pprint import pprint
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from seqeval.metrics import classification_report as seq_met
from sklearn.metrics import classification_report as skl_met
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import confusion_matrix

## Read Test and Train Data

In [2]:
df_train = pd.read_excel("Datasets/dataset.xlsx", sheetname="train")
df_test = pd.read_excel("Datasets/dataset.xlsx", sheetname="test")

  return func(*args, **kwargs)


In [3]:
# Count the tokens for each entity
df_test.groupby('entity').size().reset_index(name='counts')

Unnamed: 0,entity,counts
0,B-LOC,6
1,B-PER,677
2,I-LOC,2
3,I-PER,1386
4,O,7795


In [4]:
# Count the tokens for each entity
df_train.groupby('entity').size().reset_index(name='counts')

Unnamed: 0,entity,counts
0,B-LOC,24
1,B-PER,1564
2,I-LOC,22
3,I-PER,3067
4,O,16466


## Grouping Tokens by Hadith

In [5]:
def sencentes_getter(data):
    agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['token'].values.tolist(), 
                                                           s['pos'].values.tolist(), 
                                                           s['entity'].values.tolist())]
    grouped = data.groupby('title').apply(agg_func)
    sentences = [s for s in grouped]
    
    return sentences

In [6]:
train_sents = sencentes_getter(df_train)
test_sents = sencentes_getter(df_test)

## Feature Extraction

In [7]:
def word2features(sent, i):
        word = sent[i][1]
        postag = sent[i][2]

        features = {
            'bias': 1.0, 
            'word': word,
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word[-1:]': word[-1:],
            'word[:3]': word[:3],
            'word[:2]': word[:2],
            'word[:1]': word[:1],            
            'word.islower()': word.islower(),
            'word.istitle()': word.istitle(),
            'postag': postag,
            
            'word_pls_1': '',
            'word_pls_1[-3:]': '',
            'word_pls_1[-2:]': '',
            'word_pls_1[-1:]': '',
            'word_pls_1[:3]': '',
            'word_pls_1[:2]': '',
            'word_pls_1[:1]': '',
            'word_pls_1.islower()': False,
            'word_pls_1.istitle()': False,
            'postag_pls_1': '',
            
            'word_min_1': '',
            'word_min_1[-3:]': '',
            'word_min_1[-2:]': '',
            'word_min_1[-1:]': '',
            'word_min_1[:3]': '',
            'word_min_1[:2]': '',
            'word_min_1[:1]': '',
            'word_min_1.islower()': False,
            'word_min_1.istitle()': False,
            'postag_min_1': '',
            
            'BOS': True if i == 0 else False,
            'EOS': True if i == len(sent)-1 else False,
        }
        
        
        if i < len(sent)-1:
            word_pls_1 = sent[i+1][1]
            postag_pls_1 = sent[i+1][2]
            features.update({
                'word_pls_1': word_pls_1,
                'word_pls_1[-3:]': word_pls_1[-3:],
                'word_pls_1[-2:]': word_pls_1[-2:],
                'word_pls_1[-1:]': word_pls_1[-1:],
                'word_pls_1[:3]': word_pls_1[:3],
                'word_pls_1[:2]': word_pls_1[:2],
                'word_pls_1[:1]': word_pls_1[:1],
                'word_pls_1.islower()': word_pls_1.islower(),
                'word_pls_1.istitle()': word_pls_1.istitle(),
                'postag_pls_1': postag_pls_1
            })
        if i > 0:
            word_min_1 = sent[i-1][1]
            postag_min_1 = sent[i-1][2]
            features.update({
                'word_min_1': word_min_1,
                'word_min_1[-3:]': word_min_1[-3:],
                'word_min_1[-2:]': word_min_1[-2:],
                'word_min_1[-1:]': word_min_1[-1:],
                'word_min_1[:3]': word_min_1[:3],
                'word_min_1[:2]': word_min_1[:2],
                'word_min_1[:1]': word_min_1[:1],
                'word_min_1.islower()': word_min_1.islower(),
                'word_min_1.istitle()': word_min_1.istitle(),
                'postag_min_1': postag_min_1
            })
        
        return features

def sent2features(sent):  
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
#     print(sent)
    return [label for title, token, postag, label in sent]

def sent2labelspred(sent):
    return [pred for title, token, postag, label, pred in sent]

In [8]:
# Training
X_train = [x for s in train_sents for x in sent2features(s)]
y_train = [y for s in train_sents for y in sent2labels(s)]

# Test
X_test = [x for s in test_sents for x in sent2features(s)]
y_test = [y for s in test_sents for y in sent2labels(s)]

In [9]:
classes = ['B-LOC', 'B-PER', 'I-LOC', 'I-PER', 'O']
new_classes = classes.copy()
new_classes.pop()

'O'

In [10]:
pd.DataFrame(X_train)

Unnamed: 0,BOS,EOS,bias,postag,postag_min_1,postag_pls_1,word,word.islower(),word.istitle(),word[-1:],...,word_min_1[:3],word_pls_1,word_pls_1.islower(),word_pls_1.istitle(),word_pls_1[-1:],word_pls_1[-2:],word_pls_1[-3:],word_pls_1[:1],word_pls_1[:2],word_pls_1[:3]
0,True,False,1.0,ADV,,VERB,Telah,False,True,h,...,,menceritakan,True,False,n,an,kan,m,me,men
1,False,False,1.0,VERB,ADV,ADP,menceritakan,True,False,n,...,Tel,kepada,True,False,a,da,ada,k,ke,kep
2,False,False,1.0,ADP,VERB,PRON,kepada,True,False,a,...,men,kami,True,False,i,mi,ami,k,ka,kam
3,False,False,1.0,PRON,ADP,PROPN,kami,True,False,i,...,kep,Abdullah,False,True,h,ah,lah,A,Ab,Abd
4,False,False,1.0,PROPN,PRON,PROPN,Abdullah,False,True,h,...,kam,bin,True,False,n,in,bin,b,bi,bin
5,False,False,1.0,PROPN,PROPN,PROPN,bin,True,False,n,...,Abd,Maslamah,False,True,h,ah,mah,M,Ma,Mas
6,False,False,1.0,PROPN,PROPN,PROPN,Maslamah,False,True,h,...,bin,bin,True,False,n,in,bin,b,bi,bin
7,False,False,1.0,PROPN,PROPN,PROPN,bin,True,False,n,...,Mas,Qa'nab,False,False,b,ab,nab,Q,Qa,Qa'
8,False,False,1.0,PROPN,PROPN,PROPN,Qa'nab,False,False,b,...,bin,al,True,False,l,al,al,a,al,al
9,False,False,1.0,PROPN,PROPN,PROPN,al,True,False,l,...,Qa',Qa'nabi,False,False,i,bi,abi,Q,Qa,Qa'


## Training Model

In [11]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LinearSVC(multi_class='ovr'))
])

clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [12]:
print("Accuracy of Training:")
print(clf.score(X_train, y_train))
print("Accuracy of Validation:")
print(clf.score(X_test, y_test))

type(skl_met(y_test, clf.predict(X_test), labels=new_classes))

Accuracy of Training:
0.9966419145816582
Accuracy of Validation:
0.9869247922156903


str

In [13]:
df_result = df_test.copy()
df_result['predicted'] = clf.predict(X_test)
df_result_grouped = df_result.groupby('title')

y_pred = df_result_grouped['predicted']
y_true = df_result_grouped['entity']

y_grouped_pred = [p[1].tolist() for p in y_pred]
y_grouped_true = [p[1].tolist() for p in y_true]


# print(df_result_grouped)
print(seq_met(y_grouped_true, y_grouped_pred))
# ne_per = 2251
# ne_loc = 41
print("Training Data")
print("Jumlah Hadits: ", 140)
print("Jumlah Entitas PER: ", 1572)
print("Jumlah Entitas LOC: ", 33)
print("\n")
print("Test Data")
print("Jumlah Hadits: ", 60)
print("Jumlah Entitas PER: ", 679)
print("Jumlah Entitas LOC: ", 8)


             precision    recall  f1-score   support

        PER       0.88      0.91      0.89       678
        LOC       0.27      0.38      0.32         8

avg / total       0.87      0.91      0.89       686

Training Data
Jumlah Hadits:  140
Jumlah Entitas PER:  1572
Jumlah Entitas LOC:  33


Test Data
Jumlah Hadits:  60
Jumlah Entitas PER:  679
Jumlah Entitas LOC:  8


In [14]:
def get_predicted_sent(data):
    func = lambda s: [(s.name, t, p) for t, p in zip(s['token'].values.tolist(),
                                                     s['predicted'].values.tolist())]
    groupedx = data.groupby('title').apply(func)
    sents = [g for g in groupedx]
    
    return sents

In [15]:
from pandas import ExcelWriter

pred_sents = get_predicted_sent(df_result)
writer = ExcelWriter('Datasets/result_tes.xlsx')
df_result.to_excel(writer, 'result', index=False)
writer.save()

def names_by_hadith(data):
    titl = {}
    sent_nes = []

    for sent in data:
        for i in range(len(sent)):
            if sent[i][-1] == 'B-PER':
                ne = []
                while sent[i][-1] != 'O':
                    ne.append(sent[i][1])
                    if i<len(sent)-1:
                        i+=1
                    else:
                        break
                str_name = "".join([i+" " if not i.startswith("'") and i not in string.punctuation else i for i in ne]).strip()
                if str_name not in sent_nes:
                    sent_nes.append(str_name)
        titl[sent[0][0]] = sent_nes.copy()
        sent_nes.clear()
    return titl

# pprint(names_by_hadith(pred_sents))

In [16]:
df_result = df_test.copy()
df_result['predicted'] = clf.predict(X_test)

In [17]:
# df_result

In [18]:
writer = ExcelWriter('res.xlsx')
df_result.to_excel(writer,'Sheet1',index=False)
writer.save()

In [19]:
# print(df_test['title'].unique())

In [20]:
df_train_cp = df_train.copy()
df_train_cp['predicted'] = clf.predict(X_train)

y_pred_tr = df_train_cp.groupby('title')['predicted']
y_true_tr = df_train_cp.groupby('title')['entity']

y_grouped_pred_tr = [p[1].tolist() for p in y_pred_tr]
y_grouped_true_tr = [p[1].tolist() for p in y_true_tr]

print(seq_met(y_grouped_true_tr, y_grouped_pred_tr))

             precision    recall  f1-score   support

        PER       0.97      0.98      0.98      1572
        LOC       0.96      0.82      0.89        33

avg / total       0.97      0.98      0.98      1605

