In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from seqeval.metrics import classification_report as seq_met
from sklearn.metrics import classification_report as skl_met
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

## Feature Extraction

In [2]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent= 1
        self.data = data
        self.empty = False
#         agg_func_x = lambda s: [(s.name, t, p, e, pr) for t, p, e, pr in zip(s['token'].values.tolist(), 
#                                                            s['pos'].values.tolist(), 
#                                                            s['entity'].values.tolist(), 
#                                                            s['predicted_entity'].values.tolist())]
        
        agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['token'].values.tolist(), 
                                                           s['pos'].values.tolist(), 
                                                           s['entity'].values.tolist())]
        
        self.grouped = self.data.groupby('title').apply(agg_func)
#         self.grouped_x = self.data.groupby('title').apply(agg_func_x)
        
        self.sentences = [s for s in self.grouped]
#         self.sentences_x = [s for s in self.grouped_x]

In [3]:
def word2features(sent, i):
        word = sent[i][1]
        postag = sent[i][2]

        features = {
            'bias': 1.0, 
#             'word.lower()': word.lower(), 
            'word': word,
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word[-1:]': word[-1:],
            'word[:3]': word[:3],
            'word[:2]': word[:2],
            'word[:1]': word[:1],
            
#             'word.isupper()': word.isupper(),
            'word.islower()': word.islower(),
            'word.istitle()': word.istitle(),
#             'word.isdigit()': word.isdigit(),
            'postag': postag,
            
#             'word_pls_1.lower()': '', 
            'word_pls_1': '',
            'word_pls_1[-3:]': '',
            'word_pls_1[-2:]': '',
            'word_pls_1[-1:]': '',
            'word_pls_1[:3]': '',
            'word_pls_1[:2]': '',
            'word_pls_1[:1]': '',
#             'word_pls_1.isupper()': False,
            'word_pls_1.islower()': False,
            'word_pls_1.istitle()': False,
#             'word_pls_1.isdigit()': False,
            'postag_pls_1': '',
            
#             'word_min_1.lower()': '', 
            'word_min_1': '',
            'word_min_1[-3:]': '',
            'word_min_1[-2:]': '',
            'word_min_1[-1:]': '',
            'word_min_1[:3]': '',
            'word_min_1[:2]': '',
            'word_min_1[:1]': '',
#             'word_min_1.isupper()': False,
            'word_min_1.islower()': False,
            'word_min_1.istitle()': False,
#             'word_min_1.isdigit()': False,
            'postag_min_1': '',
            
            'BOS': True if i == 0 else False,
            'EOS': True if i == len(sent)-1 else False,
        }
        
        
        if i < len(sent)-1:
            word_pls_1 = sent[i+1][1]
            postag_pls_1 = sent[i+1][2]
            features.update({
#                 'word_pls_1.lower()': word_pls_1.lower(), 
                'word_pls_1': word_pls_1,
                'word_pls_1[-3:]': word_pls_1[-3:],
                'word_pls_1[-2:]': word_pls_1[-2:],
                'word_pls_1[-1:]': word_pls_1[-1:],
                'word_pls_1[:3]': word_pls_1[:3],
                'word_pls_1[:2]': word_pls_1[:2],
                'word_pls_1[:1]': word_pls_1[:1],
#                 'word_pls_1.isupper()': word_pls_1.isupper(),
                'word_pls_1.islower()': word_pls_1.islower(),
                'word_pls_1.istitle()': word_pls_1.istitle(),
#                 'word_pls_1.isdigit()': word_pls_1.isdigit(),
                'postag_pls_1': postag_pls_1
            })
        if i > 0:
            word_min_1 = sent[i-1][1]
            postag_min_1 = sent[i-1][2]
            features.update({
#                 'word_min_1.lower()': word_min_1.lower(), 
                'word_min_1': word_min_1,
                'word_min_1[-3:]': word_min_1[-3:],
                'word_min_1[-2:]': word_min_1[-2:],
                'word_min_1[-1:]': word_min_1[-1:],
                'word_min_1[:3]': word_min_1[:3],
                'word_min_1[:2]': word_min_1[:2],
                'word_min_1[:1]': word_min_1[:1],
#                 'word_min_1.isupper()': word_min_1.isupper(),
                'word_min_1.islower()': word_min_1.islower(),
                'word_min_1.istitle()': word_min_1.istitle(),
#                 'word_min_1.isdigit()': word_min_1.isdigit(),
                'postag_min_1': postag_min_1
            })
        
        return features

def sent2features(sent):  
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for title, token, postag, label in sent]

# def sent2labels(sent):
#     return [label for title, token, postag, label, pred in sent]

def sent2labelspred(sent):
    return [pred for title, token, postag, label, pred in sent]

In [4]:
df = pd.read_excel("Datasets/100_merged.xlsx")
df.head()

Unnamed: 0,title,token,pos,entity
0,Hadits Abu Daud Nomor 1,Telah,ADV,O
1,Hadits Abu Daud Nomor 1,menceritakan,VERB,O
2,Hadits Abu Daud Nomor 1,kepada,ADP,O
3,Hadits Abu Daud Nomor 1,kami,PRON,O
4,Hadits Abu Daud Nomor 1,Abdullah,PROPN,B-PER


In [5]:
df.groupby('entity').size().reset_index(name='counts')

Unnamed: 0,entity,counts
0,B-LOC,30
1,B-PER,2239
2,I-LOC,24
3,I-PER,4460
4,O,24254


In [6]:
df.index[df['entity'] == 'B'].tolist()

[]

In [7]:
# get formated sentences from dataset
getter = SentenceGetter(df)
sentences = getter.sentences

In [8]:
# Split train and test
X = [x for s in sentences for x in sent2features(s)]
y = [x for s in sentences for x in sent2labels(s)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=False)

In [9]:
classes = np.unique(y)
classes = classes.tolist()

# remove the most common tag 'O'(other)
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-LOC', 'B-PER', 'I-LOC', 'I-PER', 'O']

In [10]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LinearSVC(multi_class='ovr'))
])

clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [11]:
print("Accuracy of Training:")
print(clf.score(X_train, y_train))
print("Accuracy of Validation:")
print(clf.score(X_test, y_test))

print(skl_met(y_test, clf.predict(X_test), labels=new_classes))

Accuracy of Training:
0.9966787003610108
Accuracy of Validation:
0.9777191439460569
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         3
       B-PER       0.95      0.94      0.94       936
       I-LOC       0.00      0.00      0.00         2
       I-PER       0.97      0.94      0.95      1903
           O       0.98      0.99      0.99      7389

   micro avg       0.98      0.98      0.98     10233
   macro avg       0.58      0.57      0.58     10233
weighted avg       0.98      0.98      0.98     10233



In [12]:
df_test = pd.read_excel("Datasets/100_merged_test.xlsx")
df_test.head()

Unnamed: 0,title,token,pos,entity
0,Hadits Muslim Nomor 15,-,PUNCT,I-PER
1,Hadits Muslim Nomor 15,Ahwash,NUM,I-PER
2,Hadits Muslim Nomor 15,dari,ADP,O
3,Hadits Muslim Nomor 15,Abu,PROPN,B-PER
4,Hadits Muslim Nomor 15,Ishaq,PROPN,I-PER


In [13]:
class SentenceGetterX(object):
    def __init__(self, data):
        self.n_sent= 1
        self.data = data
        self.empty = False
        agg_func_x = lambda s: [(s.name, t, p, e, pr) for t, p, e, pr in zip(s['token'].values.tolist(), 
                                                           s['pos'].values.tolist(), 
                                                           s['entity'].values.tolist(), 
                                                           s['predicted_entity'].values.tolist())]
        
#         agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['token'].values.tolist(), 
#                                                            s['pos'].values.tolist(), 
#                                                            s['entity'].values.tolist())]
        
#         self.grouped = self.data.groupby('title').apply(agg_func)
        self.grouped_x = self.data.groupby('title').apply(agg_func_x)
        
#         self.sentences = [s for s in self.grouped]
        self.sentences_x = [s for s in self.grouped_x]

In [14]:
# get formated sentences from dataset
getter_test = SentenceGetter(df_test)
sentences_test = getter_test.sentences

In [15]:
X_manual_test = [x for s in sentences_test for x in sent2features(s)]
predict = clf.predict(X_manual_test)

In [16]:
df_test['predicted_entity'] = predict
df_test.to_excel('Datasets/predicted.xlsx', encoding='utf-8')

In [17]:
getter_test = SentenceGetterX(df_test)
sentences_test_baru = getter_test.sentences_x

In [18]:
def sent2labelsx(sent):
    return [label for title, token, postag, label, pred in sent]

def sent2labelspredx(sent):
    return [pred for title, token, postag, label, pred in sent]

In [19]:
y_true_grouped = [sent2labelsx(s) for s in sentences_test_baru]
y_pred_grouped = [sent2labelspredx(s) for s in sentences_test_baru]

y_true_ungrouped = [x for s in sentences_test_baru for x in sent2labelsx(s)]
y_pred_ungrouped = [x for s in sentences_test_baru for x in sent2labelspred(s)]

y_pred_grouped[0]

['O',
 'I-PER',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'B-PER',
 'I-PER',
 'I-PER',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [20]:
y_manual_grouped = [x for s in sentences_test_baru for x in sent2labelsx(s)]

In [21]:
print("PER ENTITAS")
print(seq_met(y_true_grouped, y_pred_grouped))
print("PER TOKEN")
print(skl_met(y_true_ungrouped, y_pred_ungrouped, labels=new_classes))

PER ENTITAS
             precision    recall  f1-score   support

        PER       0.83      0.86      0.85       893
        LOC       0.00      0.00      0.00         4

avg / total       0.83      0.86      0.84       897

PER TOKEN
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         3
       B-PER       0.95      0.93      0.94       889
       I-LOC       0.00      0.00      0.00         2
       I-PER       0.98      0.94      0.96      1792
           O       0.98      0.99      0.99      6618

   micro avg       0.98      0.98      0.98      9304
   macro avg       0.58      0.57      0.58      9304
weighted avg       0.98      0.98      0.98      9304

