In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from seqeval.metrics import classification_report as seq_met
from sklearn.metrics import classification_report as skl_met
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Read Data from File

In [2]:
df = pd.read_csv("Datasets/100_dataset - 100.csv", encoding = "ISO-8859-1")
df.head()

Unnamed: 0,title,token,pos,entity
0,Hadits Muslim Nomor 1,Dan,CONJ,O
1,Hadits Muslim Nomor 1,ia,PRON,O
2,Hadits Muslim Nomor 1,merupakan,VERB,O
3,Hadits Muslim Nomor 1,atsar,NOUN,O
4,Hadits Muslim Nomor 1,yang,PRON,O


In [3]:
df.groupby('entity').size().reset_index(name='counts')

Unnamed: 0,entity,counts
0,B-LOC,38
1,B-PER,1125
2,I-LOC,9
3,I-PER,2330
4,O,10367


# Preprocess

In [4]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent= 1
        self.data = data
        self.empty = False
#         agg_func_x = lambda s: [(s.name, t, p, e, pr) for t, p, e, pr in zip(s['token'].values.tolist(), 
#                                                            s['pos'].values.tolist(), 
#                                                            s['entity'].values.tolist(), 
#                                                            s['predicted_entity'].values.tolist())]
        
        agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['token'].values.tolist(), 
                                                           s['pos'].values.tolist(), 
                                                           s['entity'].values.tolist())]
        
        self.grouped = self.data.groupby('title').apply(agg_func)
#         self.grouped_x = self.data.groupby('title').apply(agg_func_x)
        
        self.sentences = [s for s in self.grouped]
#         self.sentences_x = [s for s in self.grouped_x]

In [5]:
def word2features(sent, i):
        word = sent[i][1]
        postag = sent[i][2]

        features = {
            'bias': 1.0, 
#             'word.lower()': word.lower(), 
            'word': word,
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word[-1:]': word[-1:],
            'word[:3]': word[:3],
            'word[:2]': word[:2],
            'word[:1]': word[:1],
            
#             'word.isupper()': word.isupper(),
            'word.islower()': word.islower(),
            'word.istitle()': word.istitle(),
#             'word.isdigit()': word.isdigit(),
            'postag': postag,
            
#             'word_pls_1.lower()': '', 
            'word_pls_1': '',
            'word_pls_1[-3:]': '',
            'word_pls_1[-2:]': '',
            'word_pls_1[-1:]': '',
            'word_pls_1[:3]': '',
            'word_pls_1[:2]': '',
            'word_pls_1[:1]': '',
#             'word_pls_1.isupper()': False,
            'word_pls_1.islower()': False,
            'word_pls_1.istitle()': False,
#             'word_pls_1.isdigit()': False,
            'postag_pls_1': '',
            
#             'word_min_1.lower()': '', 
            'word_min_1': '',
            'word_min_1[-3:]': '',
            'word_min_1[-2:]': '',
            'word_min_1[-1:]': '',
            'word_min_1[:3]': '',
            'word_min_1[:2]': '',
            'word_min_1[:1]': '',
#             'word_min_1.isupper()': False,
            'word_min_1.islower()': False,
            'word_min_1.istitle()': False,
#             'word_min_1.isdigit()': False,
            'postag_min_1': '',
            
            'BOS': True if i == 0 else False,
            'EOS': True if i == len(sent)-1 else False,
        }
        
        
        if i < len(sent)-1:
            word_pls_1 = sent[i+1][1]
            postag_pls_1 = sent[i+1][2]
            features.update({
#                 'word_pls_1.lower()': word_pls_1.lower(), 
                'word_pls_1': word_pls_1,
                'word_pls_1[-3:]': word_pls_1[-3:],
                'word_pls_1[-2:]': word_pls_1[-2:],
                'word_pls_1[-1:]': word_pls_1[-1:],
                'word_pls_1[:3]': word_pls_1[:3],
                'word_pls_1[:2]': word_pls_1[:2],
                'word_pls_1[:1]': word_pls_1[:1],
#                 'word_pls_1.isupper()': word_pls_1.isupper(),
                'word_pls_1.islower()': word_pls_1.islower(),
                'word_pls_1.istitle()': word_pls_1.istitle(),
#                 'word_pls_1.isdigit()': word_pls_1.isdigit(),
                'postag_pls_1': postag_pls_1
            })
        if i > 0:
            word_min_1 = sent[i-1][1]
            postag_min_1 = sent[i-1][2]
            features.update({
#                 'word_min_1.lower()': word_min_1.lower(), 
                'word_min_1': word_min_1,
                'word_min_1[-3:]': word_min_1[-3:],
                'word_min_1[-2:]': word_min_1[-2:],
                'word_min_1[-1:]': word_min_1[-1:],
                'word_min_1[:3]': word_min_1[:3],
                'word_min_1[:2]': word_min_1[:2],
                'word_min_1[:1]': word_min_1[:1],
#                 'word_min_1.isupper()': word_min_1.isupper(),
                'word_min_1.islower()': word_min_1.islower(),
                'word_min_1.istitle()': word_min_1.istitle(),
#                 'word_min_1.isdigit()': word_min_1.isdigit(),
                'postag_min_1': postag_min_1
            })
        
        return features

def sent2features(sent):  
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for title, token, postag, label in sent]

# def sent2labels(sent):
#     return [label for title, token, postag, label, pred in sent]

def sent2labelspred(sent):
    return [pred for title, token, postag, label, pred in sent]

In [6]:
# get formated sentences from dataset
getter = SentenceGetter(df)
sentences = getter.sentences

In [29]:
sentences

[[('Hadits Abu Daud Nomor 1', 'Telah', 'ADV', 'O'),
  ('Hadits Abu Daud Nomor 1', 'menceritakan', 'VERB', 'O'),
  ('Hadits Abu Daud Nomor 1', 'kepada', 'ADP', 'O'),
  ('Hadits Abu Daud Nomor 1', 'kami', 'PRON', 'O'),
  ('Hadits Abu Daud Nomor 1', 'Abdullah', 'PROPN', 'B-PER'),
  ('Hadits Abu Daud Nomor 1', 'bin', 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', 'Maslamah', 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', 'bin', 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', "Qa'nab", 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', 'al', 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', "Qa'nabi", 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', 'telah', 'ADV', 'O'),
  ('Hadits Abu Daud Nomor 1', 'menceritakan', 'VERB', 'O'),
  ('Hadits Abu Daud Nomor 1', 'kepada', 'ADP', 'O'),
  ('Hadits Abu Daud Nomor 1', 'kami', 'PRON', 'O'),
  ('Hadits Abu Daud Nomor 1', 'Abdul', 'PROPN', 'B-PER'),
  ('Hadits Abu Daud Nomor 1', 'Aziz', 'PROPN', 'I-PER'),
  ('Hadits Abu Daud Nomor 1', 'yakni', 'ADV

# Split train and test data

In [7]:
# Split train and test
X = [x for s in sentences for x in sent2features(s)]
y = [x for s in sentences for x in sent2labels(s)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=False)

In [27]:
X[0]

{'bias': 1.0,
 'word': 'Telah',
 'word[-3:]': 'lah',
 'word[-2:]': 'ah',
 'word[-1:]': 'h',
 'word[:3]': 'Tel',
 'word[:2]': 'Te',
 'word[:1]': 'T',
 'word.islower()': False,
 'word.istitle()': True,
 'postag': 'ADV',
 'word_pls_1': 'menceritakan',
 'word_pls_1[-3:]': 'kan',
 'word_pls_1[-2:]': 'an',
 'word_pls_1[-1:]': 'n',
 'word_pls_1[:3]': 'men',
 'word_pls_1[:2]': 'me',
 'word_pls_1[:1]': 'm',
 'word_pls_1.islower()': True,
 'word_pls_1.istitle()': False,
 'postag_pls_1': 'VERB',
 'word_min_1': '',
 'word_min_1[-3:]': '',
 'word_min_1[-2:]': '',
 'word_min_1[-1:]': '',
 'word_min_1[:3]': '',
 'word_min_1[:2]': '',
 'word_min_1[:1]': '',
 'word_min_1.islower()': False,
 'word_min_1.istitle()': False,
 'postag_min_1': '',
 'BOS': True,
 'EOS': False}

In [8]:
xx = df["token"]
yy = df["entity"]

xx_train, xx_test, yy_train, yy_test = train_test_split(xx, yy, test_size=0.33, random_state=0, shuffle=False)

# X_test

In [9]:
classes = np.unique(y)
classes = classes.tolist()

# remove the most common tag 'O'(other)
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-LOC', 'B-PER', 'I-LOC', 'I-PER']

# Training Model

In [10]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LinearSVC(multi_class='ovr'))
])

# clf = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('classifier', SVC(kernel='poly', degree=3, C=1.0))
# ])

clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

# Evaluation

In [11]:
print("Accuracy of Training:")
print(clf.score(X_train, y_train))
print("Accuracy of Validation:")
print(clf.score(X_test, y_test))

print(skl_met(y_test, clf.predict(X_test), labels=new_classes))

Accuracy of Training:
0.9994619027120103
Accuracy of Validation:
0.9724710509067075
              precision    recall  f1-score   support

       B-LOC       0.50      0.67      0.57         6
       B-PER       0.94      0.93      0.94       445
       I-LOC       0.00      0.00      0.00         0
       I-PER       0.96      0.94      0.95       996

   micro avg       0.95      0.94      0.95      1447
   macro avg       0.60      0.63      0.61      1447
weighted avg       0.96      0.94      0.95      1447



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [12]:
y_predict = clf.predict(X_test)
evaluation = pd.DataFrame({'real': y_test, 'prediction': y_predict})
wrong_pred = []
correct_pred = []

for i in range(len(evaluation["real"])):
    if evaluation["real"][i] != evaluation["prediction"][i]:
        wrong_pred.append((i, y_predict[i], y_test[i]))
    else:
        correct_pred.append((i, y_predict[i], y_test[i]))
print("Prediksi Salah:")
print(len(wrong_pred))

Prediksi Salah:
126


# Manual testing with data test

In [13]:
df_test = pd.read_csv("Datasets/100_test_2.csv", encoding = "ISO-8859-1")
df_test.head()

Unnamed: 0,title,token,pos,entity
0,Hadits Muslim Nomor 10,',PUNCT,O
1,Hadits Muslim Nomor 10,Dia,PRON,O
2,Hadits Muslim Nomor 10,bertanya,VERB,O
3,Hadits Muslim Nomor 10,lagi,ADV,O
4,Hadits Muslim Nomor 10,',PUNCT,O


In [14]:
class SentenceGetterX(object):
    def __init__(self, data):
        self.n_sent= 1
        self.data = data
        self.empty = False
        agg_func_x = lambda s: [(s.name, t, p, e, pr) for t, p, e, pr in zip(s['token'].values.tolist(), 
                                                           s['pos'].values.tolist(), 
                                                           s['entity'].values.tolist(), 
                                                           s['predicted_entity'].values.tolist())]
        
#         agg_func = lambda s: [(s.name, t, p, e) for t, p, e in zip(s['token'].values.tolist(), 
#                                                            s['pos'].values.tolist(), 
#                                                            s['entity'].values.tolist())]
        
#         self.grouped = self.data.groupby('title').apply(agg_func)
        self.grouped_x = self.data.groupby('title').apply(agg_func_x)
        
#         self.sentences = [s for s in self.grouped]
        self.sentences_x = [s for s in self.grouped_x]

In [15]:
# get formated sentences from dataset
getter_test = SentenceGetter(df_test)
sentences_test = getter_test.sentences

In [25]:
sentences_test

[[('Hadits Muslim Nomor 10', "'", 'PUNCT', 'O'),
  ('Hadits Muslim Nomor 10', 'Dia', 'PRON', 'O'),
  ('Hadits Muslim Nomor 10', 'bertanya', 'VERB', 'O'),
  ('Hadits Muslim Nomor 10', 'lagi', 'ADV', 'O'),
  ('Hadits Muslim Nomor 10', "'", 'PUNCT', 'O'),
  ('Hadits Muslim Nomor 10', 'Wahai', 'PROPN', 'O'),
  ('Hadits Muslim Nomor 10', 'Rasulullah', 'PROPN', 'B-PER'),
  ('Hadits Muslim Nomor 10', 'apakah', 'ADV', 'O'),
  ('Hadits Muslim Nomor 10', 'ihsan', 'NOUN', 'O'),
  ('Hadits Muslim Nomor 10', 'itu', 'DET', 'O'),
  ('Hadits Muslim Nomor 10', '?', 'PUNCT', 'O'),
  ('Hadits Muslim Nomor 10', "'", 'PUNCT', 'O'),
  ('Hadits Muslim Nomor 10', 'Beliau', 'PRON', 'O'),
  ('Hadits Muslim Nomor 10', 'menjawab', 'VERB', 'O'),
  ('Hadits Muslim Nomor 10', "'", 'PUNCT', 'O'),
  ('Hadits Muslim Nomor 10', 'Kamu', 'PROPN', 'O'),
  ('Hadits Muslim Nomor 10', 'menyembah', 'VERB', 'O'),
  ('Hadits Muslim Nomor 10', 'Allah', 'PROPN', 'O'),
  ('Hadits Muslim Nomor 10', 'seakan', 'ADV', 'O'),
  ('Hadits 

In [16]:
X_manual_test = [x for s in sentences_test for x in sent2features(s)]
predict = clf.predict(X_manual_test)

In [17]:
df_test['predicted_entity'] = predict
df_test.to_csv('Datasets/predicted.csv', encoding='utf-8')

In [18]:
getter_test = SentenceGetterX(df_test)
sentences_test_baru = getter_test.sentences_x

In [19]:
def sent2labelsx(sent):
    return [label for title, token, postag, label, pred in sent]

def sent2labelspredx(sent):
    return [pred for title, token, postag, label, pred in sent]

In [20]:
# X_manual_test = [sent2features(s) for s in sentences_test]
y_true_grouped = [sent2labelsx(s) for s in sentences_test_baru]
y_pred_grouped = [sent2labelspredx(s) for s in sentences_test_baru]

y_true_ungrouped = [x for s in sentences_test_baru for x in sent2labelsx(s)]
y_pred_ungrouped = [x for s in sentences_test_baru for x in sent2labelspred(s)]

len(y_pred_ungrouped)

# print(y_manual_test)

4591

# Grouping token by Hadith

In [21]:
# X_manual_grouped = [x for s in sentences_test for x in sent2features(s)]
y_manual_grouped = [x for s in sentences_test_baru for x in sent2labelsx(s)]

In [22]:
# print(classification_report(y_manual_test, y_pred_manual_test))

print("PER ENTITAS")
print(seq_met(y_true_grouped, y_pred_grouped))
print("PER TOKEN")
print(skl_met(y_true_ungrouped, y_pred_ungrouped, labels=new_classes))

PER ENTITAS
             precision    recall  f1-score   support

        PER       0.79      0.83      0.81       447
        LOC       0.44      0.67      0.53         6

avg / total       0.78      0.83      0.80       453

PER TOKEN
              precision    recall  f1-score   support

       B-LOC       0.44      0.67      0.53         6
       B-PER       0.94      0.93      0.94       446
       I-LOC       0.00      0.00      0.00         0
       I-PER       0.96      0.94      0.95       996

   micro avg       0.95      0.94      0.95      1448
   macro avg       0.59      0.64      0.61      1448
weighted avg       0.95      0.94      0.95      1448



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [23]:
zz = [1,2,3,4,5]

zz[-3:]

[3, 4, 5]