In [3]:
!pip install pyprind
!pip install sklearn-crfsuite
!pip install utils

Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/ab/b3/1f12ebc5009c65b607509393ad98240728b4401bc3593868fb161fdd3760/PyPrind-2.11.3-py2.py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3
Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 6.8MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6
Collecting utils
  Downloading https://files.pythonhosted.org/packages/55/e6/c2d2b2703e7debc8b501caae0e6f7ead148fd0faa3c8131292a599930029/u

In [4]:
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
import scipy
from pathlib import Path

from joblib import dump, load
import pandas as pd


In [5]:
import base64
import requests
req = requests.get('https://raw.githubusercontent.com/dfaizaditya/Data/master/Indonesian_Manually_Tagged_Corpus.tsv.txt')
tagged_sentences = req.text

In [6]:
import csv
tagged_sentences = pd.read_csv('https://raw.githubusercontent.com/dfaizaditya/Data/master/Indonesian_Manually_Tagged_Corpus.tsv.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
tagged_sentences.columns = ["token", "tag"]

# Post tag : menambahkan fitur dengan melakukan postag ke train data

In [7]:
tagged_sentences

Unnamed: 0,token,tag
0,Kera,NN
1,untuk,SC
2,amankan,VB
3,pesta olahraga,NN
4,Pemerintah,NNP
...,...,...
256617,",",Z
256618,ujar,VB
256619,-nya,PRP
256620,menambahkan,VB


In [8]:
lol = tagged_sentences.values.tolist()
print("Number of Tagged Sentences ", len(tagged_sentences))
print("Total Number of Tagged words", len(lol))
vocab = set([word for word, tag in lol])
print("Vocabulary of the Corpus", len(vocab))
tags = set([tag for word, tag in lol])
print("Number of Tags in the Corpus ", len(tags))

Number of Tagged Sentences  256622
Total Number of Tagged words 256622
Vocabulary of the Corpus 18287
Number of Tags in the Corpus  24


In [9]:
from collections import Counter 

Counter([tag for word, tag in lol])

Counter({'CC': 7438,
         'CD': 17819,
         'DT': 381,
         'FW': 2365,
         'IN': 21311,
         'JJ': 9724,
         'MD': 5248,
         'NEG': 1520,
         'NN': 61940,
         'NND': 1414,
         'NNP': 34649,
         'OD': 738,
         'PR': 5348,
         'PRP': 7583,
         'RB': 4903,
         'RP': 183,
         'SC': 13080,
         'SYM': 2210,
         'UH': 30,
         'VB': 31733,
         'WH': 260,
         'X': 397,
         'Z': 26347,
         'fw': 1})

In [10]:
lst = []
lst_lst = []

for x in lol:
  lst.append(x)
  if(x[0] == '.'):
    lst_lst.append(lst)
    lst = []

In [11]:
import re
from sklearn_crfsuite import CRF
from joblib import dump, load
from sklearn.model_selection import train_test_split 

def prepare_train_test(tagged_sentences, seed, path):
    train_set, test_set = train_test_split(tagged_sentences, test_size=0.2, random_state=seed)
    prefix, suffix = select_params(path)
    X_train, y_train = prepare_data(train_set, prefix=prefix, suffix=suffix)
    X_test, y_test = prepare_data(test_set, prefix=prefix, suffix=suffix)
    return X_train, y_train, X_test, y_test

def features(sentence, index, prefix=True, suffix=True):
    ### sentence is of the form [w1,w2,w3,..], index is the position of the word in the sentence
    ### and entriestoRemove type is tuple
    d = {
        'is_first_capital':int(sentence[index][0].isupper()),
        'is_first_word': int(index == 0),
        'is_last_word':int(index == len(sentence)-1),
        'is_complete_capital': int(sentence[index].upper() == sentence[index]),
        'prev_word':'' if index == 0 else sentence[index-1],
        'next_word':'' if index == len(sentence)-1 else sentence[index+1],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])', sentence[index])))),
        'word_has_hyphen': 1 if '-' in sentence[index] else 0 
         }
    
    if prefix:
        d.update({'prefix_1':sentence[index][0],
                 'prefix_2': sentence[index][:2],
                 'prefix_3':sentence[index][:3],
                 'prefix_4':sentence[index][:4]})
    
    if suffix:
        d.update({'suffix_1':sentence[index][-1],
                  'suffix_2':sentence[index][-2:],
                  'suffix_3':sentence[index][-3:],
                  'suffix_4':sentence[index][-4:]})
        
    return d
    
def untag(sentence):
      return [word for word, tag in sentence]

def prepare_data(tagged_sentences, prefix, suffix):
    X, y = [], []
    for sentences in tagged_sentences:
        X.append([features(untag(sentences), index, prefix, suffix) for index in range(len(sentences))])
        y.append([tag for word, tag in sentences])
    return X, y

def fit_and_dump(X_train, y_train, path):
    crf = CRF(
        algorithm='lbfgs',
        c1=0.01,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
        )
    crf.fit(X_train, y_train)
    dump(crf, path)
    return crf

def select_params(path):
    if 'baseline' in str(path):
        prefix, suffix = False, False
    elif 'prefix' in str(path):
        prefix, suffix = True, False
    elif 'suffix' in str(path):
        prefix, suffix = False, True
    else:
        prefix, suffix = True, True
    return prefix, suffix

In [12]:
from joblib import dump, load

path = '/content/drive/MyDrive/Data mining/crf-with-allfix-fold01-seed42.joblib'
crf = load(path)
seed = int(str(path).split('.')[0][-2:])
X_train, y_train, X_test, y_test = prepare_train_test(lst_lst, seed, path)
train_set, test_set = train_test_split(lst_lst, test_size=0.2, random_state=seed)

y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=crf.classes_, digits=3))

              precision    recall  f1-score   support

          IN      0.985     0.985     0.985      4160
          NN      0.988     0.989     0.988     12296
          JJ      0.982     0.980     0.981      1943
          PR      1.000     0.999     1.000      1068
           Z      0.998     1.000     0.999      5204
          CD      0.999     0.998     0.998      3538
         SYM      1.000     1.000     1.000       446
          VB      0.995     0.997     0.996      6352
          CC      0.995     0.989     0.992      1533
          RB      0.989     0.979     0.984       971
          MD      0.997     0.996     0.997      1034
         PRP      0.998     0.999     0.998      1522
         NNP      0.981     0.984     0.982      6990
          SC      0.976     0.983     0.979      2582
         NND      0.981     0.970     0.976       269
          FW      0.985     0.947     0.966       473
         NEG      1.000     1.000     1.000       326
           X      0.958    

In [13]:
# get sentence index for every tag
d = []
for index, sentence in enumerate(y_test):
    for tag in sentence:
        d.append((index, tag))

In [14]:
from sklearn.metrics import confusion_matrix

def post_tag(set):
    path = '/content/drive/MyDrive/Data mining/crf-with-allfix-fold01-seed42.joblib'
    crf = load(path)
    X_test, y_test = prepare_data(set, prefix=True, suffix=True)
    y_pred = crf.predict(X_test)
    words = [word for sent in set for word, _ in sent]
    return pd.DataFrame({'words': words, 'pos': y_pred[0], 'ne': y_test[0]})

In [15]:
e1 = [[('adik', 'a'),
  ('makan','a'),
  ('kentang','a'),
  ('canada','a'),
  ]]

In [16]:
post_tag(e1)

Unnamed: 0,words,pos,ne
0,adik,JJ,a
1,makan,VB,a
2,kentang,IN,a
3,canada,JJ,a


# NER

In [17]:
test_ner1 = ['Tengkoe Amir Hamzah yang bernama lengkap Tengkoe Amir Hamzah Pangeran Indra Poetera, atau lebih dikenal hanya dengan nama pena Amir Hamzah adalah sastrawan Indonesia angkatan Poedjangga Baroe dan Pahlawan Nasional Indonesia.',
'Lahir dari keluarga bangsawan Melayu Kesultanan Langkat di Sumatra Utara, ia dididik di Sumatra dan Jawa. Saat berguru di SMA di Surakarta sekitar 1930, Amir muda terlibat dengan gerakan nasionalis dan jatuh cinta dengan seorang teman sekolahnya, Ilik Soendari.',
'Bahkan setelah Amir melanjutkan studinya di sekolah hukum di Batavia (sekarang Jakarta) keduanya tetap dekat, hanya berpisah pada tahun 1937 ketika Amir dipanggil kembali ke Sumatra untuk menikahi putri sultan dan mengambil tanggung jawab di lingkungan keraton.',
'Meskipun tidak bahagia dengan pernikahannya, dia memenuhi tugas kekeratonannya.',
'Setelah Indonesia memproklamasikan kemerdekaannya pada tahun 1945, ia menjabat sebagai wakil pemerintah di Langkat.',
'Namun siapa nyana, pada tahun pertama negara Indonesia yang baru lahir, ia meninggal dalam peristiwa konflik sosial berdarah di Sumatra yang disulut oleh faksi dari Partai Komunis Indonesia dan dimakamkan di sebuah kuburan massal.']

test_ner2 = ['Lukisan Galileo menunjukkan bahwa ia pertama melihat Neptunus pada tanggal 28 Desember 1612 dan 27 Januari 1613.',
'Pada kedua hari tersebut, Galileo salah menganggap Neptunus sebagai sebuah bintang tetap ketika planet ini muncul sangat dekat—konjungsi—dengan Jupiter pada langit malam.',
'Ia tidak dianggap sebagai penemu Neptunus.',
'Pada masa pengamatan pertamanya bulan Desember 1612, Neptunus bersifat tetap di langit karena planet ini baru saja mengalami penghuluan pada hari itu.',
'Gerakan ke belakang ini terbentuk ketika orbit Bumi membawa Bumi melewati planet terluar.',
'Karena Neptunus baru saja memulai siklus penghuluan tahunannya, gerakan planet ini terlalu sulit dilacak menggunakan teleskop kecil Galileo.',
'pada Juli 2009, fisikawan Universitas Melbourne, David Jamieson mengumumkan adanya bukti baru yang menyatakan bahwa Galileo setidaknya sadar bahwa bintang yang ia amati telah berpindah relatif terhadap bintang tetap.']

test_merg = test_ner1 + test_ner2

In [18]:
import math
import warnings
import pyprind
import pandas as pd
import numpy as np

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.exceptions import UndefinedMetricWarning

# from model_plots import plot_learning_curve

In [20]:
ner_data = pd.read_csv('https://raw.githubusercontent.com/dfaizaditya/Data/master/train_data.tsv', sep='\t', header=None)
ner_data.columns = ["token", "ne"]
ner_data

Unnamed: 0,token,ne
0,Agus,O
1,digantikan,O
2,Gatot,O
3,Lupri,O
4,Jantomo,O
...,...,...
52936,poin,O
52937,metode,O
52938,penghitungan,O
52939,suara,O


In [21]:
# Explore the distribution of NE tags in the dataset
tag_distribution = ner_data.groupby("ne").size().reset_index(name='counts')
print(tag_distribution)

               ne  counts
0      B-LOCATION    1017
1  B-ORGANIZATION     822
2        B-PERSON    1491
3      I-LOCATION     588
4  I-ORGANIZATION     879
5        I-PERSON     901
6               O   44898


In [22]:
# Extract the useful classes (not 'O' or NaN values) as a list
classes = list(filter(lambda x: x not in ["O", np.nan], list(ner_data["ne"].unique())))

print(classes)

['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION', 'B-ORGANIZATION', 'I-ORGANIZATION']


In [23]:
# Create a sentences dictionary and an initial single sentence dictionary
sentences, sentence = [], []

# Create a progress bar
pbar = pyprind.ProgBar(len(ner_data))

# For each row in the NER data...
for index, row in ner_data.iterrows():
    # If the row is empty (no string in the token column)
    if type(row["token"]) != str:
        # If the current sentence is not empty, append it to the sentences and create a new sentence
        if len(sentence) > 0:
            sentences.append(sentence)
            sentence = []
    else:
        if type(row["token"]) != float and type(row["ne"]) != float:
          sentence.append([row["token"], row["ne"]])
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


In [24]:
sentences[0]

[['Agus', 'O'],
 ['digantikan', 'O'],
 ['Gatot', 'O'],
 ['Lupri', 'O'],
 ['Jantomo', 'O'],
 ['dari', 'O'],
 ['daerah', 'O'],
 ['pemilhan', 'O'],
 ['Jawa', 'O'],
 ['Tengah', 'O'],
 ['VIII', 'O'],
 ['di', 'O'],
 ['nomor', 'O'],
 ['urut', 'O'],
 ['keempat', 'O'],
 ['.', 'O']]

In [25]:
post_tag([list(map(tuple, sentences[0]))])

Unnamed: 0,words,pos,ne
0,Agus,NNP,O
1,digantikan,VB,O
2,Gatot,NNP,O
3,Lupri,NNP,O
4,Jantomo,NNP,O
5,dari,IN,O
6,daerah,NN,O
7,pemilhan,NN,O
8,Jawa,NNP,O
9,Tengah,NNP,O


In [26]:
fin = []
# Create a progress bar
pbar = pyprind.ProgBar(len(sentences))
for x in sentences:
  dt = post_tag([list(map(tuple, x))])
  dl = dt.values.tolist()
  fin.append(dl)
  pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:58


In [27]:
fin[0]

[['Agus', 'NNP', 'O'],
 ['digantikan', 'VB', 'O'],
 ['Gatot', 'NNP', 'O'],
 ['Lupri', 'NNP', 'O'],
 ['Jantomo', 'NNP', 'O'],
 ['dari', 'IN', 'O'],
 ['daerah', 'NN', 'O'],
 ['pemilhan', 'NN', 'O'],
 ['Jawa', 'NNP', 'O'],
 ['Tengah', 'NNP', 'O'],
 ['VIII', 'NNP', 'O'],
 ['di', 'IN', 'O'],
 ['nomor', 'NN', 'O'],
 ['urut', 'IN', 'O'],
 ['keempat', 'OD', 'O'],
 ['.', 'Z', 'O']]

In [28]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        '2_gram' : gram_2(word),
        '3_gram' : gram_3(word),
        '4_gram' : gram_4(word),
        'word.IsInitCaps()': IsInitCaps(word),
        'word.IsMixedCaps()': IsMixedCaps(word),

    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:token': word1,
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i > 0:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:token': word2,
        })
    else:
        features['BOS'] = False

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:token': word1,
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    if i < len(sent)-2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+2:token': word2
        })
    else:
        features['EOS'] = False

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

def gram_2(n):
  if (len(n)<3):
      return n
  else:
    lst = []
    s = list(n)
    for x in range(len(s)-1):
      a = s[x] + s[x+1]
      lst.append(a)
    return lst

def gram_3(n):
  if (len(n)<4):
      return n
  else:
    lst = []
    s = list(n)
    for x in range(len(s)-2):
      a = s[x] + s[x+1] + s[x+2]
      lst.append(a)
    return lst
        
def gram_4(n):
  if (len(n)<5):
      return n
  else:
    lst = []
    s = list(n)
    for x in range(len(s)-3):
      a = s[x] + s[x+1] + s[x+2] + s[x+3]
      lst.append(a)
    return lst

def IsMixedCaps(n):
  n = n[1:]
  z = list(n)
  for x in z:
    if(x.isupper()):
      return 1
  return 0

def IsInitCaps(n):
  if(n[0].isupper()):
    return 1
  else:
    return 0


In [29]:
X = [sent2features(s) for s in fin]
y = [sent2labels(s) for s in fin]

In [30]:
# Split X and y into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("First token features:\n{}\n{}".format("-"*21, X_train[0][0]))
print("\nFirst token label:\n{}\n{}".format("-"*18, y_train[0][0]))

First token features:
---------------------
{'bias': 1.0, 'word.lower()': 'jero', 'word[-3:]': 'ero', 'word[-2:]': 'ro', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'NNP', 'postag[:2]': 'NN', '2_gram': ['Je', 'er', 'ro'], '3_gram': ['Jer', 'ero'], '4_gram': 'Jero', 'word.IsInitCaps()': 1, 'word.IsMixedCaps()': 0, 'BOS': False, '+1:token': 'optimistis', '+1:word.lower()': 'optimistis', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'JJ', '+1:postag[:2]': 'JJ', '+2:token': 'kuota'}

First token label:
------------------
B-PERSON


In [31]:
# Create a new CRF model
crf = CRF(algorithm="lbfgs",
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

# Train the CRF model on the supplied training data
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [32]:
# CRF model to predict on the test data
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=classes))

                precision    recall  f1-score   support

      B-PERSON       0.88      0.85      0.87       318
      I-PERSON       0.78      0.82      0.80       186
    B-LOCATION       0.82      0.79      0.81       198
    I-LOCATION       0.69      0.63      0.66       109
B-ORGANIZATION       0.66      0.66      0.66       153
I-ORGANIZATION       0.61      0.62      0.61       164

     micro avg       0.76      0.75      0.76      1128
     macro avg       0.74      0.73      0.73      1128
  weighted avg       0.77      0.75      0.76      1128



## c. Prediksi test_set

In [33]:
test_merg

['Tengkoe Amir Hamzah yang bernama lengkap Tengkoe Amir Hamzah Pangeran Indra Poetera, atau lebih dikenal hanya dengan nama pena Amir Hamzah adalah sastrawan Indonesia angkatan Poedjangga Baroe dan Pahlawan Nasional Indonesia.',
 'Lahir dari keluarga bangsawan Melayu Kesultanan Langkat di Sumatra Utara, ia dididik di Sumatra dan Jawa. Saat berguru di SMA di Surakarta sekitar 1930, Amir muda terlibat dengan gerakan nasionalis dan jatuh cinta dengan seorang teman sekolahnya, Ilik Soendari.',
 'Bahkan setelah Amir melanjutkan studinya di sekolah hukum di Batavia (sekarang Jakarta) keduanya tetap dekat, hanya berpisah pada tahun 1937 ketika Amir dipanggil kembali ke Sumatra untuk menikahi putri sultan dan mengambil tanggung jawab di lingkungan keraton.',
 'Meskipun tidak bahagia dengan pernikahannya, dia memenuhi tugas kekeratonannya.',
 'Setelah Indonesia memproklamasikan kemerdekaannya pada tahun 1945, ia menjabat sebagai wakil pemerintah di Langkat.',
 'Namun siapa nyana, pada tahun per

In [34]:
los = []
tmp =[]
for x in test_merg:
  t = x.split()
  for x in t :
    tmp.append([x, 'X'])
  los.append(tmp)
  tmp = []

In [35]:
los2 = []
tmp =[]
for x in test_merg:
  t = x.split()
  for x in t :
    tmp.append(x)
  los2.append(tmp)
  tmp = []

POST tag

In [36]:
fin2 = []
# Create a progress bar
pbar = pyprind.ProgBar(len(los))
for x in los:
  dt = post_tag([list(map(tuple, x))])
  dl = dt.values.tolist()
  fin2.append(dl)
  pbar.update()

0% [#############] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [37]:
fin2[0]

[['Tengkoe', 'NNP', 'X'],
 ['Amir', 'NNP', 'X'],
 ['Hamzah', 'NNP', 'X'],
 ['yang', 'SC', 'X'],
 ['bernama', 'VB', 'X'],
 ['lengkap', 'JJ', 'X'],
 ['Tengkoe', 'NNP', 'X'],
 ['Amir', 'NNP', 'X'],
 ['Hamzah', 'NNP', 'X'],
 ['Pangeran', 'NNP', 'X'],
 ['Indra', 'NNP', 'X'],
 ['Poetera,', 'NNP', 'X'],
 ['atau', 'CC', 'X'],
 ['lebih', 'RB', 'X'],
 ['dikenal', 'VB', 'X'],
 ['hanya', 'RB', 'X'],
 ['dengan', 'IN', 'X'],
 ['nama', 'NN', 'X'],
 ['pena', 'NN', 'X'],
 ['Amir', 'NNP', 'X'],
 ['Hamzah', 'NNP', 'X'],
 ['adalah', 'VB', 'X'],
 ['sastrawan', 'NN', 'X'],
 ['Indonesia', 'NNP', 'X'],
 ['angkatan', 'NN', 'X'],
 ['Poedjangga', 'NNP', 'X'],
 ['Baroe', 'NNP', 'X'],
 ['dan', 'CC', 'X'],
 ['Pahlawan', 'NNP', 'X'],
 ['Nasional', 'NNP', 'X'],
 ['Indonesia.', 'NNP', 'X']]

In [38]:
x_set = [sent2features(s) for s in fin2]
y_set = [sent2labels(s) for s in fin2]

In [39]:
set_pred = crf.predict(x_set)

In [40]:
test_1 = pd.DataFrame(
    {'Kalimat': los2[0],
     'Prediksi': set_pred[0],
    })
test_1.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
Kalimat,Tengkoe,Amir,Hamzah,yang,bernama,lengkap,Tengkoe,Amir,Hamzah,Pangeran,Indra,"Poetera,",atau,lebih,dikenal,hanya,dengan,nama,pena,Amir,Hamzah,adalah,sastrawan,Indonesia,angkatan,Poedjangga,Baroe,dan,Pahlawan,Nasional,Indonesia.
Prediksi,B-PERSON,I-PERSON,I-PERSON,O,O,O,B-PERSON,I-PERSON,I-PERSON,I-PERSON,I-PERSON,I-PERSON,O,O,O,O,O,O,O,B-PERSON,I-PERSON,O,O,B-LOCATION,O,B-PERSON,I-PERSON,O,B-ORGANIZATION,I-ORGANIZATION,I-ORGANIZATION


In [41]:
test_2 = pd.DataFrame(
    {'Kalimat': los2[2],
     'Prediksi': set_pred[2],
    })
test_2.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
Kalimat,Bahkan,setelah,Amir,melanjutkan,studinya,di,sekolah,hukum,di,Batavia,(sekarang,Jakarta),keduanya,tetap,"dekat,",hanya,berpisah,pada,tahun,1937,ketika,Amir,dipanggil,kembali,ke,Sumatra,untuk,menikahi,putri,sultan,dan,mengambil,tanggung,jawab,di,lingkungan,keraton.
Prediksi,O,O,B-PERSON,O,O,O,O,O,O,B-LOCATION,I-LOCATION,I-LOCATION,O,O,O,O,O,O,O,O,O,B-PERSON,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [42]:
test_3 = pd.DataFrame(
    {'Kalimat': los2[2],
     'Prediksi': set_pred[2],
    })
test_3.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
Kalimat,Bahkan,setelah,Amir,melanjutkan,studinya,di,sekolah,hukum,di,Batavia,(sekarang,Jakarta),keduanya,tetap,"dekat,",hanya,berpisah,pada,tahun,1937,ketika,Amir,dipanggil,kembali,ke,Sumatra,untuk,menikahi,putri,sultan,dan,mengambil,tanggung,jawab,di,lingkungan,keraton.
Prediksi,O,O,B-PERSON,O,O,O,O,O,O,B-LOCATION,I-LOCATION,I-LOCATION,O,O,O,O,O,O,O,O,O,B-PERSON,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [43]:
test_4 = pd.DataFrame(
    {'Kalimat': los2[3],
     'Prediksi': set_pred[3],
    })
test_4.T

Unnamed: 0,0,1,2,3,4,5,6,7,8
Kalimat,Meskipun,tidak,bahagia,dengan,"pernikahannya,",dia,memenuhi,tugas,kekeratonannya.
Prediksi,O,O,O,O,O,O,O,O,O


In [44]:
test_5 = pd.DataFrame(
    {'Kalimat': los2[4],
     'Prediksi': set_pred[4],
    })
test_5.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Kalimat,Setelah,Indonesia,memproklamasikan,kemerdekaannya,pada,tahun,1945,ia,menjabat,sebagai,wakil,pemerintah,di,Langkat.
Prediksi,O,O,O,O,O,O,O,O,O,O,O,O,O,B-LOCATION


In [45]:
test_6 = pd.DataFrame(
    {'Kalimat': los2[5],
     'Prediksi': set_pred[5],
    })
test_6.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
Kalimat,Namun,siapa,"nyana,",pada,tahun,pertama,negara,Indonesia,yang,baru,"lahir,",ia,meninggal,dalam,peristiwa,konflik,sosial,berdarah,di,Sumatra,yang,disulut,oleh,faksi,dari,Partai,Komunis,Indonesia,dan,dimakamkan,di,sebuah,kuburan,massal.
Prediksi,O,O,O,O,O,O,O,B-LOCATION,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-ORGANIZATION,I-ORGANIZATION,I-ORGANIZATION,O,O,O,O,O,O


In [46]:
test_7 = pd.DataFrame(
    {'Kalimat': los2[6],
     'Prediksi': set_pred[6],
    })
test_7.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Kalimat,Lukisan,Galileo,menunjukkan,bahwa,ia,pertama,melihat,Neptunus,pada,tanggal,28,Desember,1612,dan,27,Januari,1613.
Prediksi,B-PERSON,I-PERSON,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [47]:
test_8 = pd.DataFrame(
    {'Kalimat': los2[7],
     'Prediksi': set_pred[7],
    })
test_8.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
Kalimat,Pada,kedua,hari,"tersebut,",Galileo,salah,menganggap,Neptunus,sebagai,sebuah,bintang,tetap,ketika,planet,ini,muncul,sangat,dekat—konjungsi—dengan,Jupiter,pada,langit,malam.
Prediksi,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [48]:
test_9 = pd.DataFrame(
    {'Kalimat': los2[8],
     'Prediksi': set_pred[8],
    })
test_9.T

Unnamed: 0,0,1,2,3,4,5
Kalimat,Ia,tidak,dianggap,sebagai,penemu,Neptunus.
Prediksi,O,O,O,O,O,O


In [49]:
test_10 = pd.DataFrame(
    {'Kalimat': los2[9],
     'Prediksi': set_pred[9],
    })
test_10.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
Kalimat,Pada,masa,pengamatan,pertamanya,bulan,Desember,1612,Neptunus,bersifat,tetap,di,langit,karena,planet,ini,baru,saja,mengalami,penghuluan,pada,hari,itu.
Prediksi,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [50]:
test_11 = pd.DataFrame(
    {'Kalimat': los2[10],
     'Prediksi': set_pred[10],
    })
test_11.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
Kalimat,Gerakan,ke,belakang,ini,terbentuk,ketika,orbit,Bumi,membawa,Bumi,melewati,planet,terluar.
Prediksi,O,O,O,O,O,O,O,O,O,O,O,O,O


In [51]:
test_12 = pd.DataFrame(
    {'Kalimat': los2[11],
     'Prediksi': set_pred[11],
    })
test_12.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Kalimat,Karena,Neptunus,baru,saja,memulai,siklus,penghuluan,"tahunannya,",gerakan,planet,ini,terlalu,sulit,dilacak,menggunakan,teleskop,kecil,Galileo.
Prediksi,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-LOCATION


In [52]:
test_13 = pd.DataFrame(
    {'Kalimat': los2[12],
     'Prediksi': set_pred[12],
    })
test_13.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
Kalimat,pada,Juli,2009,fisikawan,Universitas,"Melbourne,",David,Jamieson,mengumumkan,adanya,bukti,baru,yang,menyatakan,bahwa,Galileo,setidaknya,sadar,bahwa,bintang,yang,ia,amati,telah,berpindah,relatif,terhadap,bintang,tetap.
Prediksi,O,O,O,O,B-ORGANIZATION,I-ORGANIZATION,I-ORGANIZATION,I-ORGANIZATION,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [53]:
frames = [test_1,
test_2,
test_3,
test_4,
test_5,
test_6,
test_7,
test_8,
test_9,
test_10,
test_11,
test_12,
test_13,]

  
result = pd.concat(frames)
display(result)

Unnamed: 0,Kalimat,Prediksi
0,Tengkoe,B-PERSON
1,Amir,I-PERSON
2,Hamzah,I-PERSON
3,yang,O
4,bernama,O
...,...,...
24,berpindah,O
25,relatif,O
26,terhadap,O
27,bintang,O


In [54]:
from google.colab import files

result.to_csv('df.csv')
files.download('df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Hasil anotasi (Manual)

In [55]:
import csv
import pandas as pd
anotasi_df = pd.read_csv('https://raw.githubusercontent.com/yaumialfadha/Data/master/anotation.csv')

In [56]:
anotasi_df

Unnamed: 0.1,Unnamed: 0,Kalimat,Prediksi,Anotasi
0,0,Tengkoe,B-PERSON,B-PERSON
1,1,Amir,I-PERSON,I-PERSON
2,2,Hamzah,I-PERSON,I-PERSON
3,3,yang,O,O
4,4,bernama,O,O
...,...,...,...,...
284,24,berpindah,O,O
285,25,relatif,O,O
286,26,terhadap,O,O
287,27,bintang,O,O


## d. Bandingkan hasil prediksi model dengan hasil anotasi

In [57]:
row_count = anotasi_df.shape[0]
correct = 0
incorrect = 0
spurius = 0
par = 0
mis = 0

list_prediksi = anotasi_df['Prediksi'].values.tolist()
list_anotasi = anotasi_df['Anotasi'].values.tolist()

for i in range(len(list_prediksi)):
    if list_prediksi[i] == list_anotasi[i]:
      correct = correct + 1
    else :
      incorrect = incorrect + 1
    for j in range(len(list_prediksi)):
      if list_anotasi[i]  not in list_prediksi:
        spurius = spurius +1

#POSSIBLE(POS)=COR+INC+PAR+MIS=TP+FN
#ACTUAL(ACT)=COR+INC+PAR+SPU=TP+FP

pos = correct + incorrect + par + mis
act = correct + incorrect + spurius + par


#exact match (i.e., strict and exact )
#Precision=COR/ACT=TP/TP+FP
#Recall=COR/POS=TP/TP+FN

strict = correct/act 
exact = correct/pos 

print(strict)
print(exact)


#partial match  (i.e., partial and type)
#Precision=COR + 0.5 × PAR/ACT=TPTP+FP
#Recall=COR + 0.5 × PAR/POS=CORACT=TPTP+FP

partial = (correct + 0.5 * par)/act 
type_partial = (correct + 0.5 * par)/pos


print(partial)
print(type_partial)

print("correct: " + str(correct))
print("incorrect: " + str(incorrect))
print





0.972318339100346
0.972318339100346
0.972318339100346
0.972318339100346
correct: 281
incorrect: 8


<function print>

# WSD

In [58]:
from html.parser import HTMLParser
import os
import math
import string

import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 200)

In [59]:
import nltk
# nltk.download()

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings("ignore")

In [61]:
import nltk
import os

In [69]:


ct = nltk.tag.CRFTagger()
ct.set_model_file('/content/drive/MyDrive/Data mining/all_indo_man_tag_corpus_model.crf.tagger')

In [63]:
!pip install pySastrawi

Collecting pySastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/61/84/b0a5454a040f81e81e6a95a5d5635f20ad43cc0c288f8b4966b339084962/PySastrawi-1.2.0-py2.py3-none-any.whl (210kB)
[K     |█▋                              | 10kB 13.7MB/s eta 0:00:01[K     |███▏                            | 20kB 18.4MB/s eta 0:00:01[K     |████▊                           | 30kB 11.1MB/s eta 0:00:01[K     |██████▎                         | 40kB 8.6MB/s eta 0:00:01[K     |███████▉                        | 51kB 5.6MB/s eta 0:00:01[K     |█████████▍                      | 61kB 6.2MB/s eta 0:00:01[K     |███████████                     | 71kB 6.0MB/s eta 0:00:01[K     |████████████▌                   | 81kB 6.6MB/s eta 0:00:01[K     |██████████████                  | 92kB 6.5MB/s eta 0:00:01[K     |███████████████▋                | 102kB 6.9MB/s eta 0:00:01[K     |█████████████████▏              | 112kB 6.9MB/s eta 0:00:01[K     |██████████████████▊             | 122kB 6.

In [64]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [65]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize 
factory = StemmerFactory()
stemmer = factory.create_stemmer()

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()


In [66]:
# Function to convert  
def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))

# 1.

In [121]:
import csv
train2 = pd.read_csv('https://raw.githubusercontent.com/dfaizaditya/Data/master/train_2_sense.csv')
train6 = pd.read_csv('https://raw.githubusercontent.com/dfaizaditya/Data/master/train_6_sense.csv')

test2 = pd.read_csv('https://raw.githubusercontent.com/dfaizaditya/Data/master/test_2_sense.csv')
test6 = pd.read_csv('https://raw.githubusercontent.com/dfaizaditya/Data/master/test_6_sense.csv')

PUNCT_TO_REMOVE = string.punctuation
PUNCT_TO_REMOVE = PUNCT_TO_REMOVE.replace("-","")
print(PUNCT_TO_REMOVE) # Output : !"#$%&'()*+,-./:;<=>?@[\]^`{|}~

!"#$%&'()*+,./:;<=>?@[\]^_`{|}~


In [122]:
annotated_words = set(train2.kata)

In [123]:
t2 = train2.values.tolist()
for x in t2:
  z = ct.tag_sents([x[3].split(' ')])
  x.append(z[0])
  kalimat = x[3].translate(str.maketrans('','',PUNCT_TO_REMOVE)).lower()
  stop = stopword.remove(kalimat)
  tokens = nltk.tokenize.word_tokenize(stop)
  x.append(stemmer.stem(listToString(tokens)))

t6 = train6.values.tolist()
for x in t6:
  z = ct.tag_sents([x[3].split(' ')])
  x.append(z[0])
  kalimat = x[3].translate(str.maketrans('','',PUNCT_TO_REMOVE)).lower()
  stop = stopword.remove(kalimat)
  tokens = nltk.tokenize.word_tokenize(stop)
  x.append(stemmer.stem(listToString(tokens)))

In [124]:
for x in t2:
  z = ct.tag_sents([x[5].split(' ')])
  x.append(z[0])

for x in t6:
  z = ct.tag_sents([x[5].split(' ')])
  x.append(z[0])

In [125]:
from pandas import DataFrame
df2 = DataFrame (t2,columns=['kalimat_id','kata','sense','kalimat','pos','stem','pos_stem'])

In [126]:
df2_cut = df2.loc[df2['kata'] == 'lebat']
df2_cut

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos,stem,pos_stem
888,932092,lebat,5501,Umumnya hanya tumbuh lebat pada laki-laki.,"[(Umumnya, RB), (hanya, RB), (tumbuh, VB), (lebat, NN), (pada, IN), (laki-laki., NNP)]",tumbuh lebat laki,"[(tumbuh, VB), (lebat, NN), (laki, NN)]"
889,932093,lebat,5501,"Demi kelanjutan hidup mereka, pendatang-pendatang baru mulai membuka hutan lebat menjadi ladang-ladang dan daerah sepanjang Lawe Alas dan Lawe Bulan sebagai daerah persawahan.","[(Demi, NN), (kelanjutan, NN), (hidup, NN), (mereka,, NN), (pendatang-pendatang, NN), (baru, JJ), (mulai, VB), (membuka, VB), (hutan, NN), (lebat, NN), (menjadi, VB), (ladang-ladang, NN), (dan, CC...",lanjut hidup datang buka hutan lebat ladang daerah lawe alas lawe daerah sawah,"[(lanjut, VB), (hidup, NN), (datang, VB), (buka, VB), (hutan, NN), (lebat, NN), (ladang, NN), (daerah, NN), (lawe, FW), (alas, FW), (lawe, FW), (daerah, NN), (sawah, VB)]"
890,932094,lebat,5501,"Tank ini populer digunakan di negara-negara Asia Tenggara yang memiliki hutan lebat atau kepulauan seperti Indonesia, Thailand, Filipina, Brunei Darussalam, dan Malaysia.","[(Tank, NN), (ini, PR), (populer, JJ), (digunakan, VB), (di, IN), (negara-negara, NN), (Asia, NNP), (Tenggara, NNP), (yang, SC), (memiliki, VB), (hutan, NN), (lebat, NN), (atau, CC), (kepulauan, N...",tank populer negara asia tenggara milik hutan lebat pulau indonesia thailand filipina brunei darussalam malaysia,"[(tank, NN), (populer, JJ), (negara, NN), (asia, NN), (tenggara, NN), (milik, NN), (hutan, NN), (lebat, NN), (pulau, NN), (indonesia, NNP), (thailand, NNP), (filipina, NNP), (brunei, NNP), (daruss..."
891,932100,lebat,5502,"Badai Tropis Washi membawa 10 jam hujan lebat yang memicu bencana banjir bandang Mindanao, daerah yang jarang mengalami siklon tropis.","[(Badai, NNP), (Tropis, NNP), (Washi, NNP), (membawa, VB), (10, CD), (jam, NN), (hujan, NN), (lebat, NN), (yang, SC), (memicu, VB), (bencana, NN), (banjir, NN), (bandang, NN), (Mindanao,, NNP), (d...",badai tropis washi bawa 10 jam hujan lebat picu bencana banjir bandang mindanao daerah jarang alami siklon tropis,"[(badai, NN), (tropis, JJ), (washi, VB), (bawa, VB), (10, CD), (jam, NN), (hujan, NN), (lebat, NN), (picu, VB), (bencana, NN), (banjir, NN), (bandang, NN), (mindanao, NN), (daerah, NN), (jarang, R..."
892,932102,lebat,5502,"Kota ini memiliki curah hujan yang lebat sehingga dijuluki sebagai ""Kota Penghujan"".","[(Kota, NN), (ini, PR), (memiliki, VB), (curah, NN), (hujan, NN), (yang, SC), (lebat, VB), (sehingga, SC), (dijuluki, VB), (sebagai, IN), (""Kota, NN), (Penghujan""., NNP)]",kota milik curah hujan lebat juluk kota hujan,"[(kota, NN), (milik, NN), (curah, NN), (hujan, NN), (lebat, NN), (juluk, NN), (kota, NN), (hujan, NN)]"
...,...,...,...,...,...,...,...
1066,932511,lebat,5502,"Dua peristiwa yang membawa perubahan besar mempercepat terjadinya air bah: ledakan tempat-tempat penyimpanan air yang besar di bawah tanah, mungkin disebabkan oleh gempa bumi dengan gelombang-gelo...","[(Dua, CD), (peristiwa, NN), (yang, SC), (membawa, VB), (perubahan, NN), (besar, JJ), (mempercepat, VB), (terjadinya, NN), (air, NN), (bah:, NN), (ledakan, NN), (tempat-tempat, NN), (penyimpanan, ...",peristiwa bawa ubah cepat air bah ledak tempat simpan air tanah sebab gempa bumi gelombang pasang samudera hujan lebat 40 ayat 12,"[(peristiwa, NN), (bawa, VB), (ubah, NN), (cepat, JJ), (air, NN), (bah, NN), (ledak, NN), (tempat, NN), (simpan, NN), (air, NN), (tanah, NN), (sebab, SC), (gempa, NN), (bumi, NN), (gelombang, NN),..."
1067,932514,lebat,5501,"Kombinasi dari postur kekar dan panjang, bulu lebat membuatnya tampak gagah dan empuk.","[(Kombinasi, NN), (dari, IN), (postur, NN), (kekar, NN), (dan, CC), (panjang,, NN), (bulu, NN), (lebat, NN), (membuatnya, RB), (tampak, VB), (gagah, NN), (dan, CC), (empuk., NNP)]",kombinasi postur kekar bulu lebat buat gagah empuk,"[(kombinasi, NN), (postur, NN), (kekar, NN), (bulu, NN), (lebat, NN), (buat, IN), (gagah, NN), (empuk, NN)]"
1068,932515,lebat,5501,hutan tropis nan lebat menjadikan sumber mata air murni yg sehat dan alami.,"[(hutan, NN), (tropis, JJ), (nan, NN), (lebat, NN), (menjadikan, VB), (sumber, NN), (mata, NN), (air, NN), (murni, NN), (yg, NN), (sehat, JJ), (dan, CC), (alami., NNP)]",hutan tropis nan lebat jadi sumber air murni yg sehat alami,"[(hutan, NN), (tropis, JJ), (nan, NN), (lebat, NN), (jadi, VB), (sumber, NN), (air, NN), (murni, NN), (yg, NN), (sehat, JJ), (alami, PRP)]"
1069,932529,lebat,5502,Hujan lebat yang mengguyur malam itu membuat arus sungai yang memisahkan rumah Beth dengan rumah pohon menjadi deras dan memutuskan satu-satunya jembatan gantung yang ada di sana.,"[(Hujan, NN), (lebat, NN), (yang, SC), (mengguyur, NN), (malam, NN), (itu, PR), (membuat, VB), (arus, NN), (sungai, NN), (yang, SC), (memisahkan, VB), (rumah, NN), (Beth, NNP), (dengan, IN), (ruma...",hujan lebat guyur malam arus sungai pisah rumah beth rumah pohon deras putus satu jembatan gantung,"[(hujan, NN), (lebat, NN), (guyur, NN), (malam, NN), (arus, NN), (sungai, NN), (pisah, NN), (rumah, NN), (beth, NN), (rumah, NN), (pohon, NN), (deras, NN), (putus, VB), (satu, CD), (jembatan, NN),..."


In [127]:
from pandas import DataFrame
df6 = DataFrame (t6,columns=['kalimat_id','kata','sense','kalimat','pos','stem','pos_stem'])

In [128]:
df6_cut = df6.loc[df6['kata'] == 'mata']
df6_cut

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos,stem,pos_stem
0,942397,mata,1001,"Misalnya ketika dia diperhadapkan dengan raksasa mata satu, Kiklops yang disuruh oleh Poseidon untuk membunuhnya, dia tidak mengatakan bahwa dirinya Odisseus dan justru menjawab ""bukan siapa-siapa...","[(Misalnya, RB), (ketika, SC), (dia, PRP), (diperhadapkan, VB), (dengan, IN), (raksasa, NN), (mata, NN), (satu,, NNP), (Kiklops, NNP), (yang, SC), (disuruh, VB), (oleh, IN), (Poseidon, NNP), (untu...",hadap raksasa kiklops suruh poseidon bunuh odisseus siapa sangkal identitas,"[(hadap, IN), (raksasa, NN), (kiklops, FW), (suruh, FW), (poseidon, FW), (bunuh, VB), (odisseus, NN), (siapa, SC), (sangkal, VB), (identitas, NN)]"
1,942485,mata,1002,"Desa Sukamakmur yang notabene ibukota Kecamatan Sukamakmur sebagian besar mata pencaharian warga masyarakatnya adalah dibidang pertanian, perdagangan dan jasa.","[(Desa, NNP), (Sukamakmur, NNP), (yang, SC), (notabene, NNP), (ibukota, NNP), (Kecamatan, NNP), (Sukamakmur, NNP), (sebagian, CD), (besar, JJ), (mata, NN), (pencaharian, NN), (warga, NN), (masyara...",desa sukamakmur notabene ibukota camat sukamakmur cahari warga masyarakat bidang tani dagang jasa,"[(desa, NN), (sukamakmur, NN), (notabene, NN), (ibukota, NN), (camat, NN), (sukamakmur, NN), (cahari, NN), (warga, NN), (masyarakat, NN), (bidang, NN), (tani, VB), (dagang, NN), (jasa, NN)]"
2,942577,mata,1001,Kedua kelompok saling berhadapan dengan seluruh tubuh kecuali mata diselimuti sarung.,"[(Kedua, CD), (kelompok, NN), (saling, RB), (berhadapan, VB), (dengan, IN), (seluruh, CD), (tubuh, NN), (kecuali, NN), (mata, NN), (diselimuti, VB), (sarung., NNP)]",kelompok hadap tubuh kecuali limut sarung,"[(kelompok, NN), (hadap, IN), (tubuh, NN), (kecuali, RB), (limut, VB), (sarung, VB)]"
3,942601,mata,1001,"Namun, beberapa jam kemudian, seperti siklus penggantian dinding mata berakhir, Bopha naik menjadi topan kategori 4, sedangkan mata menjadi terdefinisikan dengan baik lagi.","[(Namun,, NNP), (beberapa, CD), (jam, NN), (kemudian,, NN), (seperti, IN), (siklus, NN), (penggantian, NN), (dinding, NN), (mata, NN), (berakhir,, NNP), (Bopha, NNP), (naik, VB), (menjadi, VB), (t...",jam siklus ganti dinding bopha topan kategori 4 definisi,"[(jam, NN), (siklus, NN), (ganti, NN), (dinding, NN), (bopha, NN), (topan, NN), (kategori, NN), (4, CD), (definisi, NN)]"
4,942687,mata,1006,"Walaupun pada setiap abad banyak komet berperiode panjang yang muncul dengan lebih terang dan dahsyat, Halley adalah satu-satunya komet dengan periode pendek yang dapat dilihat dengan mata telanja...","[(Walaupun, SC), (pada, IN), (setiap, CD), (abad, NN), (banyak, CD), (komet, NN), (berperiode, NN), (panjang, JJ), (yang, SC), (muncul, VB), (dengan, IN), (lebih, RB), (terang, VB), (dan, CC), (da...",abad komet periode muncul terang dahsyat halley satu komet periode pendek telanjang rentang umur manusia,"[(abad, NN), (komet, NN), (periode, NN), (muncul, VB), (terang, NN), (dahsyat, NN), (halley, FW), (satu, CD), (komet, NN), (periode, NN), (pendek, NN), (telanjang, NN), (rentang, NN), (umur, NN), ..."
...,...,...,...,...,...,...,...
117,952171,mata,1001,Hal ini dikarenakan televisi hologram menggunakan lensa lentikular yang menyebabkan tampilan berbeda pada mata kanan dan kiri.,"[(Hal, NN), (ini, PR), (dikarenakan, VB), (televisi, NN), (hologram, NN), (menggunakan, VB), (lensa, NN), (lentikular, NN), (yang, SC), (menyebabkan, VB), (tampilan, NN), (berbeda, VB), (pada, IN)...",televisi hologram lensa lentikular sebab tampil beda kanan kiri,"[(televisi, NN), (hologram, NN), (lensa, NN), (lentikular, NN), (sebab, SC), (tampil, JJ), (beda, VB), (kanan, NN), (kiri, NN)]"
118,952202,mata,1006,"Para saksi mata yang selamat melaporkan bahwa ia bekerja dengan rajin sampai kematiannya, dan tampak masih hidup di air setelah kapal tenggelam.","[(Para, DT), (saksi, NN), (mata, NN), (yang, SC), (selamat, JJ), (melaporkan, VB), (bahwa, SC), (ia, PRP), (bekerja, VB), (dengan, IN), (rajin, NN), (sampai, IN), (kematiannya,, NN), (dan, CC), (t...",saksi selamat lapor rajin mati hidup air kapal tenggelam,"[(saksi, NN), (selamat, JJ), (lapor, NN), (rajin, NN), (mati, VB), (hidup, NN), (air, NN), (kapal, NN), (tenggelam, VB)]"
119,952350,mata,1003,Harga uang menjadi seperlima dari mata uang terbaru.,"[(Harga, NN), (uang, NN), (menjadi, VB), (seperlima, CD), (dari, IN), (mata, NN), (uang, NN), (terbaru., NNP)]",harga uang lima uang baru,"[(harga, NN), (uang, NN), (lima, CD), (uang, NN), (baru, JJ)]"
120,952458,mata,1001,"Dan jika Anda pergi keluar pada hari musim panas, dua aliran air akan mengalir seperti sungai tinta dari mata Anda, dan keringat mengalir dari pipi pada tenggorokan anda.","[(Dan, CC), (jika, SC), (Anda, PRP), (pergi, VB), (keluar, VB), (pada, IN), (hari, NN), (musim, NN), (panas,, NN), (dua, CD), (aliran, NN), (air, NN), (akan, MD), (mengalir, VB), (seperti, IN), (s...",pergi musim panas alir air alir sungai tinta keringat alir pipi tenggorok,"[(pergi, VB), (musim, NN), (panas, JJ), (alir, NN), (air, NN), (alir, NN), (sungai, NN), (tinta, VB), (keringat, NN), (alir, NN), (pipi, NN), (tenggorok, VB)]"


## Test 2 dan 6 sense

In [129]:
x = df2_cut.iloc[:,5].values
y = df2_cut.iloc[:,2].values

In [130]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.01,random_state=0)
print(x_train.shape)
print(x_test.shape)

(181,)
(2,)


In [131]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [132]:
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',MultinomialNB())])

text_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [133]:
y_pred = text_model.predict(x_test)
y_pred

array([5501, 5501])

In [135]:
test2_cut = test2.loc[test2['kata'] == 'lebat']
test6_cut = test6.loc[test6['kata'] == 'mata']

In [136]:
t2t = test2_cut.values.tolist()
for x in t2t:
  z = ct.tag_sents([x[3].split(' ')])
  x.append(z[0])
  kalimat = x[3].translate(str.maketrans('','',PUNCT_TO_REMOVE)).lower()
  stop = stopword.remove(kalimat)
  tokens = nltk.tokenize.word_tokenize(stop)
  x.append(stemmer.stem(listToString(tokens)))

t6t = test6_cut.values.tolist()
for x in t6t:
  z = ct.tag_sents([x[3].split(' ')])
  x.append(z[0])
  kalimat = x[3].translate(str.maketrans('','',PUNCT_TO_REMOVE)).lower()
  stop = stopword.remove(kalimat)
  tokens = nltk.tokenize.word_tokenize(stop)
  x.append(stemmer.stem(listToString(tokens)))

In [137]:
for x in t2t:
  z = ct.tag_sents([x[6].split(' ')])
  x.append(z[0])

for x in t6t:
  z = ct.tag_sents([x[6].split(' ')])
  x.append(z[0])

In [138]:
test_lst =[]
for x in t2t:
  test_lst.append(x[6])

In [139]:
test2_pred = text_model.predict(test_lst)
test2_pred

array([5501, 5501, 5501, 5501, 5502, 5501, 5501, 5501, 5501, 5501, 5501,
       5501, 5502, 5501, 5502, 5501, 5501, 5501, 5502, 5501, 5502, 5501,
       5501])

## Compare test 2 sense

In [140]:
fin2 = test2_cut[["kata", "sense", "id"]]
fin2['Prediction'] = test2_pred
fin2

Unnamed: 0,kata,sense,id,Prediction
152,lebat,5501,932115.0,5501
153,lebat,5501,932135.0,5501
154,lebat,5502,932157.0,5501
155,lebat,5501,932158.0,5501
156,lebat,5502,932175.0,5502
157,lebat,5501,932185.0,5501
158,lebat,5501,932207.0,5501
159,lebat,5501,932229.0,5501
160,lebat,5501,932270.0,5501
161,lebat,5501,932271.0,5501


In [141]:
y2 = fin2['sense'].values.tolist()
accuracy_score(test2_pred, y2)*100

91.30434782608695

In [142]:
print(classification_report(test2_pred,y2))

              precision    recall  f1-score   support

        5501       1.00      0.89      0.94        18
        5502       0.71      1.00      0.83         5

    accuracy                           0.91        23
   macro avg       0.86      0.94      0.89        23
weighted avg       0.94      0.91      0.92        23



## Compare test 6 sense

In [164]:
x = df6_cut.iloc[:,5].values
y = df6_cut.iloc[:,2].values

In [165]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.01,random_state=0)
print(x_train.shape)
print(x_test.shape)

(120,)
(2,)


In [166]:
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',MultinomialNB())])

text_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [167]:
y_pred = text_model.predict(x_test)
y_pred

array([1001, 1001])

In [168]:
test_lst6 =[]
for x in t6t:
  test_lst6.append(x[6])

In [169]:
test6_pred = text_model.predict(test_lst6)
test6_pred

array([1002, 1001, 1001, 1001, 1003, 1003, 1001, 1001, 1003, 1002, 1001,
       1002, 1001, 1001, 1001, 1002, 1003, 1003, 1001, 1003, 1001, 1002,
       1003, 1001, 1001, 1001, 1001, 1001, 1003, 1001, 1001, 1001, 1001,
       1003, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1003, 1003, 1003,
       1003, 1002, 1001, 1001, 1003])

In [170]:
fin6 = test6_cut[["kata", "sense", "id"]]
fin6['Prediction'] = test6_pred
fin6

Unnamed: 0,kata,sense,id,Prediction
0,mata,1002,942488.0,1002
1,mata,1001,942611.0,1001
2,mata,1001,942855.0,1001
3,mata,1003,943476.0,1001
4,mata,1003,943610.0,1003
5,mata,1003,943957.0,1003
6,mata,1003,944153.0,1001
7,mata,1001,944188.0,1001
8,mata,1003,944512.0,1003
9,mata,1002,945056.0,1002


In [171]:
y6 = fin6['sense'].values.tolist()
accuracy_score(test6_pred, y6)*100

73.46938775510205

In [172]:
print(classification_report(test6_pred,y6))

              precision    recall  f1-score   support

        1001       1.00      0.55      0.71        29
        1002       1.00      1.00      1.00         6
        1003       0.88      1.00      0.93        14
        1004       0.00      0.00      0.00         0
        1005       0.00      0.00      0.00         0
        1006       0.00      0.00      0.00         0

    accuracy                           0.73        49
   macro avg       0.48      0.43      0.44        49
weighted avg       0.96      0.73      0.81        49



## tidak menggunakan seluruh fitur (top 50)

In [173]:
x = df2_cut.head(50).iloc[:,5].values
y = df2_cut.head(50).iloc[:,2].values

In [174]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.01,random_state=0)
print(x_train.shape)
print(x_test.shape)

(49,)
(1,)


In [175]:
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',MultinomialNB())])

text_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [176]:
y_pred = text_model.predict(x_test)
y_pred

array([5501])

In [177]:
test2_pred = text_model.predict(test_lst)
test2_pred

array([5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501,
       5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501, 5501,
       5501])

In [178]:
y2 = fin2['sense'].values.tolist()
accuracy_score(test2_pred, y2)*100

69.56521739130434

In [179]:
print(classification_report(test2_pred,y2))

              precision    recall  f1-score   support

        5501       1.00      0.70      0.82        23
        5502       0.00      0.00      0.00         0

    accuracy                           0.70        23
   macro avg       0.50      0.35      0.41        23
weighted avg       1.00      0.70      0.82        23



In [180]:
x = df6_cut.head(50).iloc[:,5].values
y = df6_cut.head(50).iloc[:,2].values

In [181]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.01,random_state=0)
print(x_train.shape)
print(x_test.shape)

(49,)
(1,)


In [182]:
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',MultinomialNB())])

text_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [183]:
y_pred = text_model.predict(x_test)
y_pred

array([1001])

In [190]:
test6_pred = text_model.predict(test_lst6)
test6_pred

array([1002, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1002, 1001,
       1001, 1001, 1001, 1001, 1002, 1001, 1001, 1001, 1001, 1001, 1002,
       1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001,
       1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001,
       1001, 1001, 1001, 1001, 1001])

In [192]:
y6 = fin6['sense'].values.tolist()
accuracy_score(test6_pred, y6)*100

40.816326530612244

In [193]:
print(classification_report(test2_pred,y2))

              precision    recall  f1-score   support

        1001       0.00      0.00      0.00      23.0
        5501       0.00      0.00      0.00       0.0
        5502       0.00      0.00      0.00       0.0

    accuracy                           0.00      23.0
   macro avg       0.00      0.00      0.00      23.0
weighted avg       0.00      0.00      0.00      23.0



Refrensi :
- https://github.com/farhanreynaldo/pos-tagging-indonesia
- https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
- https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html
