# <center> POS-Tagging dengan Metode Statistika </center>

## <center> A. Pembangunan Model Data Train </center>

In [2]:
def read_dataset(fname):
    sentences = []
    tags = []
    with open(fname) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    idx_line = 0
    while idx_line < len(content):
        sent = [] #untuk menampung katanya
        tag = [] #untuk menampung postag dari tiap kata
#         print('idx_line =')
#         print(idx_line)
        while not content[idx_line].startswith('</kalimat'): #batas akhir dari kalimat
            if  not content[idx_line].startswith('<kalimat'):
                content_part = content[idx_line].split('\t')
                sent.append(content_part[0])
                tag.append(content_part[1])
            idx_line = idx_line + 1
        sentences.append(sent)
        tags.append(tag)
        idx_line = idx_line+2        
    return sentences, tags

In [3]:
def features(sentence, index): #feature yang digunakan adalah awalan dan akhiran,informasi context, bentuk karakter, dan jenis kata
    """ sentence: [w1, w2, ...], index: the index of the word """
    
    
    #Kamus diambil dari Kamus yang didefinisikan pada website http://bahasa.cs.ui.ac.id/postag/corpus (nama file: '1-1 tag dict.txt')
    tanda_baca = ['"','`','!','.','?',';','(',')','[',']','{','}',',','&']
    simbol_matematika = ['$','%','*','/',':','+','-','=','<','>']
    kata_angka = ['satu','dua','tiga','empat','lima','enam','tujuh','delapan','sembilan','puluh','belas','ratus','ribu','juta','miliar','triliun','seantero','beberapa','semua','seluruh','ke','seper','nya']
    modal = ['harus','mungkin','pernah','sudah','telah']
    negatif = ['belum','bukan','jangan','kagak','nggak','tak','tidak']
    kata_hubung = ['apalagi','atau','ataupun','dan','jangankan','kemudian','lalu','namun','padahal','sedangkan','tetapi']
    determiner = ['para','sang','si','sebuah']
    posisi = ['kepada','oleh','dalam','terhadap',]
    subjek_rujukan = ['beliau','dia','dikau','engkau','ia','kalian','kami','kamu','kau','kita','mereka','saya','seseorang','sesuatu']
    keterangan = ['apabila','asalkan','bahwa','yaitu','yakni','untuk','pun','yang']
    
    
    return {
        'word': sentence[index],
                                                            #Feature tempat kata
        'kata_pertama': index == 0,
        'kata_terakhir': index == len(sentence)-1,
        
                                                                      #Feature Bentuk Karakter dan Jenis kata
        'nama_entitas':sentence[index][0].upper() == sentence[index][0],
        'simbol': sentence[index].upper() == sentence[index][:2] or 'Rp' in sentence[index],
        'angka': sentence[index].isdigit() == sentence[index],
        'subjek': sentence[index].lower() == sentence[index] and ((sentence[index] not in tanda_baca) 
                                                                  or (sentence[index] not in simbol_matematika) or (sentence[index] not in kata_angka) 
                                                                  or (sentence[index] not in kata_angka) or (sentence[index] not in modal)
                                                                  or (sentence[index] not in negatif) or (sentence[index] not in kata_hubung)
                                                                  or (sentence[index] not in determiner) or (sentence[index] not in posisi)
                                                                  or (sentence[index] not in subjek_rujukan) or (sentence[index] not in keterangan)),
        'kata_angka': (sentence[index] in kata_angka) or (sentence[index][:2] in kata_angka) or (sentence[index][-3:] in kata_angka) 
                            or (sentence[index][:5] in kata_angka) or ((sentence[index][:2] in kata_angka) and (sentence[index][-3:] in kata_angka)) or ('pertama' in sentence[index]) ,
        'tanda-baca': sentence[index] in tanda_baca,
        'simbol_mtk': sentence[index] in simbol_matematika,
        'modal': sentence[index] in modal,
        'sambung': '-' in sentence[index],
        'kata-hubung': sentence[index] in kata_hubung,
        'determiner': sentence[index] in determiner,
        'posisi': sentence[index] in posisi,
        'rujukan': sentence[index] in subjek_rujukan,
        'keterangan': sentence[index] in keterangan or sentence[index][-3:] in keterangan,
        
        #awalan dan akhiran dari kata
        'prefix-1': sentence[index][0], #prefix adalah karakter awal dari kata
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:], #suffix adalah karakter akhir dari kata
        'suffix-3': sentence[index][-3:],
        
        #informasi context
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    }

In [4]:
def transform_to_dataset(sentences, tags): #ekstraksi feature
    X, y = [], []
    for sentence_idx in range(len(sentences)):
        for index in range(len(sentences[sentence_idx])):
            X.append(features(sentences[sentence_idx], index))
            y.append(tags[sentence_idx][index])
    return X, y

In [5]:
sentences, tags = read_dataset('Data_Train.txt') #membaca data train

In [6]:
cutoff = int(.70 * len(sentences))

#Memisahkan data train dengan data validasi (train: 70% dan validasi: 30%)
training_sentences = sentences[cutoff:] #data train
training_tags = tags[cutoff:]

test_sentences = sentences[:cutoff] #data validasi
test_tags = tags[:cutoff]

In [7]:
X, y = transform_to_dataset(training_sentences, training_tags) #Melakukan ekstrasi feature terhadap data train
print('data training ke-0 =')
print(X[0])
print('label training ke-0 =')
print(y[0])

data training ke-0 =
{'word': 'Dana', 'kata_pertama': True, 'kata_terakhir': False, 'nama_entitas': True, 'simbol': False, 'angka': False, 'subjek': False, 'kata_angka': False, 'tanda-baca': False, 'simbol_mtk': False, 'modal': False, 'sambung': False, 'kata-hubung': False, 'determiner': False, 'posisi': False, 'rujukan': False, 'keterangan': False, 'prefix-1': 'D', 'prefix-2': 'Da', 'prefix-3': 'Dan', 'suffix-1': 'a', 'suffix-2': 'na', 'suffix-3': 'ana', 'prev_word': '', 'next_word': 'pinjaman'}
label training ke-0 =
NN


In [8]:
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Melakukan klasifikasi dengan menggunakan naive bayes 
clf = Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', MultinomialNB())
    ])
clf.fit(X[:1030], y[:1030])

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [10]:
X_test, y_test = transform_to_dataset(test_sentences, test_tags) #Ekstraksi feature terhadap data validasi
print("Accuracy:")
print(clf.score(X_test, y_test)) #Menghitung score yang diperoleh setelah proses klasifikasi pada data validasi

Accuracy:
0.7956760877852203


## <center> B. Pengujian Model Data Train </center>

In [11]:
sentences_test, tag_test = read_dataset('Data_Test.txt')
kumpulan_tag = []
kumpulan_kata = []

for tag in tag_test:
    for i in range(len(tag)):
        kumpulan_tag.append(tag[i]) #mengambil tag dalam datatest
        
for sentence in sentences_test:
    for i in range(len(sentence)):
        kumpulan_kata.append(sentence[i]) #mengambil kata dalam datatest
print(kumpulan_tag)

['NN', 'NN', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'IN', 'CD', 'NN', 'OD', 'SC', 'VB', 'NNP', 'CD', 'VB', 'VB', 'NN', 'SYM', 'CD', 'CD', 'Z', 'CC', 'VB', 'CD', 'CD', 'IN', 'SYM', 'CD', 'CD', 'IN', 'NN', 'JJ', 'NN', 'CC', 'Z', 'NN', 'JJ', 'NN', 'VB', 'VB', 'IN', 'SYM', 'CD', 'CD', 'IN', 'NN', 'OD', 'CD', 'VB', 'SYM', 'CD', 'CD', 'IN', 'NN', 'OD', 'CD', 'Z', 'CC', 'NN', 'NN', 'IN', 'NN', 'PR', 'VB', 'IN', 'NN', 'SYM', 'CD', 'CD', 'VB', 'SYM', 'CD', 'CD', 'Z', 'IN', 'NN', 'JJ', 'Z', 'NN', 'NN', 'VB', 'VB', 'NN', 'JJ', 'JJ', 'NN', 'IN', 'SYM', 'CD', 'CD', 'IN', 'NN', 'OD', 'NN', 'CC', 'VB', 'SYM', 'CD', 'CD', 'IN', 'NN', 'OD', 'NN', 'PR', 'Z', 'SC', 'NN', 'PRP', 'MD', 'RB', 'VB', 'Z', 'IN', 'NN', 'CD', 'NNP', 'VB', 'VB', 'NN', 'JJ', 'SYM', 'CD', 'CD', 'Z', 'CD', 'CD', 'X', 'MD', 'VB', 'IN', 'NN', 'SC', 'NN', 'NN', 'CD', 'Z', 'SC', 'JJ', 'SYM', 'CD', 'CD', 'CC', 'SYM', 'CD', 'IN', 'NND', 'NN', 'Z', 'NN', 'NN', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'IN', 'NN', 'OD', 'CD', 'VB', 'VB', 'NN', 

In [12]:
hasil_tag = [] #menampung hasil prediksi tag dari hasil klasifikasi 
for sentence in sentences_test:
    pos_tag = clf.predict([features(sentence, index) for index in range(len(sentence))]) #ekstraksi feature dan klasifikasi data test
    for tag in pos_tag:
        hasil_tag.append(tag) #memasukkan seluruh tag hasil prediksi kedalam array

In [13]:
                                   #Perhitungan akurasi
jumlah_tag = 0
jumlah_salah = 0
akurasi = 0
for i in range(len(kumpulan_tag)):
    if hasil_tag[i] == kumpulan_tag[i]:
        jumlah_tag += 1
    else:
        jumlah_salah += 1
        print(str(jumlah_salah)+'.', 'Kata: '+ kumpulan_kata[i], 'Tag Test: '+hasil_tag[i], 'Tag Asli: '+kumpulan_tag[i])
#         print('---------------------')
print(jumlah_salah)
akurasi = (jumlah_tag/len(kumpulan_tag)) * 100
print(akurasi)

1. Kata: pertama Tag Test: NN Tag Asli: OD
2. Kata: lalu Tag Test: JJ Tag Asli: CC
3. Kata: bersih Tag Test: VB Tag Asli: JJ
4. Kata: pertama Tag Test: NN Tag Asli: OD
5. Kata: pertama Tag Test: NN Tag Asli: OD
6. Kata: namun Tag Test: NN Tag Asli: CC
7. Kata: beban Tag Test: VB Tag Asli: NN
8. Kata: bersih Tag Test: VB Tag Asli: JJ
9. Kata: pertama Tag Test: NN Tag Asli: OD
10. Kata: lalu Tag Test: JJ Tag Asli: CC
11. Kata: pertama Tag Test: NN Tag Asli: OD
12. Kata: sehingga Tag Test: NN Tag Asli: SC
13. Kata: tetap Tag Test: VB Tag Asli: RB
14. Kata: diantaranya Tag Test: PRP Tag Asli: X
15. Kata: untuk Tag Test: IN Tag Asli: SC
16. Kata: per Tag Test: NN Tag Asli: IN
17. Kata: lembar Tag Test: NN Tag Asli: NND
18. Kata: - Tag Test: Z Tag Asli: NNP
19. Kata: 8 Tag Test: CD Tag Asli: NNP
20. Kata: pertama Tag Test: NN Tag Asli: OD
21. Kata: sekitar Tag Test: JJ Tag Asli: IN
22. Kata: lalu Tag Test: JJ Tag Asli: CC
23. Kata: kotor Tag Test: NN Tag Asli: JJ
24. Kata: pertama Tag Test: 