In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preparation

## 1. Import Library


Berikut adalah library yang digunakan pada sistem ini.

In [2]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## 2. Load Dataset

Load data train, dan melakukan pemisahan 1 string tiap record menjadi 3 kolom yang berbeda.

In [3]:
train_data = pd.read_csv("drive/My Drive/Sequential Labelling/dataset/train.csv", encoding="utf-8",names=['chunks'])
train_data[['Words','POS','TAG']] = train_data["chunks"].str.split(" ", 2, expand=True)
train_data = train_data.drop(columns=['chunks'])
train_data

Unnamed: 0,Words,POS,TAG
0,Confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP
...,...,...,...
88184,the,DT,B-NP
88185,scorecard,NN,I-NP
88186,.,.,O
88187,,,


Load data test, dan melakukan pemisahan 1 string tiap record menjadi 3 kolom yang berbeda.

In [4]:
test_data = pd.read_csv("drive/My Drive/Sequential Labelling/dataset/test.csv", encoding="utf-8",names=['chunks'])
test_data[['Words','POS','TAG']] = test_data["chunks"].str.split(" ", 2, expand=True)
test_data = test_data.drop(columns=['chunks'])
test_data

Unnamed: 0,Words,POS,TAG
0,Rockwell,NNP,B-NP
1,International,NNP,I-NP
2,Corp.,NNP,I-NP
3,'s,POS,B-NP
4,Tulsa,NNP,I-NP
...,...,...,...
49384,to,TO,B-PP
49385,Mr.,NNP,B-NP
49386,Harlow,NNP,I-NP
49387,.,.,O


## 3. Data Preprocessing

Diketahui data train dan test masih memiliki record yang null. Maka lakukan cleaning.

In [5]:
print("Null Train:")
print(train_data.isnull().sum())
print()
print("Null Test:")
print(test_data.isnull().sum())

Null Train:
Words    3572
POS      3572
TAG      3572
dtype: int64

Null Test:
Words    2012
POS      2012
TAG      2012
dtype: int64


Dibawah ini adalah proses untuk mendrop record yang null dan mereset index.

In [6]:
train_data.dropna(subset=['Words','POS','TAG'], inplace=True)
train_data = train_data.reset_index()

test_data.dropna(subset=['Words','POS','TAG'], inplace=True)
test_data = test_data.reset_index()

Dibawah ini adalah proses untuk mendrop record yang mengandung string "O", karena pada dataset terdapat pemisah sentence.

In [7]:
train_data = train_data[~train_data.TAG.str.contains("O")]
train_data = train_data.reset_index()

test_data = test_data[~test_data.TAG.str.contains("O")]
test_data = test_data.reset_index()

In [8]:
print("Jumlah Train :",np.shape(train_data)[0])
print("Jumlah Test :",np.shape(test_data)[0])

Jumlah Train : 73315
Jumlah Test : 41175


Dibawah ini adalah proses menggabungkan kolom 1 dan 2 pada dataset untuk menjadi 1 string di kolom yang sama.

In [9]:
dTrain = [[str(train_data['Words'][i])+"_"+str(train_data['POS'][i])] for i in range(0,len(train_data))]
dTest = [[str(test_data['Words'][i])+"_"+str(test_data['POS'][i])] for i in range(0,len(test_data))]

# FastText() Modelling

Berikut adalah proses building vocabulary menggunakan library FastText(). Epoch sebesar 30, dan ukuran dimensi vektor sebesar 10.

In [10]:
vec_size = 10

model = FastText(size=vec_size,min_count=1)
model.build_vocab(dTrain)
model.train(dTrain, total_examples=model.corpus_count, epochs=30)

Dari hasil building vocabulary, terdapat 11923 kata yang berbeda dalam vocabulary.

In [11]:
len(model.wv.vocab)

11923

Berikut adalah fungsi untuk menggenerate vektor dari hasil building vocabulary.

In [12]:
def get_vec(model,data,lab):
  vector = []
  tag = []
  for i in range(0,len(data)):
    vector.append(model.wv[data[i]])
    tag.append(lab[i])
  return vector,tag

In [13]:
train_vec,train_tag = get_vec(model,dTrain,train_data.TAG)
test_vec,test_tag = get_vec(model,dTest,test_data.TAG)

Sebagai informasi, berikut adalah label label yang terdapat pada masing-masing jenis dataset.

In [14]:
print("label train : ",list(set(train_tag)))
print("label test  : ",list(set(test_tag)))

label train :  ['B-PP', 'B-VP', 'I-NP', 'B-NP', 'I-ADVP', 'I-ADJP', 'B-INTJ', 'I-VP', 'I-SBAR', 'I-PRT', 'I-PP', 'B-ADJP', 'B-PRT', 'B-LST', 'I-INTJ', 'B-SBAR', 'B-ADVP']
label test  :  ['I-NP', 'B-VP', 'B-PP', 'B-NP', 'I-ADJP', 'I-ADVP', 'B-INTJ', 'I-LST', 'I-VP', 'I-SBAR', 'I-PP', 'B-ADJP', 'B-PRT', 'B-LST', 'B-SBAR', 'B-ADVP']


# Training, Validating & Predicting

Sebelum masuk ke tahap training, validating dan predicting. Lakukan terlebih dahulu split data train untuk membagi data train kedalam data train baru dan data validasi. Data validasi sebesar 10% dari data train.

In [15]:
X_train,X_valid,y_train,y_valid = train_test_split(train_vec, train_tag, test_size=0.1, shuffle=True)

In [16]:
X_train = np.reshape(X_train,(len(X_train),vec_size))
X_valid = np.reshape(X_valid,(len(X_valid),vec_size))
X_test = np.reshape(test_vec,(len(test_vec),vec_size))

Berikut adalah tahap training, validating dan predicting dengan menggunakan metode K-NN dengan K sebesar 10.

In [17]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [18]:
training1 = knn.predict(X_train)
print('Training accuracy (KNN) : %.2f' % (accuracy_score(training1, y_train)*100),"%")
#
y_pred_valid1 = knn.predict(X_valid)
print('Validation accuracy (KNN) : %.2f' % (accuracy_score(y_valid, y_pred_valid1)*100),"%")
# 
y_pred_test1 = knn.predict(X_test)
print('Test accuracy (KNN) : %.2f' % (accuracy_score(test_tag, y_pred_test1)*100),"%")

Training accuracy (KNN) : 77.07 %
Validation accuracy (KNN) : 73.28 %
Test accuracy (KNN) : 72.76 %


Berikut adalah tahap training, validating dan predicting dengan menggunakan metode Random Forest dengan jumlah pohon sebanyak 50.

In [19]:
RF = RandomForestClassifier(n_estimators=50,random_state=42)
RF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [20]:
training_2 = RF.predict(X_train)
print('Training accuracy (Random Forest) : %.2f' % (accuracy_score(training_2, y_train)*100),"%")
#
y_pred_valid2 = RF.predict(X_valid)
print('Validation accuracy (Random Forest) : %.2f' % (accuracy_score(y_valid, y_pred_valid2)*100),"%")

y_pred_test2 = RF.predict(X_test)
print('Test accuracy (Random Forest) : %.2f' % (accuracy_score(test_tag, y_pred_test2)*100),"%")

Training accuracy (Random Forest) : 86.49 %
Validation accuracy (Random Forest) : 79.43 %
Test accuracy (Random Forest) : 78.42 %
