In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Library Import

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
from sklearn.metrics import accuracy_score
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from difflib import SequenceMatcher

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Untuk membangun model POS Tag menggunakan machine learning, dibutuhkan sebuah dataset yang mengandung label POS. Dataset didapatkan dari : https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus?select=ner_dataset.csv

In [3]:
train_data = pd.read_csv("drive/My Drive/NLP/Dataset/pos_dataset.csv", encoding="Latin-1")
train_data

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


Code dibawah ini adalah untuk membuat list baru berisi kata dari setiap record dataset yang dilakukan lowercase.

In [4]:
dTrain = [[str.lower(train_data['Word'][i])] for i in range(len(train_data))]

Pembangunan model FastText() :

In [5]:
vec_size = 5

model = FastText(size=vec_size,min_count=1)
model.build_vocab(dTrain)
model.train(dTrain, total_examples=model.corpus_count, epochs=30)

Fungsi get_vec() dibawah ini adalah fungsi untuk menggenerate vektor fitur berdasarkan proses training model FastText sebelumnya.

In [6]:
def get_vec(model,data,lab):
  vector = []
  tag = []
  for i in range(0,len(data)):
    vector.append(model.wv[data[i]])
    tag.append(lab[i])
  return vector,tag

In [7]:
X_train,y_train = get_vec(model,dTrain,train_data.POS)

Split data, data test menjadi 20%.

In [8]:
Xn_train, X_test, yn_train, y_test = train_test_split(X_train,y_train, test_size=0.2,shuffle=True)

Reshape array sebelum dilakukan fit/training dengan algoritma Random Forest

In [9]:
Xn_train = np.reshape(Xn_train,(len(Xn_train),vec_size))
X_test = np.reshape(X_test,(len(X_test),vec_size))

Training dengan menggunakan Random Forest

In [10]:
RF = RandomForestClassifier(n_estimators=50,random_state=42)
RF.fit(Xn_train, yn_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [11]:
training = RF.predict(Xn_train)
print('Akurasi Training : %.2f' % (accuracy_score(training, yn_train)*100),"%")
#
y_pred_test = RF.predict(X_test)
print('Akurasi Testing : %.2f' % (accuracy_score(y_test, y_pred_test)*100),"%")

Akurasi Training : 93.59 %
Akurasi Testing : 92.25 %


Fungsi dibawah ini bertujuan untuk mendapatkan question type dari setiap pertanyaan berdasarkan label POS.

In [12]:
def question_type_extraction(postag):
  questype = ""
  for i in range(0,len(postag)):
    if postag[i][1]=="WDT" or postag[i][1]=="WP" or postag[i][1]=="WP$" or postag[i][1]=="WRB":
      if (questype.find(postag[i][0]) == -1):
        questype += postag[i][0] +","
  if len(questype)!=0 and questype[len(questype)-1] == ",":
    questype = questype[:-1]
  if len(questype)==0:
    questype = "Unknown"
  return questype

Fungsi dibawah ini bertujuan untuk mendapatkan keyword dari setiap pertanyaan berdasarkan label POS.

In [13]:
def extract_keyword(postag):
  keyword = ""
  for i in range(0,len(postag)):
    if postag[i][1]=="NNP" or postag[i][1]=="NNS" or postag[i][1]=="NN" or (postag[i][1]=="JJ" and postag[i][0].lower()!="many") or postag[i][1]=="CD" or postag[i][1]=="RBS" or (postag[i][1]=="VBN" and postag[i][1]!="been") or (postag[i][1]=="VBD" and postag[i][0].lower()!="was" and postag[i][0].lower()!="were") or postag[i][1]=="VBG" or (postag[i][1]=="VB" and postag[i][0].lower()!="be") or postag[i][1]=="RB":
      keyword += postag[i][0] +" "
  if len(keyword)!=0 and keyword[len(keyword)-1] == " ":
    keyword = keyword[:-1]
  return keyword

Fungsi dibawah ini bertujuan untuk mendapatkan label POS dengan menggunakan model FastText dan Random Forest.

In [14]:
def get_pos_tag(token_word):
  postag = []
  for j in range(len(token_word)):
    try:
      vec = model.wv[token_word[j]]
      labelpos = RF.predict([vec])[0]
    except:
      labelpos = "NN"
    postag.append([token_word[j],labelpos])
  return postag

Fungsi dibawah ini bertujuan untuk menggenerate keyword dan question type dari setiap pertanyaan dengan memanggil fungsi-fungsi sebelumnya.

In [15]:
 def question_understanding(dataset,modeltype):
  keywords = []
  qtype = []

  for i in range(0,len(dataset)):
    token_word = word_tokenize(dataset.question[i].lower())
    if modeltype=="nltk":
      postag = pos_tag(token_word)
    elif modeltype=="fasttext":
      postag = get_pos_tag(token_word)
    keywords.append(extract_keyword(postag))
    qtype.append(question_type_extraction(postag))
  
  output = pd.DataFrame({"keywords":np.array(keywords),"question_type":np.array(qtype)})

  return output

Load dataset COVID-QA

In [16]:
data = pd.read_json("/content/drive/MyDrive/NLP/Dataset/COVID-QA.json")
data = pd.json_normalize(data["data"],record_path='paragraphs')
new_data = pd.DataFrame(0,index=np.arange(2019),columns=["document_id","context","id_question","question","answers_start","answers_text","is_impossible"])
idx=0
for x in range(0,len(data["qas"])):
  for y in range(0,len(data["qas"][x])):
    new_data["document_id"].iloc[idx] = data["document_id"][x]
    new_data["context"].iloc[idx] = data["context"][x]
    new_data["id_question"].iloc[idx] = data["qas"][x][y]["id"]
    new_data['question'].iloc[idx] = data["qas"][x][y]["question"]
    new_data['answers_start'].iloc[idx] = data["qas"][x][y]["answers"][0]["answer_start"]
    new_data['answers_text'].iloc[idx] = data["qas"][x][y]["answers"][0]["text"]
    new_data['is_impossible'].iloc[idx] = data["qas"][x][y]["is_impossible"]
    idx+=1
new_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,document_id,context,id_question,question,answers_start,answers_text,is_impossible
0,630,Functional Genetic Variants in DC-SIGNR Are As...,262,What is the main cause of HIV-1 infection in c...,370,Mother-to-child transmission (MTCT) is the mai...,False
1,630,Functional Genetic Variants in DC-SIGNR Are As...,276,What plays the crucial role in the Mother to C...,2003,DC-SIGNR plays a crucial role in MTCT of HIV-1...,False
2,630,Functional Genetic Variants in DC-SIGNR Are As...,278,How many children were infected by HIV-1 in 20...,2291,"more than 400,000 children were infected world...",False
3,630,Functional Genetic Variants in DC-SIGNR Are As...,316,What is the role of C-C Motif Chemokine Ligand...,28143,"High copy numbers of CCL3L1, a potent HIV-1 su...",False
4,630,Functional Genetic Variants in DC-SIGNR Are As...,305,What is DC-GENR and where is it expressed?,3207,Dendritic cell-specific ICAM-grabbing non-inte...,False
...,...,...,...,...,...,...,...
2014,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",5315,What is the structure of the Ebolavirus?,2270,single-strand RNA filoviruses,False
2015,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",5316,When was the West African Ebolavirus outbreak?,2546,2013-2016,False
2016,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",5317,What animals are considered to be maintenance ...,4083,African bats,False
2017,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",5318,What do circles indicate in Figure 1?,7212,a maintenance function play by the host(s),False


In [17]:
qu_nltk = question_understanding(new_data,"nltk")
qu_fasttext = question_understanding(new_data,"fasttext")

Berikut adalah hasil proses question understanding dengan menggunakan model POS Tag dari NLTK, berupa perolehan keyword dan question type pada dataset.

In [18]:
qu_nltk

Unnamed: 0,keywords,question_type
0,main cause hiv-1 infection children,what
1,crucial role mother child transmission hiv-1 risk,what
2,children infected hiv-1 2008-2009 worldwide,how
3,role c-c motif 3 1 ccl3l1 mother child transmi...,what
4,dc-genr expressed,"what,where"
...,...,...
2014,structure ebolavirus,what
2015,west african ebolavirus outbreak,when
2016,animals considered maintenance hosts ebolavirus,what
2017,circles indicate figure 1,what


Berikut adalah hasil proses question understanding dengan menggunakan model FastText dan Random Forest, berupa perolehan keyword dan question type pada dataset.

In [19]:
qu_fasttext

Unnamed: 0,keywords,question_type
0,main cause hiv-1 infection children,what
1,crucial role mother child transmission hiv-1 i...,what
2,children infected hiv-1 2008-2009 worldwide,how
3,role c-c motif chemokine ligand 3 1 ccl3l1 mot...,what
4,dc-genr expressed,"what,where"
...,...,...
2014,structure ebolavirus,what
2015,west african ebolavirus outbreak,when
2016,animals considered maintenance ebolavirus,what
2017,circles figure 1,what


Berikut adalah kemiripan keyword secara mutlak. Maksudnya adalah penghitugan kemiripan / keakuratan keyword yang dihasilkan oleh NLTK dan FastText secara persis. Contohnya sebagai berikut.

String A = "ABC DEF GHI" <br>
String B = "ABC GHI" <br>

Maka, antara dua string tersebut dinyatakan tidak sama dan tidak mengkalkulasi persentase kemiripan.

In [20]:
print('Kemiripan keyword secara mutlak : %.2f' % (accuracy_score(qu_nltk.keywords, qu_fasttext.keywords)*100),"%")

Kemiripan keyword secara mutlak : 78.01 %


Ada paradigma/bentuk lain untuk melihat kemiripan keyword yang dihasilkan oleh 2 model tersebut. Bisa dilakukan dengan keyword similarity. Similarity yang digunakan adalah string similarity menggunakan library SequenceMatcher dari difflib. Contohnya sebagai berikut.

In [21]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [22]:
String1 = "ABC DEF GHI"
String2 = "ABC GHI"
print("Text 1 :",String1)
print("Text 2 :",String2)
print("Similarity text 1 dan 2:  %.2f" % (similar(String1,String2)))
print()
String3 = "coronavirus firstly spread"
String4 = "corona first spread"
print("Text 3 :",String3)
print("Text 4 :",String4)
print("Similarity text 3 dan 4 :  %.2f" % (similar(String3,String4)))
print()
print("Similarity text 1 dan 3 :  %.2f" % (similar(String1,String3)))

Text 1 : ABC DEF GHI
Text 2 : ABC GHI
Similarity text 1 dan 2:  0.78

Text 3 : coronavirus firstly spread
Text 4 : corona first spread
Similarity text 3 dan 4 :  0.84

Similarity text 1 dan 3 :  0.11


Untuk setiap similarity yang dihasilkan akan dijumlahkan dan dibagi dengan jumlah data. Berikut adalah hasil kemiripan keyword berdasarkan keyword similarity.

In [23]:
sum = 0
for i in range(len(qu_nltk)):
  
  sum+=similar(qu_nltk.keywords[i],qu_fasttext.keywords[i])

print('Kemiripan keyword dengan mempertimbangkan keyword similarity : %.2f' % (sum/2019*100),"%")

Kemiripan keyword dengan mempertimbangkan keyword similarity : 97.44 %


In [24]:
print('Kemiripan jenis pertanyaan: %.2f' % (accuracy_score(qu_nltk.question_type, qu_fasttext.question_type)*100),"%")

Kemiripan jenis pertanyaan: 99.06 %


Sebagai pengujian, berikut adalah contoh pertanyaan yang dilakukan proses question understanding dengan model NLTK dan FastText+Random Forest.

In [25]:
test_input = "When is the coronavirus started from and where is it? "

In [26]:
quest_token = word_tokenize(test_input)
postag_question = pos_tag(quest_token)
quest_key = extract_keyword(postag_question)
quest_type =  question_type_extraction(postag_question)
print("PERTANYAAN  : ",test_input)
print("1. POS Tag dengan NLTK")
print("Keyword     : ",quest_key)
print("Quest. Type : ",quest_type)
print()
postag_question2 = get_pos_tag(quest_token)
quest_key2 = extract_keyword(postag_question2)
quest_type2 =  question_type_extraction(postag_question2)
print("2. POS Tag dengan model FastText dan Random Forest")
print("Keyword     : ",quest_key2)
print("Quest. Type : ",quest_type2)

PERTANYAAN  :  When is the coronavirus started from and where is it? 
1. POS Tag dengan NLTK
Keyword     :  coronavirus started
Quest. Type :  When,where

2. POS Tag dengan model FastText dan Random Forest
Keyword     :  When coronavirus started
Quest. Type :  where


Secara kontekstual, model NLTK lebih presisi dari segi ketepatan. Maka dari itu, untuk proses/modul Question Understanding di program utama akan digunakan pelabelan POS dengan library NLTK.