<a href="https://colab.research.google.com/github/crescendonow/thai_geoparsing/blob/main/toponym_colab/IJG_1_CRF_pythainlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tqdm
!pip install pickle-mixin
!pip install sklearn_crfsuite
!pip install pythainlp[full]

In [None]:
#library for use
from tqdm import tqdm
import pandas as pd
import re
import csv
import pickle
import codecs
from itertools import chain

#nlp library
from pythainlp import word_tokenize, Tokenizer
from pythainlp.tag import pos_tag, pos_tag_sents
from pythainlp.corpus.common import thai_stopwords
stopwords = thai_stopwords()
from pythainlp.util import isthai, isthaichar, normalize
from nltk.tokenize import RegexpTokenizer

#Manage data with scikit-learn
import sklearn_crfsuite
from collections import Counter
from sklearn_crfsuite import CRF
from sklearn_crfsuite import scorers,metrics
from sklearn.metrics import make_scorer, classification_report
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_validate, train_test_split

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive paths used by
# the data-loading and model-saving cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pickled training and test corpora from Google Drive.
# NOTE(review): later cells unpack every sentence item as (token, pos, label) —
# confirm the pickled files follow that layout.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/content/drive/MyDrive/dissertation/TEXT DATA/TrainingData/1.Tokennization/train_attacut_cl.data', 'rb') as token:
    train_data = pickle.load(token)
with open('/content/drive/MyDrive/dissertation/TEXT DATA/TrainingData/1.Tokennization/test_attacut_cl.data', 'rb') as token2:
    test_data = pickle.load(token2)

In [None]:
#feature from word space 
def is_space(word):
    """Return True when `word` is a single space, a tab, or the empty string."""
    return word in (' ', '\t', '')
    
def is_stopword(word):
    """Return True when `word` is a member of the module-level `stopwords` collection."""
    found = word in stopwords
    return found

In [None]:
def _token_features(word, pos, prefix=''):
    """Build the feature entries describing one token.

    Every key is prefixed with `prefix` ('' for the current token, '-1:' for
    the previous one, '+1:' for the next one) so all three positions are
    produced by the same code and cannot drift apart.
    """
    return {
        prefix + 'word': word,
        prefix + 'word.is_stopword': is_stopword(word),
        prefix + 'word.isthai': isthai(word),
        prefix + 'word.is_space()': is_space(word),
        prefix + 'postag': pos,
        prefix + 'word.isdigit()': word.isdigit(),
    }


def word_features(sentence,i):
    """Return the CRF feature dict for the token at position `i` of `sentence`.

    Parameters
    ----------
    sentence : sequence
        Items are indexable with the token text at index 0 and its POS tag at
        index 1 (any further elements, such as the label, are ignored here).
    i : int
        Position of the token to describe.

    Returns
    -------
    dict
        Features of the current token plus, when they exist, the same
        features of the previous ('-1:' keys) and next ('+1:' keys) tokens.
        'BOS' / 'EOS' are set to True on the first / last token instead.

    Notes
    -----
    The neighbour features are computed from the neighbour itself: the
    '-1:word.isdigit()' and '+1:word.isdigit()' entries test the previous and
    next word, and '+1:postag' holds the next token's POS tag (index 1).
    A model trained on differently computed features must be retrained.
    """
    word = sentence[i][0]
    pos = sentence[i][1]

    # Features of the current token.
    features = {'bias': 1.0}
    features.update(_token_features(word, pos))

    # Left context, or a beginning-of-sentence marker.
    if i > 0:
        features.update(_token_features(sentence[i-1][0], sentence[i-1][1], '-1:'))
    else:
        features['BOS'] = True

    # Right context, or an end-of-sentence marker.
    if i < len(sentence)-1:
        features.update(_token_features(sentence[i+1][0], sentence[i+1][1], '+1:'))
    else:
        features['EOS'] = True

    return features
                
#return feature dictionary for each word
def sentence_features(sentence):
    """Return one feature dict per token of `sentence`, in token order."""
    feature_rows = []
    for position in range(len(sentence)):
        feature_rows.append(word_features(sentence, position))
    return feature_rows

#return the label NER tags
def sentence_labels(sentence):
    """Collect the label (third element) of every (token, pos, label) triple."""
    collected = []
    for _token, _pos, label in sentence:
        collected.append(label)
    return collected

#return token 
def sentence_tokens(sentence):
    """Collect the token text (first element) of every (token, pos, label) triple."""
    collected = []
    for token, _pos, _label in sentence:
        collected.append(token)
    return collected

In [None]:
%%time
# Model inputs: for every sentence, a list with one feature dict per token.
X = [sentence_features(sentence) for sentence in tqdm(train_data)]
X_test = [sentence_features(sentence) for sentence in tqdm(test_data)]

# Targets: the label sequence of every sentence, aligned with X / X_test.
y = [sentence_labels(sentence) for sentence in tqdm(train_data)]
y_test = [sentence_labels(sentence) for sentence in tqdm(test_data)]

# Token text of every sentence (not referenced again by the cells visible here).
Train_token = [sentence_tokens(sentence) for sentence in tqdm(train_data)]
Test_tokens = [sentence_tokens(sentence) for sentence in tqdm(test_data)]




  0%|          | 0/22445 [03:25<?, ?it/s]



  3%|▎         | 583/22445 [00:00<00:03, 5825.44it/s][A[A[A


  5%|▌         | 1166/22445 [00:00<00:03, 5370.72it/s][A[A[A


  8%|▊         | 1706/22445 [00:00<00:03, 5370.43it/s][A[A[A


 10%|█         | 2273/22445 [00:00<00:03, 5481.33it/s][A[A[A


 13%|█▎        | 2867/22445 [00:00<00:03, 5642.07it/s][A[A[A


 15%|█▌        | 3433/22445 [00:00<00:03, 5424.70it/s][A[A[A


 18%|█▊        | 3978/22445 [00:00<00:03, 5173.82it/s][A[A[A


 20%|██        | 4499/22445 [00:00<00:03, 4987.33it/s][A[A[A


 22%|██▏       | 5001/22445 [00:00<00:03, 4824.58it/s][A[A[A


 25%|██▍       | 5526/22445 [00:01<00:03, 4945.30it/s][A[A[A


 27%|██▋       | 6024/22445 [00:01<00:03, 4954.97it/s][A[A[A


 29%|██▉       | 6540/22445 [00:01<00:03, 5012.22it/s][A[A[A


 31%|███▏      | 7048/22445 [00:01<00:03, 5031.95it/s][A[A[A


 34%|███▎      | 7553/22445 [00:01<00:03, 4902.59it/s][A[A[A


 36%|███▌      | 8072/22445 [

CPU times: user 4.97 s, sys: 789 ms, total: 5.76 s
Wall time: 6.07 s





In [None]:
%%time
# Train a CRF model with L-BFGS.
# Per the sklearn-crfsuite documentation, c1 and c2 are the L1 and L2
# regularization coefficients, and all_possible_transitions=True also creates
# transition features for label pairs that never occur in the training data.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=500,
          all_possible_transitions=True,
          verbose=True)
# Fit on the training features X and their label sequences y.
crf.fit(X, y)

loading training data to CRFsuite: 100%|██████████| 22445/22445 [00:05<00:00, 3788.55it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 169881
Seconds required: 1.101

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=6.39  loss=836948.09 active=169661 feature_norm=1.00
Iter 2   time=3.28  loss=314019.62 active=169503 feature_norm=10.51
Iter 3   time=3.33  loss=301539.00 active=166343 feature_norm=10.15
Iter 4   time=45.96 loss=156177.06 active=51756 feature_norm=3.82
Iter 5   time=6.66  loss=154419.73 active=107190 feature_norm=5.99
Iter 6   time=3.33  loss=134715.17 active=95762 feature_norm=5.52
Iter 7   time=20.13 loss=100369.52 active=91270 feature_norm=4.57
Iter 8   time=22.83 loss=96193.66 active=65182 feature_norm=4.39
Iter 9   time=17.93 loss=93274.94 active=88517 feature_norm=4.43


In [None]:
# Labels to score: every class the CRF learned except the non-entity tag 'O'
# and 'B-DMIN'.
# NOTE(review): 'B-DMIN' is presumably a mistyped 'B-ADMIN' in the annotations — confirm.
# Filtering (instead of list.remove) keeps the original label order and does not
# raise ValueError when one of the excluded tags is absent from crf.classes_.
labels = [label for label in crf.classes_ if label not in ('O', 'B-DMIN')]

In [None]:
def print_transitions(trans_features):
    """Print one "from -> to weight" line per ((label_from, label_to), weight) item."""
    line_template = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        source_label, target_label = transition
        print(line_template % (source_label, target_label, weight))

# Wrap the learned transition weights in a Counter once and reuse it for both rankings.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
I-RES  -> I-RES   7.644382
I-GOV  -> I-GOV   7.603610
I-STORE -> I-STORE 7.562329
I-HP   -> I-HP    7.489074
I-RP   -> I-RP    7.422316
I-MKT  -> I-MKT   7.331117
B-RES  -> I-RES   7.315994
I-RCT  -> I-RCT   7.267376
B-MKT  -> I-MKT   7.193755
I-FPLACE -> I-FPLACE 7.179230
I-TRAN -> I-TRAN  7.171256
B-HP   -> I-HP    7.147498
I-BSN  -> I-BSN   7.147008
B-BSN  -> I-BSN   7.146017
I-ACP  -> I-ACP   7.085294
B-ROAD -> I-ROAD  7.046454
I-ADMIN -> I-ADMIN 7.039824
B-RCT  -> I-RCT   7.032365
B-STORE -> I-STORE 7.028306
B-RP   -> I-RP    6.971872

Top unlikely transitions:
I-RT   -> B-ROAD  -1.541919
I-RT   -> B-ADMIN -2.013767
O      -> I-MON   -2.373073
O      -> I-OTHER -2.924190
O      -> I-ADMIN -3.236720
O      -> I-NAT   -3.429585
O      -> I-MKT   -3.449455
O      -> I-FPLACE -3.573813
O      -> I-HP    -3.720789
O      -> I-TRAN  -3.788105
O      -> I-GOV   -3.894882
O      -> I-DEP   -4.037931
O      -> I-ACP   -4.076842
O      -> I-STORE -4.094445
O      -> 

In [None]:
def print_state_features(state_features):
    """Print one "weight label attribute" line per ((attr, label), weight) item."""
    line_template = "%0.6f %-8s %s"
    for state, weight in state_features:
        attribute, label = state
        print(line_template % (weight, label, attribute))

# Wrap the learned state-feature weights in a Counter once and reuse it for both rankings.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])

Top positive:
8.252221 B-ADMIN  word:เชียงใหม่
7.714003 B-TRAN   word:สนามบินดอนเมือง
7.606032 B-RCT    word:ราชมัง
7.386334 B-ACP    word:ม.เกษตร 
7.156945 B-NAT    word:แม่น้ำเจ้าพระยา
7.144025 B-HP     word:โรงพยาบาลเกษมราษฎร์ รามคําแหง
7.021909 B-FPLACE word:จ.ชิบะ
6.982005 O        word:มุ่งหน้า
6.854342 B-RCT    word:ซาฟารี
6.719107 B-TRAN   word:ท่าอากาศยานสุวรรณภูมิ
6.689090 B-DEP    word:Siam_Paragon
6.658530 B-DEP    word:centralwOrld
6.534752 B-DEP    word:CentralPlaza Lardprao
6.512401 B-ADMIN  word:หัวหิน
6.486530 B-DEP    word:ICONSIAM 
6.453681 B-HP     word:โรงพยาบาลปิยะเวท
6.439105 B-MKT    word:จตุจักร
6.305431 B-FPLACE word:Berlin
6.259327 B-DEP    word:IKEA Bangna
6.204556 B-RP     word:วัดทุ่งครุ
6.114689 B-FPLACE word:เชียงตุง
6.092116 B-ACP    word:Kasetsart University
6.085799 B-ACP    word:มศว
6.031750 B-RCT    word:อิมแพค
6.000830 B-BSN    word:เมืองทองธานี
5.952974 B-ADMIN  word:หนองแขม
5.950570 B-ADMIN  word:บางแค
5.942878 B-ROAD   word:ประชาชื่น
5.918649 B-

In [None]:
# Predict a label sequence for every test sentence, then show the weighted
# F1 restricted to `labels` as the cell's displayed value.
y_pred = crf.predict(X_test)
weighted_f1 = metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')
weighted_f1

0.8879567852116683

In [None]:
def ner_classification_report(y_true, y_pred):
    """Print a token-level classification report for the entity tags.

    Both arguments are sequences of label sequences. They are flattened and
    binarized with a LabelBinarizer fitted on the true labels only. The tags
    'O' and 'B-DMIN' are left out of the report; the remaining tags are
    ordered by entity type and then by prefix, and that tag list is printed
    before the report itself.
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(flat_true)
    y_pred_combined = lb.transform(flat_pred)

    # Sort on [type, prefix] so the B- and I- tags of a type sit next to each other.
    reported_tags = set(lb.classes_).difference({'O','B-DMIN'})
    tagset = sorted(reported_tags, key=lambda tag: tag.split('-', 1)[::-1])
    print(tagset)

    # Column position of every class in the binarized matrices.
    class_indices = dict(zip(lb.classes_, range(len(lb.classes_))))
    report_columns = [class_indices[cls] for cls in tagset]

    report = classification_report(
        y_true_combined,
        y_pred_combined,
        labels=report_columns,
        target_names=tagset,
        digits=4,
    )
    print(report)

In [None]:
# Print the token-level classification report for the test-set predictions.
ner_classification_report(y_test,y_pred)

['B-ACP', 'I-ACP', 'B-ADMIN', 'I-ADMIN', 'B-BSN', 'I-BSN', 'B-DEP', 'I-DEP', 'B-FPLACE', 'I-FPLACE', 'B-GOV', 'I-GOV', 'B-HP', 'I-HP', 'B-MKT', 'I-MKT', 'B-MON', 'I-MON', 'B-NAT', 'I-NAT', 'B-OTHER', 'I-OTHER', 'B-RCT', 'I-RCT', 'B-RES', 'I-RES', 'B-ROAD', 'I-ROAD', 'B-RP', 'I-RP', 'B-RT', 'I-RT', 'B-STORE', 'I-STORE', 'B-TRAN', 'I-TRAN']
              precision    recall  f1-score   support

       B-ACP     0.9655    0.7568    0.8485        37
       I-ACP     0.9720    0.9905    0.9811       105
     B-ADMIN     0.9018    0.8026    0.8493       309
     I-ADMIN     0.8077    0.9403    0.8690        67
       B-BSN     0.9444    0.6538    0.7727        26
       I-BSN     0.9375    0.8929    0.9146        84
       B-DEP     0.9737    0.7184    0.8268       103
       I-DEP     0.9726    0.8452    0.9045       168
    B-FPLACE     0.8246    0.4747    0.6026        99
    I-FPLACE     1.0000    0.8519    0.9200        27
       B-GOV     0.9444    0.6800    0.7907        25
       I-G

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def chunk_extract(tag_list, sentence_id):
    """Extract the entity chunks of one BIO-tagged sentence.

    A chunk starts at every tag beginning with 'B-' and extends over the
    directly following 'I-' tags of the same entity type. Each chunk is
    reported exactly once, including when the next tag is another 'B-' tag,
    an 'I-' tag of a different type, or when the chunk reaches the end of the
    sentence.

    Parameters
    ----------
    tag_list : sequence of str
        The BIO tags of one sentence.
    sentence_id : hashable
        Identifier stored in every returned tuple.

    Returns
    -------
    list of tuple
        (sentence_id, start, end, ner_type) per chunk, where `start` is the
        index of the 'B-' tag and `end` is exclusive (one past the last token
        of the chunk).

    >>> chunk_extract(['O', 'B-PER', 'I-PER', 'B-ORG', 'O'], 1)
    [(1, 1, 3, 'PER'), (1, 3, 4, 'ORG')]
    """
    nes = []
    total = len(tag_list)
    for start, tag in enumerate(tag_list):
        if not tag.startswith('B-'):
            continue
        ner_type = tag[2:]
        inside_tag = 'I-{}'.format(ner_type)
        # Check the bound before indexing so a chunk may run to the last token.
        end = start + 1
        while end < total and tag_list[end] == inside_tag:
            end += 1
        nes.append((sentence_id, start, end, ner_type))
    return nes

In [None]:
# One integer id per test sentence: its position in y_test.
sentence_id = list(range(len(y_test)))

In [None]:
# Gather the chunks of every gold label sequence into one set; each chunk tuple
# carries its sentence id, so chunks from different sentences stay distinct.
chunk_test = set()
# NOTE(review): error_id is not used by any cell visible here.
error_id = []
for sent,idx in tqdm(zip(y_test,sentence_id)):
    ch = chunk_extract(sent,idx)
    chunk_test.update(ch)

5617it [00:00, 212223.95it/s]


In [None]:
# Gather the chunks of every predicted label sequence into one set, using the
# same sentence ids as for the gold chunks so the two sets are comparable.
chunk_pred = set()
# NOTE(review): error_id is not used by any cell visible here.
error_id = []
for sent,idx in tqdm(zip(y_pred,sentence_id)):
    ch = chunk_extract(sent,idx)
    chunk_pred.update(ch)

5617it [00:00, 176649.61it/s]


In [None]:
def evaluation_phrase(true,prediction):
    """Print and return phrase-level precision, recall and F1.

    A predicted chunk counts as correct only when an identical item exists in
    `true`.

    Parameters
    ----------
    true : set
        Gold chunks; must provide `.intersection`.
    prediction : sized iterable (normally a set)
        Predicted chunks.

    Returns
    -------
    list
        [precision, recall, f1], each rounded to 3 decimals. A score whose
        denominator is zero (no predictions, no gold chunks, or no overlap)
        is reported as 0.0 instead of raising ZeroDivisionError.
    """
    total_correct = len(true.intersection(prediction))
    total_predict = len(prediction)
    total_true = len(true)

    precision = total_correct/total_predict if total_predict else 0.0
    recall = total_correct/total_true if total_true else 0.0
    denominator = precision + recall
    f1 = (2 * precision * recall)/denominator if denominator else 0.0

    print('total_correct:',total_correct,':','total_predict:',total_predict,':','total_true:',total_true)
    print('precision:', round(precision,3))
    print('recall:', round(recall,3))
    print('f1:', round(f1,3))
    return [round(precision,3),round(recall,3),round(f1,3)]

In [None]:
# Phrase-level precision/recall/F1 over all entity types: a predicted chunk is
# correct only when the identical tuple is present in the gold chunk set.
evaluation_phrase(chunk_test,chunk_pred)

total_correct: 801 : total_predict: 880 : total_true: 1129
precision: 0.91
recall: 0.709
f1: 0.797


[0.91, 0.709, 0.797]

In [None]:
def _safe_div(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluation_phrase_type(chunk_test,chunk_pred,ent_types=None):
    """Print a per-entity-type phrase-level precision/recall/F1 table.

    Parameters
    ----------
    chunk_test : iterable of tuple
        Gold chunks; the entity type is read from index 3 of every tuple.
    chunk_pred : iterable of tuple
        Predicted chunks in the same layout.
    ent_types : sequence of str, optional
        Entity types to report, in row order. Defaults to the notebook's
        fixed list of 18 types.

    The printed DataFrame has one row per entity type plus two summary rows:
    MACRO (mean of the rounded per-type scores) and MICRO (scores computed
    from the counts summed over the reported types). SUPPORT is the number of
    distinct gold chunks, stored as a string. Nothing is returned.

    Chunks are grouped per type up front, so a type without gold or predicted
    chunks is scored from empty sets (all zeros) rather than from the previous
    type's leftovers, and every ratio with a zero denominator is reported as 0.
    """
    if ent_types is None:
        ent_types = ['ACP','ADMIN','BSN','DEP','FPLACE','GOV','HP','MKT','MON','NAT','OTHER','RCT','RES','ROAD','RP','RT','STORE','TRAN']

    # Group the chunks by entity type; chunks of unlisted types are ignored.
    true_by_type = {ent: set() for ent in ent_types}
    pred_by_type = {ent: set() for ent in ent_types}
    for chunk in chunk_test:
        if chunk[3] in true_by_type:
            true_by_type[chunk[3]].add(chunk)
    for chunk in chunk_pred:
        if chunk[3] in pred_by_type:
            pred_by_type[chunk[3]].add(chunk)

    row_names = []
    rows = []
    total_true, total_pred, total_correct = 0, 0, 0
    for ent in ent_types:
        type_true = len(true_by_type[ent])
        type_pred = len(pred_by_type[ent])
        type_correct = len(true_by_type[ent] & pred_by_type[ent])

        precision = _safe_div(type_correct, type_pred)
        recall = _safe_div(type_correct, type_true)
        f1 = _safe_div(2 * precision * recall, precision + recall)

        row_names.append(ent)
        rows.append([round(precision,3), round(recall,3), round(f1,3), str(type_true)])
        total_true += type_true
        total_pred += type_pred
        total_correct += type_correct

    # Macro scores average the rounded per-type values shown in the table.
    macro = [round(_safe_div(sum(row[col] for row in rows), len(rows)), 3) for col in range(3)]

    # Micro scores come from the counts summed over all reported types.
    precision_micro = _safe_div(total_correct, total_pred)
    recall_micro = _safe_div(total_correct, total_true)
    f1_micro = _safe_div(2 * precision_micro * recall_micro, precision_micro + recall_micro)

    row_names.append('MACRO')
    rows.append(macro + [str(total_true)])
    row_names.append('MICRO')
    rows.append([round(precision_micro,3), round(recall_micro,3), round(f1_micro,3), str(total_true)])

    df = pd.DataFrame(rows, index=row_names, columns=['PRECISION','RECALL','F1','SUPPORT'])
    print(df)

In [None]:
# Convert the chunk collections to lists before the per-type evaluation.
chunk_test = list(chunk_test)
chunk_pred = list(chunk_pred)

In [None]:
# Print the phrase-level scores per entity type, plus the MACRO and MICRO summary rows.
evaluation_phrase_type(chunk_test,chunk_pred)

        PRECISION  RECALL     F1 SUPPORT
ACP         0.931   0.730  0.818      37
ADMIN       0.898   0.799  0.846     309
BSN         0.833   0.577  0.682      26
DEP         0.947   0.699  0.804     103
FPLACE      0.825   0.475  0.603      99
GOV         0.889   0.640  0.744      25
HP          1.000   0.667  0.800      24
MKT         0.875   0.389  0.538      18
MON         0.000   0.000  0.000       2
NAT         1.000   0.455  0.625      11
OTHER       1.000   0.333  0.500       3
RCT         0.875   0.795  0.833      44
RES         0.958   0.767  0.852      30
ROAD        0.937   0.671  0.782     155
RP          0.909   0.588  0.714      51
RT          0.931   0.837  0.882     129
STORE       0.947   0.750  0.837      24
TRAN        0.882   0.769  0.822      39
MACRO       0.869   0.608  0.705    1129
MICRO       0.910   0.709  0.797    1129


In [None]:
# Save the trained CRF model to Google Drive as a pickle file.
# NOTE: unpickling can execute arbitrary code, so only reload this file from a trusted location.
with open('/content/drive/MyDrive/dissertation/TEXT DATA/TrainingData/3.Model/1.16CRF_pythainlp.model', 'wb') as model:
    pickle.dump(crf,model)