<a href="https://colab.research.google.com/github/chasubeen/ML_lab/blob/main/3_HMM_viterbi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

# **1. Preprocessing**

## **1-1. Load data**
- CoNLL-2000 file을 읽어와 word($\mathbf x$)와 tag($\mathbf z$) 처리

In [19]:
# Mount Google Drive at /content/drive (Colab-only module) so files stored there can be opened.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Locations of the train/test text files on the mounted Google Drive.

train_path = "/content/drive/MyDrive/ML_lab/3_HMM/data/train.txt"
test_path = "/content/drive/MyDrive/ML_lab/3_HMM/data/test.txt"

In [21]:
### Read file

def read_file(path):
  """Read a CoNLL-style file into flat, parallel word and POS-tag lists.

  Sentences are separated by blank lines. A token line must have exactly three
  whitespace-separated columns (word, POS tag, chunk tag); lines with any other
  column count are ignored. Words are lowercased. After every sentence that
  contributed at least one token, the marker word 'eos' with tag '<EOS>' is
  appended, so sentence boundaries survive in the flat lists.

  Args:
    path: Path of the text file to read.

  Returns:
    A tuple (words, tags) of two equal-length lists of strings.
  """
  words, tags = [], []
  sentence_has_tokens = False  # True once the current sentence produced a token

  # The context manager guarantees the handle is closed; the explicit encoding
  # keeps the result independent of the platform's default locale.
  with open(path, 'r', encoding='utf-8') as f:
    for line in f:
      parts = line.split()  # split the line into columns

      if not parts:
        # Blank line: close the current sentence, but only if it held tokens,
        # so runs of blank lines do not emit extra EOS markers.
        if sentence_has_tokens:
          words.append('eos')
          tags.append('<EOS>')
          sentence_has_tokens = False
        continue

      if len(parts) == 3:
        word, pos, _chunk = parts
        words.append(word.lower())  # lowercase the word
        tags.append(pos)
        sentence_has_tokens = True

  # The last sentence is usually not followed by a blank line.
  if sentence_has_tokens:
    words.append('eos')
    tags.append('<EOS>')

  return words, tags

In [22]:
# Load both splits as flat (words, tags) lists; sentence ends are marked by 'eos' / '<EOS>'.
train_words, train_tags = read_file(train_path)
test_words, test_tags = read_file(test_path)

## **1-2. Preprocess Data**

### **a) 특수 문자 처리**
- 숫자로만 이루어진 단어의 태그를 'NUM'으로 변경
- 빈 문자열 제거

In [23]:
def preprocessing(words, tags):
  """Retag digit-only tokens as 'NUM' and drop blank tokens.

  Args:
    words: Sequence of word strings.
    tags: Sequence of tags aligned with `words`.

  Returns:
    A tuple (processed_words, processed_tags) of equal-length lists. Words are
    lowercased; tokens that are empty or whitespace-only are removed together
    with their tags.
  """
  digits_only = re.compile(r'^[0-9]*$')

  # Keep (word, tag) pairs for non-blank words, replacing the tag of every
  # token made purely of digits with 'NUM'.
  kept = [
    (word.lower(), 'NUM' if digits_only.match(word) else tag)
    for word, tag in zip(words, tags)
    if word.strip()
  ]

  processed_words = [w for w, _ in kept]
  processed_tags = [t for _, t in kept]
  return processed_words, processed_tags

In [24]:
# Apply the NUM retagging / blank-token filter to both splits (rebinds the same names).
train_words, train_tags = preprocessing(train_words, train_tags)
test_words, test_tags = preprocessing(test_words, test_tags)

### **b) POS Tag 통합**
- 영어의 대표적인 **품사 8개(명사, 동사, 대명사, 전치사, 부사, 접속사, 관사, 형용사)와 숫자(NUM)**만 남기고, 나머지는 기타(ETC)로 처리

In [25]:
def modify_tags(tags):
  """Collapse fine-grained POS tags into a small coarse tag set.

  Args:
    tags: Iterable of tag strings.

  Returns:
    A list of the same length where each tag is replaced by its coarse label;
    any tag not listed in the table below becomes 'ETC'.
  """
  # Coarse label -> fine-grained tags that map onto it.
  groups = {
    'N': ('NN', 'NNS', 'NNP', 'NNPS'),
    'V': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
    'PN': ('PRP', 'PRP$'),
    'PP': ('IN',),
    'AD': ('RB', 'RBR', 'RBS', 'WRB'),
    'CONJ': ('CC',),
    'ART': ('DT', 'PDT'),
    'AdJ': ('JJ', 'JJR', 'JJS'),
    'NUM': ('NUM',),
    '<EOS>': ('<EOS>',),
  }

  # Flatten into a direct fine -> coarse lookup.
  coarse_of = {fine: coarse for coarse, fines in groups.items() for fine in fines}

  return [coarse_of.get(tag, 'ETC') for tag in tags]

In [26]:
# Collapse the fine-grained POS tags of both splits into the coarse tag set.
# NOTE(review): this rebinds train_tags/test_tags, so the cell is not idempotent:
# running it a second time maps the already-collapsed labels (all except 'NUM'
# and '<EOS>') to 'ETC'. Re-run from the preprocessing cell instead.
train_tags = modify_tags(train_tags)
test_tags = modify_tags(test_tags)

In [27]:
# Number of distinct tags present in each split after collapsing.
print(len(set(train_tags)))
print(len(set(test_tags)))

11
11


> 문장 끝 표시(`<EOS>`)와 기타(`ETC`)를 포함해 총 **11**개의 tag(품사)로 잘 정리된 것을 확인할 수 있다.

### **c) 단어 사전 생성**
- 태그와 단어들을 인덱스로 매핑하는 사전 생성

In [28]:
def make_dict(tags, words):
  """Build index lookup tables for tags and words.

  The distinct items are sorted before numbering so that the mapping is
  deterministic. Enumerating a bare set of strings yields an order that can
  change between interpreter runs (string hash randomization), which makes
  saved indices and model matrices impossible to reproduce.

  Args:
    tags: Iterable of tag strings.
    words: Iterable of word strings.

  Returns:
    A tuple (tag_to_index, word_to_index) of dicts mapping each distinct item
    to a unique integer in range(number of distinct items).
  """
  tag_to_index = {tag: idx for idx, tag in enumerate(sorted(set(tags)))}
  word_to_index = {word: idx for idx, word in enumerate(sorted(set(words)))}
  return tag_to_index, word_to_index

In [29]:
# Build both lookup tables from the training split only.
tag_to_index, vocab_to_index = make_dict(train_tags, train_words)

In [30]:
# Reserve one extra index for unknown (out-of-vocabulary) words.
# setdefault keeps the existing index when this cell is re-run; plain assignment
# would move 'UNK' to len(vocab_to_index), one past the last valid index.
vocab_to_index.setdefault('UNK', len(vocab_to_index))

### **d) index 변환**

In [31]:
def to_indices(objs, obj_to_idx):
  """Translate items to integer indices, falling back to the 'UNK' entry.

  Args:
    objs: Iterable of items to look up.
    obj_to_idx: Dict mapping items to indices; must contain the key 'UNK'
      whenever `objs` is non-empty.

  Returns:
    A list with one index per item; items missing from the dict get the
    index stored under 'UNK'.
  """
  indices = []
  for obj in objs:
    fallback = obj_to_idx['UNK']
    indices.append(obj_to_idx.get(obj, fallback))
  return indices

In [32]:
# Convert the training words and tags to integer indices.
train_data_idx = to_indices(train_words, vocab_to_index)
train_tag_idx = [tag_to_index[tag] for tag in train_tags]

In [33]:
# Convert the test split; words missing from the vocabulary map to the 'UNK' index.
# Tags are looked up directly, so a tag missing from tag_to_index raises KeyError.
test_data_idx = to_indices(test_words, vocab_to_index)
test_tag_idx = [tag_to_index[tag] for tag in test_tags]

## **1-3. 최종 데이터 확인**

In [34]:
# Preview the first 40 training tags and the words they belong to.
print(train_tags[:40])
print()
print(train_words[:40])

['N', 'PP', 'ART', 'N', 'V', 'AD', 'V', 'ETC', 'V', 'ART', 'AdJ', 'N', 'PP', 'N', 'N', 'PP', 'N', 'ETC', 'AdJ', 'PP', 'N', 'N', 'ETC', 'V', 'ETC', 'V', 'ART', 'AdJ', 'N', 'PP', 'N', 'CONJ', 'N', 'ETC', 'AdJ', 'N', 'ETC', '<EOS>', 'N', 'PP']

['confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'september', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'july', 'and', 'august', "'s", 'near-record', 'deficits', '.', 'eos', 'chancellor', 'of']


# **2. Modeling**

In [35]:
class HMM_viterbi:
  """Hidden Markov model trained by counting, decoded with the Viterbi algorithm.

  Parameters are estimated from fully labelled sequences with additive
  smoothing; decoding is done in log space.

  Attributes set by `fit`:
    pi: Initial state probabilities, shape (num_states,).
    A:  Transition probabilities, A[i, j] = P(next state j | state i),
        shape (num_states, num_states).
    B:  Emission probabilities, B[j, w] = P(observation w | state j),
        shape (num_states, vocab_size).
  """

  def __init__(self, num_states, vocab_size, tag_size):
    self.num_states = num_states  # number of hidden states (POS tags)
    self.vocab_size = vocab_size  # number of distinct observation (word) indices
    self.tag_size = tag_size  # size of the POS (z) inventory; stored only, not read by the methods of this class

  def fit(self, X, Z, smoothing=1.0):
    """Estimate pi, A and B from labelled sequences by smoothed counting.

    Args:
      X: Iterable of observation-index sequences.
      Z: Iterable of state-index sequences, aligned element-wise with X.
      smoothing: Constant added to every count before normalizing. With 0,
        unseen events get probability 0 and decoding will take log(0).
    """
    # pi: count the first state of every non-empty sequence.
    start_counts = np.zeros(self.num_states)
    num_sequences = 0
    for z in Z:
      if len(z) == 0:
        continue  # an empty sequence has no start state to count
      start_counts[z[0]] += 1
      num_sequences += 1
    self.pi = (start_counts + smoothing) / (num_sequences + smoothing * self.num_states)

    # A: count state bigrams (transition probability matrix).
    trans_counts = np.zeros((self.num_states, self.num_states))
    for z in Z:
      for i in range(len(z) - 1):
        trans_counts[z[i], z[i + 1]] += 1
    self.A = (trans_counts + smoothing) / (trans_counts.sum(axis=1, keepdims=True) + smoothing * self.num_states)

    # B: count (state, observation) pairs (emission probability matrix).
    emit_counts = np.zeros((self.num_states, self.vocab_size))
    for x, z in zip(X, Z):
      for xj, zi in zip(x, z):
        emit_counts[zi, xj] += 1
    self.B = (emit_counts + smoothing) / (emit_counts.sum(axis=1, keepdims=True) + smoothing * self.vocab_size)

  def calculate_likelihood(self, V, t, j, word_idx):
    """Return, for every previous state, the log score of reaching state j at step t.

    The result has one entry per previous state i:
    V[t - 1, i] + log A[i, j] + log B[j, word_idx].
    """
    return V[t - 1] + np.log(self.A[:, j]) + np.log(self.B[j, word_idx])

  def update_state(self, V, t, j):
    """Return the previous state that maximizes the score of being in state j at step t."""
    return np.argmax(V[t - 1] + np.log(self.A[:, j]))

  def get_latent_path(self, X, num_words):
    """Viterbi-decode the most likely state sequence for the first num_words observations.

    Args:
      X: Indexable sequence of observation indices.
      num_words: Number of leading observations of X to decode.

    Returns:
      Integer array of length num_words with the decoded state indices
      (an empty array when num_words is 0).
    """
    if num_words == 0:
      return np.zeros(0, dtype=int)

    # Logs of the parameters do not change during decoding; take them once
    # instead of once per (step, state) pair.
    log_A = np.log(self.A)
    log_B = np.log(self.B)
    cols = np.arange(self.num_states)

    V = np.zeros((num_words, self.num_states))
    state_idx = np.zeros((num_words, self.num_states), dtype=int)

    # Initialization
    V[0] = np.log(self.pi) + log_B[:, X[0]]

    # Recursion: scores[i, j] is the log score of moving from state i to state j.
    for t in range(1, num_words):
      scores = V[t - 1][:, np.newaxis] + log_A
      state_idx[t] = np.argmax(scores, axis=0)  # best predecessor of each state j
      V[t] = scores[state_idx[t], cols] + log_B[:, X[t]]

    # Traceback
    z = np.zeros(num_words, dtype=int)
    z[-1] = np.argmax(V[-1])
    for t in range(num_words - 2, -1, -1):
      z[t] = state_idx[t + 1, z[t + 1]]

    return z

  def predict(self, X):
    """Decode the most likely state sequence for the whole observation sequence X."""
    num_words = len(X)
    return self.get_latent_path(X, num_words)

  def accuracy(self, z_pred, z_true):
    """Return the fraction of positions where z_pred equals z_true.

    Both arguments are converted to arrays first, so plain lists are compared
    element-wise (comparing two lists with == would yield a single bool).

    Raises:
      ValueError: If the two sequences do not have the same shape.
    """
    z_pred = np.asarray(z_pred)
    z_true = np.asarray(z_true)
    if z_pred.shape != z_true.shape:
      raise ValueError(
        f"z_pred and z_true must have the same shape, got {z_pred.shape} and {z_true.shape}")
    return np.mean(z_pred == z_true)

# **3. Training**

In [36]:
# Model dimensions are taken from the lookup tables.
num_states = len(tag_to_index)
vocab_size = len(vocab_to_index) # total number of word indices in the vocabulary table

In [37]:
# Build the model and estimate its parameters from the training split.
# NOTE(review): the whole corpus is passed as ONE sequence, so fit() sees a single
# start state when estimating pi; sentence starts are instead captured by the
# transitions out of the '<EOS>' tag.
model = HMM_viterbi(num_states=num_states, vocab_size=vocab_size, tag_size=num_states)
model.fit([train_data_idx], [train_tag_idx], smoothing=1.0)

In [38]:
# Decode the training words and compare against the gold tag indices.
train_pred_idx = model.predict(train_data_idx)
train_accuracy = model.accuracy(train_pred_idx, train_tag_idx)
print(f"Train Accuracy: {train_accuracy:.4f}")

Train Accuracy: 0.9414


# **4. Testing**

In [39]:
# Decode the held-out test words and compare against the gold tag indices.
test_pred_idx = model.predict(test_data_idx)
test_accuracy = model.accuracy(test_pred_idx, test_tag_idx)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9138


# **5. Results**

In [40]:
def compare_result(words, true_tags, pred_indices, tag_to_idx):
  """Print one aligned line per word with its true and predicted tag.

  Args:
    words: Sequence of word strings.
    true_tags: Gold tags, given either as tag strings or as tag indices.
    pred_indices: Predicted tag indices.
    tag_to_idx: Dict mapping tag strings to indices.
  """
  # Invert the tag -> index table so indices can be shown as tag names.
  idx_to_tag = dict(zip(tag_to_idx.values(), tag_to_idx.keys()))

  for word, gold, pred_idx in zip(words, true_tags, pred_indices):
    # Gold tags that are already strings are printed as they are.
    if isinstance(gold, str):
      true_tag_label = gold
    else:
      true_tag_label = idx_to_tag[gold]
    pred_tag_label = idx_to_tag[pred_idx]
    print(f"Word: {word:15} True Tag: {true_tag_label:10} Pred Tag: {pred_tag_label:10}")

In [41]:
# Show true vs. predicted tags for the first 20 training tokens.
compare_result(train_words[:20], train_tags[:20], train_pred_idx[:20], tag_to_index)

Word: confidence      True Tag: N          Pred Tag: N         
Word: in              True Tag: PP         Pred Tag: PP        
Word: the             True Tag: ART        Pred Tag: ART       
Word: pound           True Tag: N          Pred Tag: N         
Word: is              True Tag: V          Pred Tag: V         
Word: widely          True Tag: AD         Pred Tag: AD        
Word: expected        True Tag: V          Pred Tag: V         
Word: to              True Tag: ETC        Pred Tag: ETC       
Word: take            True Tag: V          Pred Tag: V         
Word: another         True Tag: ART        Pred Tag: ART       
Word: sharp           True Tag: AdJ        Pred Tag: AdJ       
Word: dive            True Tag: N          Pred Tag: N         
Word: if              True Tag: PP         Pred Tag: PP        
Word: trade           True Tag: N          Pred Tag: N         
Word: figures         True Tag: N          Pred Tag: N         
Word: for             True Tag: PP      

In [42]:
# Show true vs. predicted tags for the first 20 test tokens.
compare_result(test_words[:20], test_tags[:20], test_pred_idx[:20], tag_to_index)

Word: rockwell        True Tag: N          Pred Tag: AdJ       
Word: international   True Tag: N          Pred Tag: N         
Word: corp.           True Tag: N          Pred Tag: N         
Word: 's              True Tag: ETC        Pred Tag: ETC       
Word: tulsa           True Tag: N          Pred Tag: <EOS>     
Word: unit            True Tag: N          Pred Tag: N         
Word: said            True Tag: V          Pred Tag: V         
Word: it              True Tag: PN         Pred Tag: PN        
Word: signed          True Tag: V          Pred Tag: V         
Word: a               True Tag: ART        Pred Tag: ART       
Word: tentative       True Tag: AdJ        Pred Tag: AdJ       
Word: agreement       True Tag: N          Pred Tag: N         
Word: extending       True Tag: V          Pred Tag: PP        
Word: its             True Tag: PN         Pred Tag: PN        
Word: contract        True Tag: N          Pred Tag: N         
Word: with            True Tag: PP      