<a href="https://colab.research.google.com/github/chasubeen/ML_lab/blob/main/3_HMM_viterbi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

# **1. Preprocessing**

## **1-1. Load data**
- CoNLL-2000 file을 읽어와 word(x)와 tag(z) 처리
- 특수 품사(`remove_pos`) 제거

In [2]:
# Colab-only step: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Locations of the training and test files on the mounted Google Drive.
# NOTE(review): these are absolute Colab paths; adjust them when running elsewhere.

train_path = "/content/drive/MyDrive/ML_lab/3_HMM/data/train.txt"
test_path = "/content/drive/MyDrive/ML_lab/3_HMM/data/test.txt"

In [4]:
### Read file

def read_file(path):
  """Read a CoNLL-style file into flat, parallel word and tag lists.

  Sentences are blocks separated by blank lines. Only lines that split into
  exactly three whitespace-separated columns (word, POS tag, chunk tag) are
  kept: the word is lower-cased, the POS tag is stored as-is and the chunk
  column is discarded. After every sentence that contributed at least one
  token, the marker word 'eos' with the tag '<EOS>' is appended.

  Args:
    path: Path of the text file to read.

  Returns:
    A tuple (words, tags) of two equally long lists.
  """
  # A context manager guarantees the file handle is closed after reading.
  with open(path, 'r') as f:
    raw = f.read().strip().split('\n\n')  # blank-line separated sentences

  words, tags = [], []

  for sentence in raw:
    tokens_before = len(words)

    for line in sentence.strip().split('\n'):
      parts = line.split()  # split into columns

      if len(parts) == 3:
        word, pos, _chunk = parts
        words.append(word.lower())
        tags.append(pos)

    # Close the sentence only if it actually produced tokens; otherwise an
    # empty block (e.g. several consecutive blank lines) would emit a
    # duplicate end-of-sentence marker.
    if len(words) > tokens_before:
      words.append('eos')
      tags.append('<EOS>')

  return words, tags

In [5]:
# Load both splits as flat, index-aligned word/tag lists (sentences end with an 'eos' / '<EOS>' pair).
train_words, train_tags = read_file(train_path)
test_words, test_tags = read_file(test_path)

In [6]:
# Spot-check the raw training tags at positions 40-79.
print(train_tags[40:80])

['DT', 'NNP', 'NNP', 'NNP', 'POS', 'VBN', 'NN', 'TO', 'DT', 'NN', 'JJ', 'NN', 'VBZ', 'VBN', 'TO', 'VB', 'DT', 'NN', 'IN', 'NN', 'IN', 'DT', 'JJ', 'NN', '.', '<EOS>', 'CC', 'NNS', 'VBP', 'VBG', 'NN', 'IN', 'NN', 'VBZ', 'VBN', 'VBN', 'IN', 'DT', 'NN', 'POS']


In [7]:
# Spot-check the training words at the same positions 40-79.
print(train_words[40:80])

['the', 'exchequer', 'nigel', 'lawson', "'s", 'restated', 'commitment', 'to', 'a', 'firm', 'monetary', 'policy', 'has', 'helped', 'to', 'prevent', 'a', 'freefall', 'in', 'sterling', 'over', 'the', 'past', 'week', '.', 'eos', 'but', 'analysts', 'reckon', 'underlying', 'support', 'for', 'sterling', 'has', 'been', 'eroded', 'by', 'the', 'chancellor', "'s"]


## **1-2. Preprocess Data**

### **a) 특수 문자 처리**

In [8]:
def preprocessing(words, tags):
  """Normalise word/tag pairs: drop blank words and relabel digit-only tokens.

  Pairs whose word is empty or whitespace-only are discarded. Every kept word
  is lower-cased, and a word consisting solely of the digits 0-9 gets the tag
  'NUM' in place of its original tag.

  Args:
    words: Iterable of word strings.
    tags: Iterable of tags, aligned with `words`.

  Returns:
    A tuple (processed_words, processed_tags) of two equally long lists.
  """
  digits_only = re.compile(r'^[0-9]*$')
  kept_pairs = []

  for token, label in zip(words, tags):
    # Blank tokens are skipped entirely.
    if not token.strip():
      continue

    # Purely numeric tokens all share the 'NUM' tag.
    if digits_only.match(token):
      label = 'NUM'

    kept_pairs.append((token.lower(), label))

  processed_words = [token for token, _ in kept_pairs]
  processed_tags = [label for _, label in kept_pairs]
  return processed_words, processed_tags

In [9]:
# Apply the normalisation to both splits; the names are rebound to the processed lists.
train_words, train_tags = preprocessing(train_words, train_tags)
test_words, test_tags = preprocessing(test_words, test_tags)

In [10]:
# Spot-check tags at positions 100-119 after preprocessing (digit-only tokens should read 'NUM').
print(train_tags[100:120])

['NN', 'IN', 'DT', 'NN', 'VBG', 'VBN', 'TO', 'VB', 'NN', 'NNS', 'TO', 'NUM', 'NN', 'IN', 'PRP$', 'JJ', 'NUM', 'NN', 'NN', 'TO']


In [11]:
# Words at the same positions 100-119, for comparison with the tags.
print(train_words[100:120])

['risk', 'of', 'the', 'government', 'being', 'forced', 'to', 'increase', 'base', 'rates', 'to', '16', '%', 'from', 'their', 'current', '15', '%', 'level', 'to']


### **b) POS Tag 통합 및 변경**

In [12]:
def modify_tags(tags):
  """Collapse fine-grained POS tags into a small coarse tag set.

  Each input tag is replaced by the coarse label of the group it belongs to;
  'NUM' and '<EOS>' map to themselves, and any tag not listed in a group
  becomes 'ETC'.

  Args:
    tags: Iterable of tag strings.

  Returns:
    A list of coarse tags with the same length and order as `tags`.
  """
  groups = {
    'N': ('NN', 'NNS', 'NNP', 'NNPS'),
    'V': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
    'PN': ('PRP', 'PRP$'),
    'PP': ('IN',),
    'AD': ('RB', 'RBR', 'RBS', 'WRB'),
    'CONJ': ('CC',),
    'ART': ('DT', 'PDT'),
    'AdJ': ('JJ', 'JJR', 'JJS'),
    'NUM': ('NUM',),
    '<EOS>': ('<EOS>',),
  }

  # Invert the grouping into a direct fine-tag -> coarse-tag lookup.
  coarse_of = {}
  for coarse, members in groups.items():
    for fine in members:
      coarse_of[fine] = coarse

  return [coarse_of.get(tag, 'ETC') for tag in tags]

In [13]:
# Replace the fine-grained tags of both splits with their coarse equivalents.
train_tags = modify_tags(train_tags)
test_tags = modify_tags(test_tags)

In [14]:
# Number of distinct coarse tags present in each split.
print(len(set(train_tags)))
print(len(set(test_tags)))

11
11


> 11개의 tag로 잘 정리된 것을 확인할 수 있다.

In [15]:
# Coarse tags at positions 100-119.
print(train_tags[100:120])

['N', 'PP', 'ART', 'N', 'V', 'V', 'ETC', 'V', 'N', 'N', 'ETC', 'NUM', 'N', 'PP', 'PN', 'AdJ', 'NUM', 'N', 'N', 'ETC']


In [16]:
# Words at positions 100-119, aligned with the coarse tags.
print(train_words[100:120])

['risk', 'of', 'the', 'government', 'being', 'forced', 'to', 'increase', 'base', 'rates', 'to', '16', '%', 'from', 'their', 'current', '15', '%', 'level', 'to']


### **c) make dictionary**

In [17]:
def make_dict(tags, words):
  """Build index lookups for tags and words.

  Unique values are enumerated in sorted order so that the resulting indices
  are identical on every run; enumerating a raw set of strings would depend
  on the interpreter's per-process hash seed.

  Args:
    tags: Iterable of tag strings (duplicates allowed).
    words: Iterable of word strings (duplicates allowed).

  Returns:
    A tuple (tag_to_index, word_to_index) of dicts mapping each unique value
    to a distinct integer in 0..n-1.
  """
  tag_to_index = {tag: idx for idx, tag in enumerate(sorted(set(tags)))}
  word_to_index = {word: idx for idx, word in enumerate(sorted(set(words)))}
  return tag_to_index, word_to_index

In [19]:
# Unique words and tags over BOTH splits.
# NOTE(review): because test words are included here, no test word will fall back to 'UNK' later.
combined_words = set(train_words + test_words)
combined_tags = set(train_tags + test_tags)

In [20]:
# Build the tag -> index and word -> index lookups (make_dict takes tags first, then words).
tag_to_index, vocab_to_index = make_dict(combined_tags, combined_words)

### **d) index 변환**

In [21]:
# Add the 'UNK' token, giving it the next free index.
vocab_to_index['UNK'] = len(vocab_to_index)

In [25]:
def words_to_indices(words, vocab_to_index):
  """Translate words into vocabulary indices.

  Args:
    words: Iterable of word strings.
    vocab_to_index: Dict mapping words to indices; must contain the key 'UNK'
      whenever `words` is non-empty.

  Returns:
    A list with one index per word; words missing from the vocabulary receive
    the index stored under 'UNK'.
  """
  indices = []
  for token in words:
    fallback = vocab_to_index['UNK']
    indices.append(vocab_to_index.get(token, fallback))
  return indices

In [26]:
# Encode the training words and tags as integer indices.
train_data_idx = words_to_indices(train_words, vocab_to_index)
train_tag_idx = [tag_to_index[tag] for tag in train_tags]

In [27]:
# Encode the test words and tags with the same lookups.
test_data_idx = words_to_indices(test_words, vocab_to_index)
test_tag_idx = [tag_to_index[tag] for tag in test_tags]

## **1-3. 최종 데이터 확인**

In [28]:
# Final look at the first 40 coarse tags and their index-aligned words.
print(train_tags[:40])
print()
print(train_words[:40])

['N', 'PP', 'ART', 'N', 'V', 'AD', 'V', 'ETC', 'V', 'ART', 'AdJ', 'N', 'PP', 'N', 'N', 'PP', 'N', 'ETC', 'AdJ', 'PP', 'N', 'N', 'ETC', 'V', 'ETC', 'V', 'ART', 'AdJ', 'N', 'PP', 'N', 'CONJ', 'N', 'ETC', 'AdJ', 'N', 'ETC', '<EOS>', 'N', 'PP']

['confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'september', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'july', 'and', 'august', "'s", 'near-record', 'deficits', '.', 'eos', 'chancellor', 'of']


# **2. Modeling**

In [29]:
class HMM_viterbi:
    """Supervised hidden Markov model with Viterbi decoding.

    `fit` estimates the initial (pi), transition (A) and emission (B)
    probabilities by smoothed counting over labelled index sequences, and
    `predict` recovers the most likely state path for an observation sequence
    using log-space dynamic programming.
    """

    def __init__(self, num_states, vocab_size, tag_size):
        self.num_states = num_states  # number of hidden states (POS tags)
        self.vocab_size = vocab_size  # number of distinct observable symbols (words)
        self.tag_size = tag_size  # stored as given; not read by any method of this class

    def fit(self, X, Z, smoothing=1.0):
        """Estimate pi, A and B from labelled sequences.

        Args:
            X: Sequence of observation sequences (each a sequence of word indices).
            Z: Sequence of state sequences (tag indices), aligned with `X`.
            smoothing: Constant added to every count before normalising. With
                0, rows that received no counts are divided by zero.

        Raises:
            ValueError: If `X` and `Z` differ in length, or a pair of
                sequences in them differs in length.
        """
        if len(X) != len(Z):
            raise ValueError(
                f"X and Z must contain the same number of sequences, got {len(X)} and {len(Z)}")

        start_counts = np.zeros(self.num_states)
        trans_counts = np.zeros((self.num_states, self.num_states))
        emit_counts = np.zeros((self.num_states, self.vocab_size))

        for x, z in zip(X, Z):
            x = np.asarray(x, dtype=int)
            z = np.asarray(z, dtype=int)
            if x.shape != z.shape:
                raise ValueError(
                    f"observation and state sequences must align, got shapes {x.shape} and {z.shape}")
            if z.size == 0:
                continue  # an empty sequence contributes no counts

            start_counts[z[0]] += 1
            # np.add.at accumulates correctly when the same index pair repeats.
            np.add.at(trans_counts, (z[:-1], z[1:]), 1)
            np.add.at(emit_counts, (z, x), 1)

        # Initial state probabilities.
        self.pi = (start_counts + smoothing) / (start_counts.sum() + smoothing * self.num_states)

        # Transition matrix: A[i, j] = P(state j at t+1 | state i at t).
        self.A = (trans_counts + smoothing) / (
            trans_counts.sum(axis=1, keepdims=True) + smoothing * self.num_states)

        # Emission matrix: B[i, w] = P(word w | state i).
        self.B = (emit_counts + smoothing) / (
            emit_counts.sum(axis=1, keepdims=True) + smoothing * self.vocab_size)

    def calculate_likelihood(self, V, t, j, word_idx):
        '''
        Log-likelihood of reaching state j at time t from every previous state.

        V: log likelihoods until time t
        t: current time
        j: idx of state at current time
        word_idx: idx of the word observed at time t

        Returns a vector indexed by the previous state i:
        V[t - 1, i] + log A[i, j] + log B[j, word_idx].
        '''
        return V[t - 1] + np.log(self.A[:, j]) + np.log(self.B[j, word_idx])

    def update_state(self, V, t, j):
        '''
        Index of the best previous state for state j at time t.

        V: log likelihoods until time t
        t: current time
        j: idx of state at current time
        '''
        return np.argmax(V[t - 1] + np.log(self.A[:, j]))

    def get_latent_path(self, X, num_words):
        """Viterbi-decode the first `num_words` observations of `X`.

        Args:
            X: Sequence of word indices.
            num_words: Number of leading observations to decode.

        Returns:
            Integer array of length `num_words` holding the most likely state
            index per position (empty when `num_words` is 0).
        """
        if num_words == 0:
            return np.zeros(0, dtype=int)

        V = np.zeros((num_words, self.num_states))
        state_idx = np.zeros((num_words, self.num_states), dtype=int)

        # The transition log-probabilities do not depend on t, so take the
        # log once instead of once per (t, j) pair.
        log_A = np.log(self.A)
        cols = np.arange(self.num_states)

        # Initialization
        V[0] = np.log(self.pi) + np.log(self.B[:, X[0]])

        # Recursion: scores[i, j] is the best log-probability of being in
        # state i at t - 1 and then moving to state j.
        for t in range(1, num_words):
            scores = V[t - 1][:, None] + log_A
            best_prev = np.argmax(scores, axis=0)
            state_idx[t] = best_prev
            V[t] = scores[best_prev, cols] + np.log(self.B[:, X[t]])

        # Traceback from the best final state along the stored back-pointers.
        z = np.zeros(num_words, dtype=int)
        z[-1] = np.argmax(V[-1])
        for t in range(num_words - 2, -1, -1):
            z[t] = state_idx[t + 1, z[t + 1]]

        return z

    def predict(self, X):
        """Return the most likely state index for every observation in `X`."""
        num_words = len(X)
        return self.get_latent_path(X, num_words)

    def accuracy(self, z_pred, z_true):
        """Fraction of positions where `z_pred` equals `z_true`.

        Both arguments are converted to arrays first so that plain lists are
        compared element-wise rather than as whole objects.

        Raises:
            ValueError: If the two sequences differ in shape.
        """
        z_pred = np.asarray(z_pred)
        z_true = np.asarray(z_true)
        if z_pred.shape != z_true.shape:
            raise ValueError(
                f"z_pred and z_true must have the same shape, got {z_pred.shape} and {z_true.shape}")
        return np.mean(z_pred == z_true)

# **3. Training**

In [30]:
# Model dimensions: one hidden state per tag, one emission column per vocabulary entry (including 'UNK').
num_states = len(tag_to_index)
vocab_size = len(vocab_to_index)

In [31]:
# The whole training corpus is passed to fit as ONE sequence (hence the single-element lists);
# sentence boundaries are carried by the 'eos' / '<EOS>' tokens inside it.
model = HMM_viterbi(num_states=num_states, vocab_size=vocab_size, tag_size=num_states)
model.fit([train_data_idx], [train_tag_idx], smoothing=1.0)

In [32]:
# Decode the training sequence and report token-level accuracy against the gold tag indices.
train_pred_tags = model.predict(train_data_idx)
train_accuracy = model.accuracy(train_pred_tags, train_tag_idx)
print(f"Train Accuracy: {train_accuracy:.4f}")

Train Accuracy: 0.9435


# **4. Testing**

In [33]:
# Decode the test sequence and report token-level accuracy against the gold tag indices.
test_pred_tags = model.predict(test_data_idx)
test_accuracy = model.accuracy(test_pred_tags, test_tag_idx)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9138
