<a href="https://colab.research.google.com/github/chaehoon1/Linear_Algebra_and_AI/blob/main/linear_algebra_and_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import re
from collections import Counter, defaultdict
import random
import math
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import lil_matrix

In [3]:
def build_vocab(text: str) -> list[str]:
    """Return the sorted list of unique lowercase a-z tokens found in ``text``.

    The text is lowercased, every character that is neither ``a-z`` nor
    whitespace is replaced by a space, and the result is split on whitespace.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    unique_tokens = set(letters_only.split())
    return sorted(unique_tokens)


V 생성 함수

In [4]:
def build_corpus(text: str, vocab: list[str]) -> list[list[int]]:
    """Encode ``text`` as a list of sentences, each a list of vocabulary ids.

    Sentences are obtained by splitting the lowercased text on ``'.'``.
    Within a sentence, non a-z / non-whitespace characters become spaces and
    the result is split on whitespace. Tokens missing from ``vocab`` are
    dropped, and sentences that end up with no ids are omitted.
    """
    index_of = dict(zip(vocab, range(len(vocab))))

    encoded = []
    for raw_sentence in text.lower().split('.'):
        words = re.sub(r'[^a-z\s]', ' ', raw_sentence).split()
        ids = [index_of[word] for word in words if word in index_of]
        # An empty token list yields an empty id list, so one check covers both.
        if ids:
            encoded.append(ids)
    return encoded


corpus 생성 함수

In [129]:
"""
def cut_file(input_path, output_path, max_lines=1_000_000):
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:

        for i, line in enumerate(fin):
            if i >= max_lines:
                break
            fout.write(line)

cut_file("/content/sample_data/wikisent2.txt", "/content/sample_data/wikisent2_cut.txt")
"""
# Read the whole training text into memory as one UTF-8 string.
with open("/content/sample_data/multiverse.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Derive the vocabulary from the text, then encode the same text as
# per-sentence lists of vocabulary ids.
vocab = build_vocab(text)
corpus = build_corpus(text, vocab)

텍스트 데이터로부터 V와 corpus 생성

In [130]:
def build_D(corpus, window_size):
    """Collect (word, context) pairs from every sentence in ``corpus``.

    For each position in a sentence, every other position at most
    ``window_size`` steps to the left or right contributes one pair
    ``(center_word, neighbor_word)``. Pairs are returned in scan order.
    """
    pairs = []
    for sent in corpus:
        n = len(sent)
        for center, word in enumerate(sent):
            lo = max(0, center - window_size)
            hi = min(n, center + window_size + 1)
            pairs.extend(
                (word, sent[pos]) for pos in range(lo, hi) if pos != center
            )
    return pairs


D 생성 함수

In [138]:
# Frequency tables over the (word, context) pairs in D, read by p_wc / p_w.
# NOTE: these are snapshots of D taken when this cell runs; they must be
# recomputed whenever D is rebuilt. In the visible source c_count is only
# referenced by the commented-out p_c.
D_size = len(D)
pair_count = Counter(D)
w_count = Counter(word for word, _ in D)
c_count = Counter(ctx for _, ctx in D)

def p_wc(D, w, c):
    """Return the relative frequency of the pair ``(w, c)``.

    The value is read from the module-level ``pair_count`` and ``D_size``;
    the ``D`` argument itself is not consulted. Returns 0.0 when ``D_size``
    is not positive.
    """
    if D_size > 0:
        return pair_count[(w, c)] / D_size
    return 0.0

def p_w(D, w):
    """Return the relative frequency of ``w`` as the first element of a pair.

    The value is read from the module-level ``w_count`` and ``D_size``;
    the ``D`` argument itself is not consulted. Returns 0.0 when ``D_size``
    is not positive.
    """
    if D_size > 0:
        return w_count[w] / D_size
    return 0.0

#def p_c(D, c):
#  return c_count[c] / D_size if D_size > 0 else 0.0
# unigram negative sampling distribution: not used

확률 함수

In [139]:
# Build the (word, context) pair list using a window of 5 tokens on each side.
D = build_D(corpus, 5)

D 생성

In [140]:
# Noise distribution p_D(c): context frequencies in D raised to the 3/4 power,
# then normalized so the values sum to 1.
c_counts = Counter(ctx for _, ctx in D)
c_pow = {ctx: freq ** (3/4) for ctx, freq in c_counts.items()}
total = sum(c_pow.values())
p_D = {ctx: weight / total for ctx, weight in c_pow.items()}

p_D(c) 확률 분포 정의

In [145]:
def build_SPPMI_matrix(v, D, p_D, k=10):
    """Build the dense Shifted Positive PMI (SPPMI) matrix from pairs in ``D``.

    Each entry is ``max(log(p(w, c) / (p(w) * k * p_D(c))), 0)``, where
    ``p(w, c)`` and ``p(w)`` are relative frequencies measured on ``D``.

    All counts are derived from the ``D`` argument itself, so the result
    never depends on module-level counters that may have been computed from
    an older pair list. Only pairs that actually occur in ``D`` are visited:
    every other cell has ``p(w, c) == 0`` and therefore stays zero.

    Args:
        v: Vocabulary; only its length is used (the matrix is V x V).
        D: Iterable of ``(word_id, context_id)`` pairs with integer ids.
        p_D: Mapping from context id to its noise-distribution probability.
        k: Shift constant, i.e. ``log(k)`` is subtracted from the PMI.

    Returns:
        ``numpy.ndarray`` of shape ``(V, V)`` and dtype ``float32``.
    """
    V = len(v)
    SPPMI = np.zeros((V, V), dtype=np.float32)

    pair_counts = Counter(D)
    total_pairs = sum(pair_counts.values())
    if total_pairs == 0:
        return SPPMI

    # Marginal count of each word as the first element of a pair.
    word_counts = Counter()
    for (w, _), n in pair_counts.items():
        word_counts[w] += n

    # Progress is reported against the number of distinct observed pairs,
    # which is the amount of work this loop actually performs.
    n_observed = len(pair_counts)
    count = 0
    for (w, c), n_wc in pair_counts.items():
        # Ids outside the vocabulary have no cell in the matrix.
        if not (0 <= w < V and 0 <= c < V):
            continue

        pDc = p_D.get(c, 0.0)
        if pDc == 0:
            continue

        pwc = n_wc / total_pairs
        pw = word_counts[w] / total_pairs

        # SPPMI = max(log( p(w,c) / (p(w)*k*p_D(c)) ), 0)
        sppmi_value = math.log((pwc / (pw * k * pDc)) + 1e-12)

        if sppmi_value > 0:
            SPPMI[w, c] = sppmi_value
        count += 1
        if count % 1000 == 0:
            print(f"progress: {count} /{n_observed}")

    return SPPMI

Shifted Positive PMI 행렬 생성 함수

In [146]:
# Compute the SPPMI matrix over the full vocabulary with the default shift k=10.
SPPMI = build_SPPMI_matrix(vocab, D, p_D)


progress: 1000 /589824
progress: 2000 /589824
progress: 3000 /589824
progress: 4000 /589824
progress: 5000 /589824
progress: 6000 /589824
progress: 7000 /589824
progress: 8000 /589824
progress: 9000 /589824
progress: 10000 /589824
progress: 11000 /589824
progress: 12000 /589824
progress: 13000 /589824
progress: 14000 /589824
progress: 15000 /589824


Shifted Positive PMI 행렬 생성

In [147]:
def spmi_to_embedding(SPMI_matrix, k=200):
    """Reduce an SPMI matrix to k-dimensional embeddings via truncated SVD.

    Input:
      SPMI_matrix : numpy array (V x V)
      k : embedding dimension

    Output:
      embeddings : numpy array (V x k)
    """
    # Fixed random_state keeps the randomized solver reproducible across runs.
    reducer = TruncatedSVD(n_components=k, n_iter=10, random_state=42)
    return reducer.fit_transform(SPMI_matrix)

SVD 함수

In [148]:
# Reduce the SPPMI matrix to word embeddings with the default dimension (k=200).
embeddings = spmi_to_embedding(SPPMI)

임베딩

In [149]:
def get_embedding(word, vocab, embeddings):
    """Return the embedding row for ``word``, or None if it is not in ``vocab``.

    The row index is the position of the first occurrence of ``word`` in
    ``vocab``. The vocabulary is scanned once (instead of a membership test
    followed by a second ``index`` scan).
    """
    try:
        idx = vocab.index(word)
    except ValueError:
        # Out-of-vocabulary word.
        return None
    return embeddings[idx]

def print_word(vec, vocab, embeddings, top_k=10):
    """Print the ``top_k`` vocabulary words most cosine-similar to ``vec``.

    One line is printed per word, most similar first, together with its
    similarity. A small epsilon is added to every norm so zero vectors do
    not cause a division by zero.
    """
    eps = 1e-9

    # Unit-length query vector.
    query = vec / (np.linalg.norm(vec) + eps)

    # Unit-length embedding rows.
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_rows = embeddings / (row_norms + eps)

    # Cosine similarity of every row against the query.
    sims = np.dot(unit_rows, query)

    # Row indices sorted by descending similarity, truncated to top_k.
    ranking = np.argsort(sims)[::-1][:top_k]

    for word_id in ranking:
        print(f"{vocab[word_id]}   (sim = {sims[word_id]:.4f})")

벡터-단어 변환 함수

In [202]:
# Look up two word vectors, add them, and print the 10 words nearest to the sum.
# NOTE: get_embedding returns None for a word that is not in vocab, in which
# case the addition below raises TypeError.
vec1 = get_embedding("actual", vocab, embeddings)
vec2 = get_embedding("world", vocab, embeddings)
vec3 = vec1 + vec2
print_word(vec3, vocab, embeddings, 10)

actual   (sim = 0.9516)
status   (sim = 0.5478)
true   (sim = 0.5398)
ontological   (sim = 0.5128)
whether   (sim = 0.4944)
possibility   (sim = 0.4688)
equivalent   (sim = 0.4601)
description   (sim = 0.4566)
modal   (sim = 0.4428)
reality   (sim = 0.4399)


최종 결과

In [None]:
def get_word_vec(word, vocab, embeddings):
    """Return the embedding row of ``word``.

    Raises ``ValueError`` (from ``vocab.index``) when ``word`` is not in
    ``vocab``.
    """
    return embeddings[vocab.index(word)]


def build_sentence_matrix(sentence_tokens, vocab, embeddings):
    """Stack the embeddings of ``sentence_tokens`` into an (L, d) matrix.

    Tokens missing from ``vocab`` are represented by a zero vector of
    length ``d = embeddings.shape[1]``. An empty token list yields an empty
    ``(0, d)`` matrix instead of the ``ValueError`` that ``np.vstack``
    raises for an empty sequence.

    Args:
        sentence_tokens: Sequence of word strings.
        vocab: List of words; a word's position is its embedding row index.
        embeddings: 2-D numpy array with one row per vocabulary word.

    Returns:
        numpy array of shape ``(len(sentence_tokens), d)``.
    """
    dim = embeddings.shape[1]
    if len(sentence_tokens) == 0:
        return np.zeros((0, dim))

    vecs = []
    for w in sentence_tokens:
        # Single scan per token: index() both tests membership and locates the row.
        try:
            row = vocab.index(w)
        except ValueError:
            # unknown word → zero vector
            vecs.append(np.zeros(dim))
        else:
            vecs.append(embeddings[row])
    return np.vstack(vecs)  # (L, d)


def self_attention(E):
    """Scaled dot-product self-attention with identity projections.

    E: (L, d) sentence embedding matrix, used directly as queries, keys and
    values (Wq = Wk = Wv = I).

    Returns ``(A, O)``: ``A`` is the (L, L) row-wise softmax of
    ``E E^T / sqrt(d)`` and ``O = A E`` is the (L, d) attended output.
    """
    dim = E.shape[1]

    # Pairwise similarity logits, scaled by sqrt(d).
    logits = np.dot(E, E.T) / np.sqrt(dim)

    # Row-wise softmax; subtracting each row's max keeps exp() from overflowing.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    weights = np.exp(shifted)
    A = weights / np.sum(weights, axis=1, keepdims=True)

    return A, np.dot(A, E)


def show_attention(sentence_tokens, attention_matrix, top_k=3):
    """Print, for every token, the ``top_k`` tokens it attends to most.

    Each output line has the form ``[token] → other(weight) other(weight) ``
    with weights taken from the token's row of ``attention_matrix`` and
    listed from highest to lowest.
    """
    for row, token in enumerate(sentence_tokens):
        weights = attention_matrix[row]
        strongest = weights.argsort()[::-1][:top_k]
        print(f"[{token}] → ", end="")
        print("".join(
            f"{sentence_tokens[col]}({weights[col]:.3f}) " for col in strongest
        ))

In [None]:
# Demo: embed a whitespace-tokenized sentence (out-of-vocabulary words become
# zero vectors), run identity-projection self-attention over it, and print the
# 4 highest-weighted tokens for each position.
sentence = "there are many possible worlds in the multiverse".split()
E = build_sentence_matrix(sentence, vocab, embeddings)
A, O = self_attention(E)

show_attention(sentence, A, top_k=4)