In [1]:
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "貓 愛 睡覺",
    "狗 喜歡 跑步",
    "貓 很 可愛",
    "狗 很 忠誠",
    "睡覺 是一種 享受",
    "運動 對健康 有幫助",
    "跑步 是一種 運動",
    "可愛 的 小狗",
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
tfidf_array = tfidf_matrix.toarray()
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_array)
labels = kmeans.labels_
tf_labels = tf.convert_to_tensor(labels, dtype=tf.int32)
for i, doc in enumerate(documents):
    print(f"句子: {doc} -> 類別: {tf_labels.numpy()[i]}")


2025-03-30 11:32:10.637438: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-30 11:32:10.653709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743305530.667296   26947 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743305530.671580   26947 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743305530.684144   26947 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

句子: 貓 愛 睡覺 -> 類別: 0
句子: 狗 喜歡 跑步 -> 類別: 0
句子: 貓 很 可愛 -> 類別: 0
句子: 狗 很 忠誠 -> 類別: 0
句子: 睡覺 是一種 享受 -> 類別: 0
句子: 運動 對健康 有幫助 -> 類別: 1
句子: 跑步 是一種 運動 -> 類別: 1
句子: 可愛 的 小狗 -> 類別: 0


I0000 00:00:1743305533.159499   26947 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [2]:
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "貓 愛 睡覺",
    "狗 喜歡 跑步",
    "貓 很 可愛",
    "狗 很 忠誠",
    "睡覺 是一種 享受",
    "運動 對健康 有幫助",
    "跑步 是一種 運動",
    "可愛 的 小狗",
]
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(documents)
print("count_matrix\n", count_matrix)
print("count_matrix_array\n", count_matrix.toarray())
num_topics = 2  # 設定兩個主題
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_features = lda_model.fit_transform(count_matrix)
print("lda_features\n", lda_features)
kmeans = KMeans(n_clusters=num_topics, random_state=42)
kmeans.fit(lda_features)
labels = kmeans.labels_
tf_labels = tf.convert_to_tensor(labels, dtype=tf.int32)
for i, doc in enumerate(documents):
    print(f"句子: {doc} -> 類別: {tf_labels.numpy()[i]}")


count_matrix
   (0, 8)	1
  (1, 2)	1
  (1, 9)	1
  (2, 1)	1
  (3, 5)	1
  (4, 8)	1
  (4, 6)	1
  (4, 0)	1
  (5, 10)	1
  (5, 3)	1
  (5, 7)	1
  (6, 9)	1
  (6, 6)	1
  (6, 10)	1
  (7, 1)	1
  (7, 4)	1
count_matrix_array
 [[0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 1 0 0 0 1 0 0 1]
 [0 0 0 0 0 0 1 0 0 1 1]
 [0 1 0 0 1 0 0 0 0 0 0]]
lda_features
 [[0.74451977 0.25548023]
 [0.17205978 0.82794022]
 [0.25531633 0.74468367]
 [0.73808296 0.26191704]
 [0.85744139 0.14255861]
 [0.85574108 0.14425892]
 [0.18459155 0.81540845]
 [0.17208144 0.82791856]]
句子: 貓 愛 睡覺 -> 類別: 1
句子: 狗 喜歡 跑步 -> 類別: 0
句子: 貓 很 可愛 -> 類別: 0
句子: 狗 很 忠誠 -> 類別: 1
句子: 睡覺 是一種 享受 -> 類別: 1
句子: 運動 對健康 有幫助 -> 類別: 1
句子: 跑步 是一種 運動 -> 類別: 0
句子: 可愛 的 小狗 -> 類別: 0


In [3]:
import tensorflow as tf
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "貓 喜歡 睡覺",
    "狗 喜歡 跑步",
    "貓 很 可愛",
    "狗 很 忠誠",
    "睡覺 是一種 享受",
    "運動 對健康 有幫助",
    "跑步 是一種 運動",
    "可愛 的 小狗",
]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents).toarray()
dbscan = DBSCAN(eps=0.9, min_samples=1, metric="euclidean")  # eps 決定鄰近範圍
labels = dbscan.fit_predict(tfidf_matrix)
tf_labels = tf.convert_to_tensor(labels, dtype=tf.int32)
for i, doc in enumerate(documents):
    print(f"句子: {doc} -> 類別: {tf_labels.numpy()[i]}")


句子: 貓 喜歡 睡覺 -> 類別: 0
句子: 狗 喜歡 跑步 -> 類別: 1
句子: 貓 很 可愛 -> 類別: 2
句子: 狗 很 忠誠 -> 類別: 3
句子: 睡覺 是一種 享受 -> 類別: 4
句子: 運動 對健康 有幫助 -> 類別: 5
句子: 跑步 是一種 運動 -> 類別: 6
句子: 可愛 的 小狗 -> 類別: 2


In [4]:
import tensorflow as tf
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.mixture import GaussianMixture

documents = [
    "貓 喜歡 睡覺",
    "狗 喜歡 跑步",
    "貓 很 可愛",
    "狗 很 忠誠",
    "睡覺 是一種 享受",
    "運動 對健康 有幫助",
    "慢跑 是一種 運動",
    "可愛 的 小狗",
]
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(documents)
num_topics = 2  # 設定兩個主題
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_features = lda_model.fit_transform(count_matrix)
gmm = GaussianMixture(n_components=num_topics, random_state=42)
gmm.fit(lda_features)
labels = gmm.predict(lda_features)
tf_labels = tf.convert_to_tensor(labels, dtype=tf.int32)
for i, doc in enumerate(documents):
    print(f"句子: {doc} -> 類別: {tf_labels.numpy()[i]}")
proba1 = gmm.predict_proba(lda_features)
print(proba1)


句子: 貓 喜歡 睡覺 -> 類別: 1
句子: 狗 喜歡 跑步 -> 類別: 0
句子: 貓 很 可愛 -> 類別: 0
句子: 狗 很 忠誠 -> 類別: 0
句子: 睡覺 是一種 享受 -> 類別: 1
句子: 運動 對健康 有幫助 -> 類別: 1
句子: 慢跑 是一種 運動 -> 類別: 0
句子: 可愛 的 小狗 -> 類別: 0
[[3.15503443e-031 1.00000000e+000]
 [1.00000000e+000 6.95444741e-096]
 [1.00000000e+000 6.94844544e-084]
 [1.00000000e+000 4.19737756e-082]
 [4.19296289e-039 1.00000000e+000]
 [6.06089439e-039 1.00000000e+000]
 [1.00000000e+000 1.13798785e-068]
 [1.00000000e+000 9.60606780e-109]]


In [5]:
import gensim.downloader as api

glove_model = api.load("glove-wiki-gigaword-300")
word1 = "king"
word2 = "queen"
if word1 in glove_model and word2 in glove_model:
    similarity = glove_model.similarity(word1, word2)
    print(f"'{word1}' 和 '{word2}' 的相似度: {similarity:.4f}")
else:
    print("詞彙不在 GloVe 詞典中")
similar_words = glove_model.most_similar("king", topn=5)
print("\n與 'king' 最相似的詞:")
for word, score in similar_words:
    print(f"  - {word}: {score:.4f}")


'king' 和 'queen' 的相似度: 0.6336

與 'king' 最相似的詞:
  - queen: 0.6336
  - prince: 0.6197
  - monarch: 0.5900
  - kingdom: 0.5791
  - throne: 0.5606


In [6]:
import gensim.downloader as api

glove_model = api.load("glove-wiki-gigaword-300")
word1 = "國王"
word2 = "女王"
if word1 in glove_model and word2 in glove_model:
    similarity = glove_model.similarity(word1, word2)
    print(f"'{word1}' 和 '{word2}' 的相似度: {similarity:.4f}")
else:
    print("詞彙不在 GloVe 詞典中")
similar_words = glove_model.most_similar("king", topn=5)
print("\n與 'king' 最相似的詞:")
for word, score in similar_words:
    print(f"  - {word}: {score:.4f}")


詞彙不在 GloVe 詞典中

與 'king' 最相似的詞:
  - queen: 0.6336
  - prince: 0.6197
  - monarch: 0.5900
  - kingdom: 0.5791
  - throne: 0.5606


In [7]:
import jieba
from gensim.models import FastText

texts = ["國王 統治 王國", "女王 統治 王國", "皇帝 是 強大 的", "公主 住 在 城堡"]
tokenized_texts = [list(jieba.cut(text)) for text in texts]
model = FastText(
    sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1
)
model.save("fasttext.model")
similarity = model.wv.similarity("國王", "女王")
print(f"國王 和 女王 的相似度: {similarity:.4f}")
similar_words = model.wv.most_similar("國王", topn=5)
print("\n與 '國王' 最相似的詞:")
for word, score in similar_words:
    print(f"  - {word}: {score:.4f}")
similar_words = model.wv.most_similar("大王", topn=5)
print("\n與 '大王' 最相似的詞:")
for word, score in similar_words:
    print(f"  - {word}: {score:.4f}")


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.424 seconds.
Prefix dict has been built successfully.


國王 和 女王 的相似度: 0.1149

與 '國王' 最相似的詞:
  - 女王: 0.1149
  - 強大: 0.1010
  - 的: 0.0769
  - 是: 0.0574
  -  : 0.0477

與 '大王' 最相似的詞:
  -  : 0.1264
  - 統治: 0.1140
  - 住: 0.1047
  - 公主: 0.0948
  - 強大: 0.0806
