In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

def check_embedding_size(model_name="BAAI/bge-m3", text="This is a sample text", embeddings=None):
    """Print the dimensionality of a single query embedding.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings`` when
            ``embeddings`` is not supplied.
        text: Sample text to embed.
        embeddings: Optional pre-built embedding object exposing
            ``embed_query(text)``. Supplying one avoids constructing a new
            ``HuggingFaceEmbeddings`` instance on every call.

    Returns:
        None. Two lines are printed: the vector length and its NumPy shape.
    """
    # Imported locally (as the original did) so the function does not depend
    # on a notebook-level numpy import.
    import numpy as np

    # Build the embedding model only when the caller did not hand one in.
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Embed the sample text.
    embedding = embeddings.embed_query(text)

    # Report the size as a plain length and as a NumPy shape.
    print(f"Embedding dimension: {len(embedding)}")
    print(f"Embedding shape: {np.array(embedding).shape}")


In [None]:
def check_multiple_embeddings(model_name="BAAI/bge-m3", texts=None, embeddings=None):
    """Print the embedding size of several texts and the shape of the batch.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings`` when
            ``embeddings`` is not supplied.
        texts: Texts to embed. Defaults to three short sample strings.
        embeddings: Optional pre-built embedding object exposing
            ``embed_documents(texts)``. Supplying one avoids constructing a
            new ``HuggingFaceEmbeddings`` instance on every call.

    Returns:
        None. One line is printed per text, followed by the overall shape.
    """
    # Imported locally (as the original did) so the function does not depend
    # on a notebook-level numpy import.
    import numpy as np

    # A None sentinel keeps the default list from being shared between calls.
    if texts is None:
        texts = [
            "First text",
            "Second text",
            "Third text",
        ]

    # Build the embedding model only when the caller did not hand one in.
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Embed all texts in one call.
    embedded_texts = embeddings.embed_documents(texts)

    # Report the size of each individual embedding (1-based numbering).
    for position, vector in enumerate(embedded_texts, start=1):
        print(f"Text {position} embedding size: {len(vector)}")

    # Report the shape of the whole batch as a NumPy array.
    print(f"Overall embeddings shape: {np.array(embedded_texts).shape}")


In [19]:
class EmbeddingAnalyzer:
    """Small helper for inspecting and comparing text embeddings.

    The embedding backend is stored on ``self.embeddings`` and must expose
    ``embed_query(text)`` returning a sequence of numbers.
    """

    # Size used for the ``memory_size`` estimate: 4 bytes per float.
    BYTES_PER_FLOAT = 4

    def __init__(self, model_name="BAAI/bge-m3"):
        """Create the underlying ``HuggingFaceEmbeddings`` for ``model_name``."""
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)

    def analyze_embedding(self, text):
        """Embed ``text`` and return basic statistics about the vector.

        Args:
            text: Text to embed.

        Returns:
            dict with keys ``dimension``, ``min_value``, ``max_value``,
            ``mean_value`` and ``memory_size`` (dimension * 4 bytes).

        Raises:
            ValueError: If the backend returns an empty embedding, for which
                min/max/mean are undefined.
        """
        embedding = self.embeddings.embed_query(text)
        dimension = len(embedding)

        # Fail with a descriptive message instead of the opaque
        # "min() arg is an empty sequence" from the statistics below.
        if dimension == 0:
            raise ValueError("Cannot analyze an empty embedding")

        return {
            "dimension": dimension,
            "min_value": min(embedding),
            "max_value": max(embedding),
            "mean_value": sum(embedding) / dimension,
            "memory_size": dimension * self.BYTES_PER_FLOAT,
        }

    def compare_embeddings(self, text1, text2):
        """Embed two texts and return their sizes and cosine similarity.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            dict with keys ``text1_size``, ``text2_size`` and ``similarity``.
            ``similarity`` is the cosine similarity of the two vectors; it is
            0.0 when either vector has zero norm (the cosine is undefined
            there and the plain formula would yield NaN).
        """
        # Imported locally (as the original did) so the class does not depend
        # on a notebook-level numpy import.
        import numpy as np

        emb1 = self.embeddings.embed_query(text1)
        emb2 = self.embeddings.embed_query(text2)

        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            # Guard against 0/0 -> NaN (and a RuntimeWarning) for zero vectors.
            similarity = np.float64(0.0)
        else:
            similarity = np.dot(emb1, emb2) / norm_product

        return {
            "text1_size": len(emb1),
            "text2_size": len(emb2),
            "similarity": similarity,
        }

# Usage: build an analyzer with the default model and run both helpers.
# All names assigned below are notebook-level globals.
analyzer = EmbeddingAnalyzer()

# Analyze a single embedding
text = "Example text for analysis"
analysis = analyzer.analyze_embedding(text)
print("Embedding analysis:", analysis)

# Compare two embeddings
text1 = "First text"
text2 = "Second text"
comparison = analyzer.compare_embeddings(text1, text2)
print("Embedding comparison:", comparison)


Embedding analysis: {'dimension': 1024, 'min_value': -0.19820284843444824, 'max_value': 0.23256169259548187, 'mean_value': -0.0007201888914210031, 'memory_size': 4096}
Embedding comparison: {'text1_size': 1024, 'text2_size': 1024, 'similarity': 0.7652550986710522}


In [21]:
import fasttext
# Load a fastText model from a file in the current working directory and
# predict a label for a sample string.
model_path = "lid.176.bin"
model = fasttext.load_model(model_path)
# Model test (disabled)
# NOTE(review): fastText's `test` appears to expect a path to a labelled file,
# not a raw sentence — confirm before re-enabling the two lines below.
# result = model.test("Xin chào")
# print("Độ chính xác:", result[1])
# Note: this rebinds the notebook-global `text` assigned by the earlier usage cell.
text = "xin   東京は日本の首都です。世界有数の大都市であり、政治、経済、文化の中心地です。 多くの観光名所があり、毎年多くの観光客が訪れています。東京スカイツリーや浅草寺など、伝統と現代が共存する街並みが特徴です"
# Request a single prediction (k=1); per the recorded output the result is a
# (labels, scores) pair.
predictions = model.predict(text, k=1)
# Top label only, then the full (labels, scores) result.
print(predictions[0][0])
print(predictions)


__label__ja
(('__label__ja',), array([1.00001252]))
