In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow_hub as hub

# 데이터 로드 및 샘플링
file_path = r"C:\Users\hwjh2\Desktop\딜리버리엠\인도 DB_표준화_210909.xlsx"
data = pd.read_excel(file_path)
products = data['Product'].dropna().tolist()
products_sampled = np.random.choice(products, size=int(len(products) * 0.05), replace=False)

# 1. TF-IDF + KMeans 클러스터링
def tfidf_clustering(texts, n_clusters=10):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X = vectorizer.fit_transform(texts)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    return vectorizer, labels

# 2. Doc2Vec 임베딩 + KMeans 클러스터링
def doc2vec_clustering(texts, n_clusters=10):
    tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]
    model = Doc2Vec(vector_size=300, window=2, min_count=1, workers=4, epochs=100)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    embeddings = [model.infer_vector(text.split()) for text in texts]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return model, labels

# 3. GloVe 임베딩 + KMeans 클러스터링
def glove_clustering(texts, n_clusters=10):
    glove_embeddings = {}
    with open(r"C:\Users\hwjh2\Desktop\딜리버리엠\glove.6B\glove.6B.300d.txt", 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_embeddings[word] = vector
    
    def get_glove_vector(text):
        words = text.split()
        word_vectors = [glove_embeddings.get(word, np.zeros(300)) for word in words]
        return np.mean(word_vectors, axis=0)

    embeddings = [get_glove_vector(text) for text in texts]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return glove_embeddings, labels

# 4. Universal Sentence Encoder (USE) + KMeans 클러스터링
def use_clustering(texts, n_clusters=10):
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    embeddings = embed(texts).numpy()  # 512차원 벡터
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return embed, embeddings, labels

# 주요 단어 추출 함수 (TF-IDF)
def get_top_keywords_tfidf(texts, vectorizer, labels, n_keywords=5):
    top_keywords = []
    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)[0]
        cluster_texts = [texts[i] for i in cluster_indices]
        X_cluster = vectorizer.transform(cluster_texts)
        feature_names = np.array(vectorizer.get_feature_names_out())
        tfidf_scores = X_cluster.sum(axis=0).A1
        sorted_idx = np.argsort(tfidf_scores)[::-1]
        top_keywords.append(feature_names[sorted_idx][:n_keywords])
    return top_keywords

# 주요 단어 추출 함수 (Doc2Vec, GloVe)
def get_top_keywords_combined(texts, labels, embeddings, n_keywords=5, embedding_model=None, glove_embeddings=None):
    top_keywords = []
    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)[0]
        cluster_vectors = [embeddings[i] for i in cluster_indices]
        cluster_center = np.mean(cluster_vectors, axis=0)

        # Doc2Vec: 클러스터 중심에서 가장 가까운 단어를 찾기
        if embedding_model is not None:
            # 중심 벡터와 가장 유사한 단어들을 찾기
            closest_words_doc2vec = embedding_model.dv.most_similar([cluster_center], topn=n_keywords)
            doc2vec_keywords = [word for word, _ in closest_words_doc2vec]
        else:
            doc2vec_keywords = []

        # GloVe: 클러스터 중심과 가장 가까운 단어를 찾기
        if glove_embeddings is not None:
            closest_words_glove = sorted(glove_embeddings.keys(), key=lambda x: np.linalg.norm(glove_embeddings[x] - cluster_center))[:n_keywords]
        else:
            closest_words_glove = []

        # Doc2Vec와 GloVe 키워드를 합치기
        top_keywords.append(list(set(doc2vec_keywords + closest_words_glove)))

    return top_keywords


# 주요 단어 추출 함수 (USE)
def get_use_top_keywords(texts, labels, embeddings, n_keywords=5):
    top_keywords = []
    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)[0]
        cluster_vectors = [embeddings[i] for i in cluster_indices]
        cluster_center = np.mean(cluster_vectors, axis=0)
        
        # USE는 문장 중심으로 비교
        closest_indices = np.argsort(np.linalg.norm(embeddings[cluster_indices] - cluster_center, axis=1))[:n_keywords]
        top_keywords.append([texts[cluster_indices[i]] for i in closest_indices])
    
    return top_keywords

# 클러스터링 및 주요 단어 추출
vectorizer, tfidf_labels = tfidf_clustering(products_sampled, n_clusters=15)
doc2vec_model, doc2vec_labels = doc2vec_clustering(products_sampled, n_clusters=15)
glove_embeddings, glove_labels = glove_clustering(products_sampled, n_clusters=15)
use_embed, use_embeddings, use_labels = use_clustering(products_sampled, n_clusters=15)

tfidf_keywords = get_top_keywords_tfidf(products_sampled, vectorizer, tfidf_labels, n_keywords=5)
doc2vec_keywords = get_top_keywords_combined(products_sampled, doc2vec_labels, 
                                            [doc2vec_model.infer_vector(text.split()) for text in products_sampled], 
                                            n_keywords=5, embedding_model=doc2vec_model, glove_embeddings=None)
glove_keywords = get_top_keywords_combined(products_sampled, glove_labels, 
                                          [np.mean([glove_embeddings.get(word, np.zeros(300)) for word in text.split()], axis=0) 
                                           for text in products_sampled], n_keywords=5, glove_embeddings=glove_embeddings)
use_keywords = get_use_top_keywords(products_sampled, use_labels, use_embeddings, n_keywords=5)

# 결과 DataFrame 생성
keywords_df = pd.DataFrame({
    'Cluster': [f'Cluster {i+1}' for i in range(15)],
    'TF-IDF': [' '.join(tfidf_keywords[i]) for i in range(15)],
    'Doc2Vec': [' '.join(doc2vec_keywords[i]) for i in range(15)],
    'GloVe': [' '.join(glove_keywords[i]) for i in range(15)],
    'USE': [' '.join(use_keywords[i]) for i in range(15)]
})

# 제품과 클러스터 정보를 포함하는 DataFrame 생성
df = pd.DataFrame({
    'Product': products_sampled,
    'TF-IDF Cluster': tfidf_labels,
    'Doc2Vec Cluster': doc2vec_labels,
    'GloVe Cluster': glove_labels
})

# 'Cluster' 열을 int 형식으로 변환
keywords_df['Cluster'] = keywords_df['Cluster'].str.extract('(\d+)').astype(int)

# 'TF-IDF Cluster' 열도 int 형식으로 변환
df['TF-IDF Cluster'] = df['TF-IDF Cluster'].astype(int)

# 주요 키워드와 함께 결과 결합
df_keywords = pd.merge(df, keywords_df, left_on='TF-IDF Cluster', right_on='Cluster', how='left')

# 최종 Excel 파일 저장
output_file_path = r"C:\Users\hwjh2\Desktop\클러스터_주요_단어_combined.xlsx"
df_keywords.to_excel(output_file_path, index=False)

print(f"주요 단어 엑셀 파일이 저장되었습니다: {output_file_path}")


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


주요 단어 엑셀 파일이 저장되었습니다: C:\Users\hwjh2\Desktop\클러스터_주요_단어_combined.xlsx


In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow_hub as hub
import gensim  # gensim 라이브러리 임포트 추가

# 데이터 로드 및 샘플링
file_path = r"C:\Users\hwjh2\Desktop\딜리버리엠\인도 DB_표준화_210909.xlsx"
data = pd.read_excel(file_path)
products = data['Product'].dropna().tolist()
products_sampled = np.random.choice(products, size=int(len(products) * 0.1), replace=False)

# 1. TF-IDF + KMeans 클러스터링
def tfidf_clustering(texts, n_clusters=10):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X = vectorizer.fit_transform(texts)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    return vectorizer, labels

# 2. Doc2Vec 임베딩 + KMeans 클러스터링
def doc2vec_clustering(texts, n_clusters=10):
    tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]
    model = Doc2Vec(vector_size=300, window=2, min_count=1, workers=4, epochs=100)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    embeddings = [model.infer_vector(text.split()) for text in texts]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return model, labels

# 3. GloVe 임베딩 + KMeans 클러스터링
def glove_clustering(texts, n_clusters=10):
    glove_embeddings = {}
    with open(r"C:\Users\hwjh2\Desktop\딜리버리엠\glove.6B\glove.6B.300d.txt", 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_embeddings[word] = vector
    
    def get_glove_vector(text):
        words = text.split()
        word_vectors = [glove_embeddings.get(word, np.zeros(300)) for word in words]
        return np.mean(word_vectors, axis=0)

    embeddings = [get_glove_vector(text) for text in texts]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return glove_embeddings, labels

# 4. Universal Sentence Encoder (USE) + KMeans 클러스터링
def use_clustering(texts, n_clusters=10):
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    embeddings = embed(texts).numpy()  # 512차원 벡터
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return embed, embeddings, labels

# 주요 단어 추출 함수 (TF-IDF)
def get_top_keywords_tfidf(texts, vectorizer, labels, n_keywords=5):
    top_keywords = []
    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)[0]
        cluster_texts = [texts[i] for i in cluster_indices]
        X_cluster = vectorizer.transform(cluster_texts)
        feature_names = np.array(vectorizer.get_feature_names_out())
        tfidf_scores = X_cluster.sum(axis=0).A1
        sorted_idx = np.argsort(tfidf_scores)[::-1]
        top_keywords.append(feature_names[sorted_idx][:n_keywords])
    return top_keywords

# 주요 단어 추출 함수 (Doc2Vec, GloVe)
def get_top_keywords_generic(texts, labels, embeddings, n_keywords=5, embedding_model=None):
    top_keywords = []
    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)[0]
        cluster_texts = [texts[i] for i in cluster_indices]
        cluster_vectors = [embeddings[i] for i in cluster_indices]
        cluster_center = np.mean(cluster_vectors, axis=0)

        # Doc2Vec인 경우: 가장 가까운 단어를 찾는 과정
        if embedding_model is not None:
            if isinstance(embedding_model, gensim.models.KeyedVectors):  # GloVe와 같은 모델
                closest_words = sorted(embedding_model.keys(), key=lambda x: np.linalg.norm(embedding_model[x] - cluster_center))[:n_keywords]
                top_keywords.append(closest_words)
            else:  # Doc2Vec 모델
                closest_words = []
                for word in embedding_model.wv.index_to_key:  # Doc2Vec의 경우
                    word_vector = embedding_model.wv[word]
                    similarity = np.dot(cluster_center, word_vector) / (np.linalg.norm(cluster_center) * np.linalg.norm(word_vector))
                    closest_words.append((word, similarity))
                
                # 유사도 기준으로 정렬하여 가장 유사한 단어 5개를 선택
                closest_words = sorted(closest_words, key=lambda x: x[1], reverse=True)[:n_keywords]
                top_keywords.append([word for word, _ in closest_words])
        else:
            # 임베딩이 없으면
            top_keywords.append([])

    return top_keywords

# 주요 단어 추출 함수 (USE)
def get_use_top_keywords(texts, labels, embeddings, n_keywords=5):
    top_keywords = []
    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)[0]
        cluster_vectors = [embeddings[i] for i in cluster_indices]
        cluster_center = np.mean(cluster_vectors, axis=0)
        
        # USE는 문장 중심으로 비교
        closest_indices = np.argsort(np.linalg.norm(embeddings[cluster_indices] - cluster_center, axis=1))[:n_keywords]
        top_keywords.append([texts[cluster_indices[i]] for i in closest_indices])
    
    return top_keywords

# 클러스터링 및 주요 단어 추출
vectorizer, tfidf_labels = tfidf_clustering(products_sampled, n_clusters=15)
doc2vec_model, doc2vec_labels = doc2vec_clustering(products_sampled, n_clusters=15)
glove_embeddings, glove_labels = glove_clustering(products_sampled, n_clusters=15)
use_embed, use_embeddings, use_labels = use_clustering(products_sampled, n_clusters=15)

tfidf_keywords = get_top_keywords_tfidf(products_sampled, vectorizer, tfidf_labels, n_keywords=5)
doc2vec_keywords = get_top_keywords_generic(products_sampled, doc2vec_labels, 
                                            [doc2vec_model.infer_vector(text.split()) for text in products_sampled], 
                                            n_keywords=5, embedding_model=doc2vec_model)
glove_keywords = get_top_keywords_generic(products_sampled, glove_labels, 
                                          [np.mean([glove_embeddings.get(word, np.zeros(300)) for word in text.split()], axis=0) 
                                           for text in products_sampled], n_keywords=5)
use_keywords = get_use_top_keywords(products_sampled, use_labels, use_embeddings, n_keywords=5)

# 결과 DataFrame 생성
keywords_df = pd.DataFrame({
    'Cluster': [f'Cluster {i+1}' for i in range(15)],
    'TF-IDF': [' '.join(tfidf_keywords[i]) for i in range(15)],
    'Doc2Vec': [' '.join(doc2vec_keywords[i]) for i in range(15)],
    'GloVe': [' '.join(glove_keywords[i]) for i in range(15)],
    'USE': [' '.join(use_keywords[i]) for i in range(15)]
})

# 제품과 클러스터 정보를 포함하는 DataFrame 생성
df = pd.DataFrame({
    'Product': products_sampled,
    'TF-IDF Cluster': tfidf_labels,
    'Doc2Vec Cluster': doc2vec_labels,
    'GloVe Cluster': glove_labels
})

# 'Cluster' 열을 int 형식으로 변환
keywords_df['Cluster'] = keywords_df['Cluster'].str.extract('(\d+)').astype(int)

# 'TF-IDF Cluster' 열도 int 형식으로 변환
df['TF-IDF Cluster'] = df['TF-IDF Cluster'].astype(int)

# 주요 키워드와 함께 결과 결합
df_keywords = pd.merge(df, keywords_df, left_on='TF-IDF Cluster', right_on='Cluster', how='left')

# 최종 Excel 파일 저장
output_file_path = r"C:\Users\hwjh2\Desktop\클러스터_주요_단어doc.xlsx"
df_keywords.to_excel(output_file_path, index=False)

# 완료 메시지 출력
print("Excel 파일이 성공적으로 생성되었습니다.")

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Excel 파일이 성공적으로 생성되었습니다.
