In [65]:
from collections import Counter
import json
import math
import random
import re
import pandas as pd
import matplotlib.pyplot as plt

import jieba
import jieba.analyse
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
jieba.load_userdict("辭典_2.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\USER\AppData\Local\Temp\jieba.cache
Loading model cost 1.700 seconds.
Prefix dict has been built successfully.


In [67]:
class KMeans:
    def cal_dist(self, p0, p1):
        """
        比較兩點的距離
        """
        return np.sqrt(np.sum((p0-p1)**2))

    def kmeans(self, datapoints, k=2):
        # 定義資料維度
        d = datapoints.shape[1]
        # 最大的迭代次數
        Max_Iterations = 1000

        cluster = np.zeros(datapoints.shape[0])
        prev_cluster = np.ones(datapoints.shape[0])

        cluster_centers = []
        for i in range(k):
            cluster_centers += [random.choice(datapoints)]

        iteration = 0
        while np.array_equal(cluster, prev_cluster) is False or iteration > Max_Iterations:
            iteration += 1
            prev_cluster = cluster.copy()

            # 將每一個點做分群
            for idx, point in enumerate(datapoints):
                min_dist = float("inf")
                for c, cluster_center in enumerate(cluster_centers):
                    dist = self.cal_dist(point, cluster_center)
                    if dist < min_dist:
                        min_dist = dist  
                        cluster[idx] = c   # 指定該點屬於哪個分群

            # 更新分群的中心
            for k in range(len(cluster_centers)):
                new_center = np.zeros(d)
                members = 0
                for point, c in zip(datapoints, cluster):
                    if c == k:
                        new_center += point
                        members += 1
                if members > 0:
                    new_center = new_center / members
                cluster_centers[k] = new_center

        return cluster

In [4]:
# 讀取資料
gift = pd.read_csv('禮物名單- k means.csv', encoding = 'utf-8')
gift.head()

Unnamed: 0,產品名稱,產品特徵
0,瘋狂玩樂遊戲機立體造型卡片,遊戲機 造型卡片
1,無力炸醬麵二合一抱枕毯,炸醬麵 抱枕毯
2,夜光筆記本,夜光 筆記本
3,可愛超人狗狗存錢筒,超人 狗狗 存錢筒
4,多功能磁吸式牙刷收納置物架,多功能 磁吸式 牙刷 收納 置物架


In [5]:
gift_name = gift['產品名稱']
gift_feature = gift['產品特徵']

In [6]:
vectorizer = TfidfVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(gift_feature))
bag_of_words = vectorizer.get_feature_names()
weight = tfidf.toarray()

gift_most_related_words = {}
for i in range(len(weight)): 
    w = dict(zip(bag_of_words, weight[i]))
    w = sorted(w.items(), key=lambda x: x[1], reverse=True)
    top_10 = []
    for word, prob in w[:10]:
        if prob > 0:
            top_10.append(word)
    gift_most_related_words.update({gift_name[i]: top_10})

In [7]:
gift_most_related_words

{'瘋狂玩樂遊戲機立體造型卡片 ': ['造型卡片', '遊戲機'],
 '無力炸醬麵二合一抱枕毯 ': ['抱枕毯', '炸醬麵'],
 '夜光筆記本': ['夜光', '筆記本'],
 '可愛超人狗狗存錢筒 ': ['存錢筒', '超人', '狗狗'],
 '多功能磁吸式牙刷收納置物架': ['多功能', '收納', '牙刷', '磁吸式', '置物架'],
 '輕巧好攜帶紫外線消毒棒 ': ['好攜帶', '消毒棒', '紫外線', '輕巧'],
 '吐司麵包表情抱枕 ': ['吐司', '抱枕', '麵包'],
 '台幣美金鈔票造型毛巾': ['台幣', '美金', '造型毛巾', '鈔票'],
 '會害羞的狗狗暖手行動充 ': ['暖手行動充', '狗狗'],
 '畢業禮物復古文藝羽毛筆': ['禮物', '羽毛筆', '畢業'],
 '畢業熊花束': ['花束', '畢業'],
 '學生畢業拍照道具': ['學生', '拍照道具', '畢業'],
 '二合一寵物快乾吹毛梳': ['吹毛梳', '寵物', '快乾'],
 '手繪動物系客製慶生盤': ['動物系', '慶生盤'],
 '貓咪隧道': ['貓咪', '隧道'],
 '愛的模樣100種戀愛挑戰': ['戀愛'],
 '滿滿的我愛你膠囊情書禮盒': ['情書', '我愛你'],
 '親嘴情侶對杯': ['對杯', '情侶', '親嘴']}

In [8]:
X = vectorizer.fit_transform([' '.join(w) for w in list(gift_most_related_words.values())]) 
X = X.toarray()

In [70]:
k = 4
K = KMeans()
gift_cluster_result = K.kmeans(X, k)
cluster = [[] for _ in range(k)]

for idx, c in enumerate(gift_cluster_result):
    cluster[int(c)].append(gift_name[idx])
    
for c, result in enumerate(cluster):
    print('Cluster {}: {}'.format(c, ' '.join(result)))

Cluster 0: 吐司麵包表情抱枕  二合一寵物快乾吹毛梳 親嘴情侶對杯
Cluster 1: 夜光筆記本 多功能磁吸式牙刷收納置物架 輕巧好攜帶紫外線消毒棒  會害羞的狗狗暖手行動充 
Cluster 2: 無力炸醬麵二合一抱枕毯  台幣美金鈔票造型毛巾 手繪動物系客製慶生盤 滿滿的我愛你膠囊情書禮盒
Cluster 3: 瘋狂玩樂遊戲機立體造型卡片  可愛超人狗狗存錢筒  畢業禮物復古文藝羽毛筆 畢業熊花束 學生畢業拍照道具 貓咪隧道 愛的模樣100種戀愛挑戰
