In [43]:
from time import time
import pickle
import json
from collections import defaultdict

from itertools import chain

from sklearn.cluster import KMeans, DBSCAN

import numpy as np
from numpy import dot
from numpy.linalg import norm

np.random.seed(42)

In [2]:
from tqdm import tqdm

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# from nltk import word_tokenize 
# from nltk.tokenize import regexp_tokenize
# from nltk.stem import WordNetLemmatizer 

In [25]:
# class LemmaTokenizer(object):
#     def __init__(self):
#         self.wnl = WordNetLemmatizer()
#     def __call__(self, doc):
#         return [self.wnl.lemmatize(t) for t in regexp_tokenize(doc, pattern='[a-zA-Z\']+')]

def text_to_tfidf(text_var):
    # vectorizer = TfidfVectorizer(sublinear_tf=True, token_pattern=u'(?u)\\b[a-zA-Z][a-zA-Z]+\\b',)
    # vectorizer = TfidfVectorizer(sublinear_tf=True, token_pattern=u'(?u)\\b\\w\\w+\\b',)
    #   vectorizer = TfidfVectorizer(sublinear_tf=True, tokenizer=LemmaTokenizer(),ngram_range=(1, 1))
    # vectorizer = TfidfVectorizer(sublinear_tf=True, tokenizer=LemmaTokenizer(),ngram_range=(1, 1),use_idf=False,binary=True,norm=None)
#     vectorizer = TfidfVectorizer()
    vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(2, 3), token_pattern=u'(?u)\\b\\w+\\b')
#     vectorizer = TfidfVectorizer(use_idf=False, ngram_range=(2, 3))

    
    tfidf_vector = vectorizer.fit_transform(text_var)
    features = vectorizer.get_feature_names()

    return tfidf_vector, features



In [6]:
with open('stop_words_strict.txt') as stopfile:
    lines = stopfile.readlines()
    stop_words_strict = set([line.strip() for line in lines])
    
def check_stopwords(tokens, use_any=False):
    is_stop = [w in stop_words_strict for w in tokens]
    if use_any:
        return any(is_stop)
    else:
        return all(is_stop)

In [35]:
def get_label(vectors_in_cluster, sentences_in_cluster, texts_in_cluster, method='tfidf'):
    
    label_in_cluster = []
    if method == 'tfidf':
        #把每個cluster視為一篇文章處理
        tfidf_in_cluster, features = text_to_tfidf(texts_in_cluster) 
        tfidf_in_cluster.sort_indices()
        n = tfidf_in_cluster.shape[0]
        
        for i in range(n):
            row = tfidf_in_cluster.getrow(i).toarray()
            sort_ind = np.argsort(row)
            sort_ind = sort_ind.tolist()[0]
            discard = True
            ep = -1
            while discard:  #如果都是stop words 或是有repeat詞就跳過
                f_ind = sort_ind[ep]
                feat = features[f_ind]
                is_stop = check_stopwords(feat.split())
                repeat = len(feat.split()) != len(set(feat.split()))
                discard = is_stop or repeat
                ep = ep-1
                
            label_in_cluster.append(feat)
            
    elif method =='rake':
        from rake_nltk import Rake
        with open('stop_words.txt') as stopfile:
            stopwords = stopfile.readlines()
            stopwords = [line.strip() for line in  stopwords]
        rake = Rake(stopwords)
        for i in range(n_cluster):
            text= texts_in_cluster[i]
            rake.extract_keywords_from_text(text)
            phrases = rake.get_ranked_phrases()
            tokens_count = np.array([len(p.split(' ')) for p in phrases])
            sort_count_id = np.argsort(tokens_count)
            pid = sort_count_id[0]
            label = phrases[pid]
#             for phrase in rake.get_ranked_phrases():
# #                 if len(phrase.split(' ')) == 2 or len(phrase.split(' ')) == 3:
#                 if len(phrase.split(' ')) <=3 :
#                     label = phrase
            label_in_cluster.append(label)
    
    elif method == 'centroid_sim':
        # compute similiarity of each sentence to their centroids
        sim_list = []
        for cidx, vec_in_one_c in enumerate(vectors_in_cluster):
            cmean = np.mean(vec_in_one_c, axis=0)
            n = vec_in_one_c.shape[0]
            for i in range(n):
                v = vec_in_one_c[i,:]
                sim = dot(v, cmean)/(norm(v)*norm(cmean))
                sim_list.append(sim)

        for cidx, sim_list in enumerate(sim_in_cluster):
            sort_sim_ind = np.argsort(sim_list)
            top_labels = []
            compare_count = 5
            if len(sim_list) < 5:
                compare_count =  len(sim_list)
            for j in range(-1, (-1 - compare_count),-1):
                s_ind = sort_sim_ind[j]
                top_labels.append(sentences_in_cluster[cidx][s_ind])
            
            short_label = ''
            for label in  top_labels:
                if len(label.split())>2: 
                    if  len(short_label)==0 or len(short_label) > len(label):
                        short_label=label
            label_in_cluster.append(short_label)

    return label_in_cluster

In [29]:
def cluster_sentences(vectors, sentences, n_cluster=30, choose_n_cluster=8, method='kmeans'):

    assert len(vectors) == len(sentences)

    if method.lower() == 'kmeans':
        cluster_model = KMeans(init='k-means++', n_clusters=n_cluster, n_init=10)
    elif method.lower() == 'dbscan':
        cluster_model = DBSCAN(algorithm='brute', metric='cosine',min_samples=5 , eps=0.15)
    else:
        raise ValueError('method should be kmeans or dbscan')

    cluster_model.fit(vectors)
    labels = cluster_model.labels_

    ##################################

    all_c = list(set(labels))

    vectors_in_cluster=[]
    for c in all_c:
        vec_in_one_c = [vectors[i] for i, l  in enumerate(labels) if l == c]
        vec_in_one_c =np.array(vec_in_one_c)
        vectors_in_cluster.append(vec_in_one_c)

    sentences_in_cluster=[]
    texts_in_cluster=[]
    for c in all_c:
        sent_in_one_c = [sentences[i] for i, l  in enumerate(labels) if l ==c]

        sentences_in_cluster.append(sent_in_one_c)

        text = '.\n'.join(sent_in_one_c)
        texts_in_cluster.append(text)


    label_in_cluster = get_label(vectors_in_cluster, sentences_in_cluster, texts_in_cluster, method='tfidf')

    ##########################################

    centroid_in_cluster=[]
    for vec_in_one_c in vectors_in_cluster:
        cmean = np.mean(vec_in_one_c, axis=0)
        centroid_in_cluster.append(cmean)


    ##########################################      
    # 選出與centroid平均距離最小的前幾個clusters
    distance_in_cluster =np.zeros(n_cluster)
    for cidx, vec_in_one_c in enumerate(vectors_in_cluster):
        cmean = centroid_in_cluster[cidx]
        n = vec_in_one_c.shape[0]
        distance = 0
        for i in range(n):
    #     sim =cosine_similarity(vec_in_one_c[i,:].reshape(-1, 1), cmean.reshape(-1, 1))
            v = vec_in_one_c[i,:]
            dist = norm(v-cmean)
            distance += dist
        distance_in_cluster[c] = (distance/n)

    sort_cidx_by_dist = np.argsort(distance_in_cluster)
    cidx_chosen = sort_cidx_by_dist[:choose_n_cluster]

    all_c = np.array(all_c)
    all_c = all_c.take(cidx_chosen)
    cluster_of_sent = [lab if lab in all_c else -1 for lab in labels ]
    labels_chosen = [ label_in_cluster[cidx] for cidx in cidx_chosen ]


    return cluster_of_sent, all_c, labels_chosen    

In [11]:
#load files

with open('data/mid_to_reviews_in_sents_vector.bin', 'rb') as fin:
    movie_review_vectors = pickle.load(fin)

with open('data/mid_to_reviews_in_sents_token.json', 'r') as fin:
    movie_review_tokens = json.load(fin)

In [42]:
#single movie check
movie_id = '6467'
vectors_allr = movie_review_vectors[movie_id]
vectors= list(chain(*vectors_allr))

sentences_allr = movie_review_tokens[movie_id]
sentences= list(chain(*sentences_allr))

# print(sentences[:3])
# print(vectors[:3])

sent_count = len(vectors)
n_cluster = int(sent_count/100)
cluster_of_sent, all_c, labels = cluster_sentences(vectors, sentences, n_cluster, choose_n_cluster=8, method='kmeans')
print('\n'.join(labels))
print(all_c)

非常 好看
劇情 非常
很 容易
風格 畫風
電影院 看
感謝 妳
值得 推薦
小 戀愛 聽
[ 0 14 13 12 11 10  9 15]


In [54]:
#all movies
mid_to_cidofsent = defaultdict(list)
mid_to_tags = {}
finish = 0
for movie_id in movie_review_vectors.keys():

    ##### cluster and get labels
    vectors_allr = movie_review_vectors[movie_id]
    vectors= list(chain(*vectors_allr))

    sentences_allr = movie_review_tokens[movie_id]
    sentences= list(chain(*sentences_allr))

    sent_count = len(vectors)
    n_cluster = int(sent_count/100)
    cluster_of_sent, all_c, labels = cluster_sentences(vectors, sentences, n_cluster, choose_n_cluster=8, method='kmeans')


    ##### mapping 
    i=0
    for review in movie_review_vectors[movie_id]:
        cidofsent = []
        for sent in review:
            cidofsent.append(cluster_of_sent[i])
            i+=1
        mid_to_cidofsent[movie_id].append(cidofsent)
       
    mid_to_tags[movie_id] = list(zip(labels,all_c))
    
    finish+=1
    if finish%10==0:
        print(finish)


10
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430


In [56]:
print(mid_to_tags)

{'4334': [('值得 看', 0), ('很 好看', 16), ('感動 我的', 15), ('很 好聽', 14), ('看 一次', 13), ('革命 群眾', 12), ('dvd 一定', 11), ('看 歌劇', 10)], '5722': [('看 完', 0), ('很 恐怖', 1), ('演員 演技', 2), ('很 不錯', 3), ('好 劇情', 4), ('台灣 鬼片', 5), ('點就 好 值得', 6), ('很 好看', 7)], '3722': [('靈魂 中尉', 0), ('最後 分鐘', 14), ('好 吧', 13), ('特別 不是', 12), ('火車 爆炸', 11), ('不錯 看', 10), ('史蒂文斯 上尉', 9), ('簡稱 號', 15)], '5596': [('看 一次', 0), ('風在 前', 1), ('好看 超', 2), ('熱血 很', 3), ('非常 好看', 4), ('很 勵志', 5), ('台灣 拍', 6), ('無懼 風在 前', 7)], '2908': [('電影院 看', 0), ('它 恐怖', 1), ('最後 一幕', 2), ('惡靈 復仇', 3), ('很 恐怖', 4), ('還 不錯', 5), ('亂 拍', 6), ('結局 很', 7)], '3058': [('算 白', 0), ('新竹 彰化 嘉義', 15), ('數位 單眼', 14), ('看 完', 13), ('這部 片', 12), ('還 不錯', 11), ('一個 華人 去演', 10), ('一定 去', 9)], '2851': [('買 小說', 0), ('期待 第二集', 26), ('看過 小說', 27), ('一定 去', 28), ('還 不錯', 29), ('看 小說', 30), ('看 第二次', 31), ('看 完 電影', 32)], '3616': [('整部 片', 0), ('很 好笑', 1), ('很 好', 2), ('查克 葛里芬', 3), ('不好 笑', 4)], '2703': [('看 首映', 0), ('這部 片', 1), ('整部 片', 2), ('情侶 去', 3), ('鼓舞人

In [59]:

with open('data/mid_to_cidofsent.bin','wb') as fout:
    pickle.dump(dict(mid_to_cidofsent), fout)
    
with open('data/mid_to_tags.bin','wb') as fout:
    pickle.dump(mid_to_tags, fout)
    
# print(mid_to_cidofsent)
# print(mid_to_tags)