# 讀取檔案與前處理

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('health_fitness.csv', engine='python')

In [33]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
from nltk.corpus import stopwords

# Start from NLTK's English stop words and add corpus-specific noise terms
# (generic crowdfunding vocabulary and product/brand names).
stop_words = stopwords.words('english')

# NOTE: every entry needs its own trailing comma. Two adjacent string literals
# are silently concatenated by Python ('using' 'time' -> 'usingtime'), which
# previously kept 'using' out of the stop-word list.
# NOTE(review): remove_stopwords() compares these against tokens produced by
# simple_preprocess; mixed-case entries (e.g. 'Stemoscope') and entries with a
# space (e.g. 'brush effects') presumably never match such tokens - confirm.
x = [
    'way', 'product', 'day', 'indiegogo', 'campaign', 'year', 'thing', 'campaigns', 'products',
    'days', 'things', 'design', 'designing', 'designs', 'years', 'low', 'world', 'people', 'production',
    'project', 'projects', 'problem', 'problems', 'come', 'comes', 'coming', 'first', 'today', 'item', 'items',
    'other', 'others', 'possible', 'possibility', 'result', 'quality', 'backer', 'backers', 'amp', 'need',
    'class', 'user', 'users', 'parts', 'part', 'friendly', 'datum', 'units', 'unit', 'using',
    'time', 'help', 'life', 'health', 'vapor_soothers', 'programs', 'catspad', 'Remedium', 'Diawater',
    'perk', 'perks', 'DIRTEA', 'Stemoscope', 'research', 'remedium', 'dirtea', 'Gentletent', 'gentletent', 'device',
    'brush effects', 'BRUSH EFFECTS', 'dirtea', 'diawater', 'WalkingPad', 'walkingpad', 'TinyMount', 'order', 'orders',
    'market', 'biosband', 'work', 'working', 'works', 'time', 'times', 'timing', 'spoontek', 'tinymount', 'xculpter',
]
stop_words.extend(x)
print(stop_words)

In [None]:
data = df.description.values.tolist()

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize every entry of ``sentences``.

    Each entry is coerced to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (per the gensim documentation
    this strips accent marks from tokens).

    Yields:
        One list of tokens per input entry, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )
        
data_words = list(sent_to_words(data))

print(data_words[:1])
print(len(data_words))

In [None]:
# Build the bigram and trigram models
# The bigram model is fitted on the tokenized documents; the trigram model is
# fitted on those same documents after the bigram model has been applied.
bigram = gensim.models.Phrases(data_words, min_count=1, threshold=60) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=50)  

# Faster way to get a sentence clubbed as a trigram/bigram
# (bigram_mod / trigram_mod are the objects used by make_bigrams / make_trigrams)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# Applies the bigram phraser, then the trigram phraser, to the first document.
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed from each document.

    Each document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess`` before filtering, so the output tokens are whatever
    that tokenizer produces from the document's string form.
    NOTE(review): simple_preprocess has its own token-length limits, so very
    long phrase tokens may be dropped at this step - confirm this is intended.

    Args:
        texts: iterable of documents (token lists in this notebook).

    Returns:
        A list with one filtered token list per document.
    """
    # ``stop_words`` is a list; build a set once so each membership test is
    # O(1) instead of a linear scan per token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the fitted bigram phraser (``bigram_mod``) to each document.

    Returns:
        A list holding the phrased version of every document in ``texts``.
    """
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod[tokens])
    return phrased

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to each document.

    Returns:
        A list holding the phrased version of every document in ``texts``.
    """
    phrased = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased.append(trigram_mod[with_bigrams])
    return phrased

char = ['NOUN','VERB','ADJ', 'ADV']

def lemmatization(texts, allowed_postags=char):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Every document is joined with single spaces, run through the global
    ``nlp`` pipeline, and reduced to the ``lemma_`` of each token whose
    ``pos_`` is in ``allowed_postags``.
    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists.
        allowed_postags: POS tags to keep (defaults to ``char``).

    Returns:
        A list with one list of lemmas per document.
    """
    lemmatized_docs = []
    for tokens in texts:
        parsed = nlp(" ".join(tokens))
        kept = []
        for tok in parsed:
            if tok.pos_ in allowed_postags:
                kept.append(tok.lemma_)
        lemmatized_docs.append(kept)
    return lemmatized_docs

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer


# Form bigrams
data_words_bigrams = make_bigrams(data_words)

# Remove stop words
data_words_nostops = remove_stopwords(data_words_bigrams)

# Initialize the spaCy 'en' model with the parser and NER components disabled
# (for efficiency).
# python3 -m spacy download en
# NOTE(review): the 'en' shortcut name is presumably only resolvable on
# spaCy 2.x; newer releases expect a full package name such as
# 'en_core_web_sm' - confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Lemmatize, keeping NOUN tokens only: the explicit allowed_postags argument
# overrides lemmatization()'s default tag list.
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN'])

# Stemming
porter = PorterStemmer()
lancaster = LancasterStemmer()  # instantiated but not used in this cell

# Porter-stem every lemma, preserving the document structure.
data_words_stem = [
    [porter.stem(word) for word in doc_lemmas]
    for doc_lemmas in data_lemmatized
]

In [None]:
print('原始文字資料')    
print(data[1])
print('  ')
print('原始文字資料的斷詞')
print(data_words[1])
print('  ')
print('form bigrim')
print(data_words_bigrams[1])
print('  ')
print('removal stopwords後的結果')
print(data_words_nostops[1])
print('  ')
print('Lemmatization後的結果')
print(data_lemmatized[1])
print('  ')
print('stemming的結果')
print(data_words_stem[1])

# 讀取字典

In [None]:
# load Dictionary
from gensim.corpora import Dictionary

# Load the previously saved gensim dictionary from its text format.
id2word = Dictionary.load_from_text(r'C:\Users\USER\Desktop\論文\LDA\test25\health_fitness_dictionary_25')

# Create Corpus
# The stemmed token lists are the documents that get vectorized.
texts = data_words_stem

# Term Document Frequency
# One bag-of-words vector per document, built with the loaded dictionary.
corpus = [id2word.doc2bow(text) for text in texts]


from gensim.models import TfidfModel
# NOTE: ``tfidf`` is only referenced by the disabled (string-quoted) filtering
# code that follows in this cell; no active code in the cell reads it.
tfidf = TfidfModel(corpus, id2word = id2word)

'''
# #filter low value words
low_value = 0.15

for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    new_bow = [b for b in bow if b[0] not in low_value_words]

#reassign        
corpus[i] = new_bow

# print(corpus[:])
# print(tfidf)
'''

print(corpus[:])

# 讀取LDA

In [None]:
from gensim.test.utils import datapath
lda = gensim.models.ldamodel.LdaModel.load('C:/Users/USER/Desktop/論文/LDA/test25/lda_model_ver_hf_25')

In [None]:
model_topics = lda.show_topics(formatted=False)
pprint(lda.print_topics(num_topics = 50, num_words=25))

In [None]:
import matplotlib.pyplot as plt 
from wordcloud import WordCloud, STOPWORDS 

# Draw one word cloud per LDA topic from the topic's top 200 terms.
for t in range(lda.num_topics):
    # show_topic returns (word, weight) pairs; turn them into a mapping
    # before handing them to WordCloud.fit_words.
    topic_terms = dict(lda.show_topic(t, 200))
    plt.figure()
    plt.imshow(WordCloud(background_color='white').fit_words(topic_terms))
    plt.axis("off")
    # Titles use 1-based topic numbers (fixes the "TopiC#" typo).
    plt.title("Topic #" + str(t + 1))
    plt.show()

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
vis

In [None]:
def format_topics_sentences(ldamodel=lda, corpus=corpus, texts=data):
    """Build a table holding the dominant topic of every document.

    Args:
        ldamodel: topic model; ``ldamodel[corpus]`` must yield, per document,
            a list of ``(topic id, proportion)`` pairs, and
            ``ldamodel.show_topic(topic id)`` must return ``(word, weight)``
            pairs.
        corpus: the vectorized documents passed to ``ldamodel``.
        texts: the original documents, appended as the last column.

    Returns:
        A DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals) and ``Topic_Keywords`` (comma-separated),
        followed by one unnamed column (label ``0``) holding ``texts``.
    """
    # Collect plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # As before, a document without any topic contributes no row.
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(
            word for word, _ in ldamodel.show_topic(topic_num)
        )
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']


pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Show
df_dominant_topic.head(810)

In [None]:
print(len(corpus))
for i in range(0, len(corpus)):
    print(i, "get_document_topics", lda.get_document_topics(corpus[i], minimum_probability=0.3))

In [None]:
# Flatten every document's topic distribution into one long list holding
# exactly lda.num_topics values per document.
# get_document_topics returns a sparse list of (topic id, probability) pairs
# and may leave out topics whose probability falls below gensim's internal
# floor, so each row is first written into a dense zero-filled vector.
# Without this, a single omitted topic would shift every later value when the
# flat list is cut into fixed-size rows.
num_topics = lda.num_topics
distribution = []
for bow in corpus:
    dense_row = np.zeros(num_topics, dtype=np.float32)
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
        dense_row[topic_id] = prob
    distribution.extend(dense_row)

print(len(distribution))

In [None]:
def chunks(lst, n):
    """Generate consecutive slices of ``lst`` that are ``n`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [None]:
# Cut the flat list into one row per campaign. The row width is taken from
# the model (lda.num_topics) instead of a hard-coded 11, so the rows stay
# aligned if a model with a different number of topics is loaded.
all_campaign_distribution = list(chunks(distribution, lda.num_topics))
print(len(all_campaign_distribution))
print(all_campaign_distribution[0])
print(all_campaign_distribution[1])
print(all_campaign_distribution[2])

In [None]:
print(all_campaign_distribution)

In [None]:
np.savetxt('topic_distribution.csv', all_campaign_distribution, delimiter = ',', fmt = '%s')

# 主題九分群

In [None]:
# Positions of the campaigns whose share of topic index 8 is at least 0.3.
# enumerate() supplies each row's own position; looking the row up with
# list.index() returned the FIRST equal row, so campaigns with identical
# distributions were all mapped to the same index (and cost O(n) per lookup).
topic_8_campaign_index = [
    position
    for position, shares in enumerate(all_campaign_distribution)
    if shares[8] >= 0.3
]

print(len(topic_8_campaign_index))

# Same positions shifted by 2.
# NOTE(review): presumably converts 0-based positions to spreadsheet row
# numbers (1-based plus a header row) - confirm.
index_excel = [position + 2 for position in topic_8_campaign_index]

print(len(index_excel))
print(index_excel)

In [None]:
# Pull out the topic-distribution rows of the selected campaigns.
topic_8_campaign = [
    all_campaign_distribution[position] for position in topic_8_campaign_index
]

print(len(topic_8_campaign))
print(topic_8_campaign[0])

# Array form of the same rows, used as clustering input.
topic_8_array = np.array(topic_8_campaign)
print(type(topic_8_array))

In [None]:
np.savetxt('topic_9_distribution.csv', topic_8_campaign, delimiter = ',', fmt = '%s')

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
kmeans = KMeans(n_clusters=2, max_iter = 800).fit(topic_8_array)
#分群結果
kmeans.labels_  
# 調高次數
# 正規化

In [None]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances
labels = kmeans.labels_
metrics.calinski_harabasz_score(topic_8_array, labels)

In [None]:
kmeans_1 = KMeans(n_clusters=3, max_iter = 800).fit(topic_8_array)
labels_1 = kmeans_1.labels_
metrics.calinski_harabasz_score(topic_8_array, labels_1)

In [None]:
kmeans_2 = KMeans(n_clusters=4, max_iter = 800).fit(topic_8_array)
labels_2 = kmeans_2.labels_
metrics.calinski_harabasz_score(topic_8_array, labels_2)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
campaign_pca = pca.fit(topic_8_array)
campaign_pca_1 = pca.transform(topic_8_array)
print("original shape:   ", topic_8_array.shape)
print("transformed shape:", campaign_pca_1.shape)
print(type(campaign_pca_1))

In [None]:
# Cluster the 2-component PCA projection into 4 groups.
kmeans_with_pca = KMeans(n_clusters=4, max_iter = 800).fit(campaign_pca_1)
# Bare expression: has no effect unless it is the last line of the cell.
kmeans_with_pca.labels_
labels_pca = kmeans_with_pca.labels_
# NOTE(review): the labels come from clustering campaign_pca_1, but the score
# is computed against topic_8_array (the un-projected data) - confirm that
# scoring in the original space is intended.
metrics.calinski_harabasz_score(topic_8_array, labels_pca )

In [None]:
labels_pca

In [None]:
plt.scatter(campaign_pca_1[:,0],campaign_pca_1[:,1],c=kmeans_with_pca.predict(campaign_pca_1))

In [None]:
import random
class KMeans:
    def cal_dist(self, p0, p1):
        """
        比較兩點的距離
        """
        return np.sqrt(np.sum((p0-p1)**2))

    def kmeans(self, datapoints, k=2):
        # 定義資料維度
        d = datapoints.shape[1]
        # 最大的迭代次數
        Max_Iterations = 1000

        cluster = np.zeros(datapoints.shape[0])
        prev_cluster = np.ones(datapoints.shape[0])

        cluster_centers = []
        for i in range(k):
            cluster_centers += [random.choice(datapoints)]

        iteration = 0
        while np.array_equal(cluster, prev_cluster) is False or iteration > Max_Iterations:
            iteration += 1
            prev_cluster = cluster.copy()

            # 將每一個點做分群
            for idx, point in enumerate(datapoints):
                min_dist = float("inf")
                for c, cluster_center in enumerate(cluster_centers):
                    dist = self.cal_dist(point, cluster_center)
                    if dist < min_dist:
                        min_dist = dist  
                        cluster[idx] = c   # 指定該點屬於哪個分群

            # 更新分群的中心
            for k in range(len(cluster_centers)):
                new_center = np.zeros(d)
                members = 0
                for point, c in zip(datapoints, cluster):
                    if c == k:
                        new_center += point
                        members += 1
                if members > 0:
                    new_center = new_center / members
                cluster_centers[k] = new_center

        return cluster

In [None]:
# Run the hand-written k-means on the topic-9 campaigns with 3 clusters.
k = 3
K = KMeans()
topic_8_cluster_result = K.kmeans(topic_8_array, k)

# cluster[c] collects the campaign indices that were assigned to cluster c.
cluster = [[] for _ in range(k)]
for row_position, label in enumerate(topic_8_cluster_result):
    campaign_index = topic_8_campaign_index[row_position]
    cluster[int(label)].append(campaign_index)

for cluster_no, members in enumerate(cluster):
    print('Cluster {}: {}'.format(cluster_no, members))

# 全部計畫主題統計

In [None]:
# Overall topic statistics: for every document, record the id of each topic
# returned by get_document_topics with minimum_probability=0.3.
count_all = [
    topic_entry[0]
    for bow in corpus
    for topic_entry in lda.get_document_topics(bow, minimum_probability=0.3)
]

print(count_all)

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_all = {
    topic + 1: hits for topic, hits in Counter(count_all).items()
}
print(topic_count_all)

In [None]:
df_topic_all = pd.DataFrame(list(topic_count_all.items()),columns = ['主題','計畫數'])
df_topic_all.sort_values(by=['主題'])

In [None]:
from pylab import mpl
mpl.rcParams["font.sans-serif"] = ["MingLiU"]

In [None]:
import matplotlib.pyplot as plt 
x = df_topic_all['主題']
y = df_topic_all['計畫數']
plt.xlabel('主題編號')
plt.ylabel('計畫數')
width = 0.35
plt.bar(x, y, width)
# plt.savefig('整體分布長條圖.png')

# 2015計畫主題統計

In [None]:
count_2015 = []
for i in range(0, 266):
    for j in lda.get_document_topics(corpus[i], minimum_probability=0.3):
            count_2015.append(j[0])
            
print(len(count_2015))

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_2015 = {
    topic + 1: hits for topic, hits in Counter(count_2015).items()
}
print(topic_count_2015)

In [None]:
hf2015 = df_dominant_topic[:266]
hf2015.tail()

In [None]:
print(len(hf2015))

In [None]:
df_topic_2015 = pd.DataFrame(list(topic_count_2015.items()),columns = ['主題','計畫數'])
df_topic_2015.sort_values(by=['主題'])

# 2016計畫主題統計

In [None]:
count_2016 = []
for i in range(266, 395):
    for j in lda.get_document_topics(corpus[i], minimum_probability=0.3):
            count_2016.append(j[0])
            
print(count_2016)

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_2016 = {
    topic + 1: hits for topic, hits in Counter(count_2016).items()
}
print(topic_count_2016)

In [None]:
hf2016 = df_dominant_topic[266:395]
hf2016.tail()

In [None]:
print(len(hf2016))

In [None]:
df_topic_2016 = pd.DataFrame(list(topic_count_2016.items()),columns = ['主題','計畫數'])
df_topic_2016.sort_values(by=['主題'])

# 2017計畫主題統計

In [None]:
count_2017 = []
for i in range(395, 466):
    for j in lda.get_document_topics(corpus[i], minimum_probability=0.3):
            count_2017.append(j[0])
            
print(count_2017)

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_2017 = {
    topic + 1: hits for topic, hits in Counter(count_2017).items()
}
print(topic_count_2017)

In [None]:
hf2017 = df_dominant_topic[395:466]
hf2017.tail()

In [None]:
print(len(hf2017))

In [None]:
df_topic_2017 = pd.DataFrame(list(topic_count_2017.items()),columns = ['主題','計畫數'])
df_topic_2017.sort_values(by=['主題'])

# 2018計畫主題統計

In [None]:
count_2018 = []
for i in range(466, 529):
    for j in lda.get_document_topics(corpus[i], minimum_probability=0.3):
            count_2018.append(j[0])
            
print(count_2018)

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_2018 = {
    topic + 1: hits for topic, hits in Counter(count_2018).items()
}
print(topic_count_2018)

In [None]:
hf2018 = df_dominant_topic[466:529]
hf2018.tail()

In [None]:
print(len(hf2018))

In [None]:
df_topic_2018 = pd.DataFrame(list(topic_count_2018.items()),columns = ['主題','計畫數'])
df_topic_2018.sort_values(by=['主題'])

# 2019計畫主題統計

In [None]:
count_2019 = []
for i in range(529, 597):
    for j in lda.get_document_topics(corpus[i], minimum_probability=0.3):
            count_2019.append(j[0])
            
print(count_2019)

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_2019 = {
    topic + 1: hits for topic, hits in Counter(count_2019).items()
}
print(topic_count_2019)

In [None]:
hf2019 = df_dominant_topic[529:597]
hf2019.tail()

In [None]:
print(len(hf2019))

In [None]:
df_topic_2019 = pd.DataFrame(list(topic_count_2019.items()),columns = ['主題','計畫數'])
df_topic_2019.sort_values(by=['主題'])

# 2020計畫主題統計

In [None]:
# Topics (probability >= 0.3) of the 2020 campaigns: document 597 through
# the end of the corpus. The end is taken from len(corpus) rather than a
# hard-coded 806 so this range matches the open-ended slice
# df_dominant_topic[597:] used for hf2020.
count_2020 = []
for i in range(597, len(corpus)):
    for j in lda.get_document_topics(corpus[i], minimum_probability=0.3):
        count_2020.append(j[0])

print(count_2020)

In [None]:
from collections import Counter

# Tally each topic in a single pass (list.count() per element was O(n^2)).
# Keys are shifted to 1-based topic numbers.
topic_count_2020 = {
    topic + 1: hits for topic, hits in Counter(count_2020).items()
}
print(topic_count_2020)

In [None]:
hf2020 = df_dominant_topic[597:]
hf2020.tail()

In [None]:
print(len(hf2020))

In [None]:
df_topic_2020 = pd.DataFrame(list(topic_count_2020.items()),columns = ['主題','計畫數'])
df_topic_2020.sort_values(by=['主題'])

# 搜尋方式 (精確....)
# 其他搜尋引擎 (google、google scholar、新聞......)
# 多個來源