In [1]:
import pickle
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
import numpy as np

import gensim
from gensim import corpora,models, similarities
import logging
from gensim.models import Word2Vec, Doc2Vec

import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation



# Tokenizer pattern, English stop-word list and stemmer shared by the helpers below.
words = re.compile(r"\w+",re.I)
stopword = stopwords.words('english')
stemmer = PorterStemmer()

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a 'questions' file that comes from a trusted source.
with open ('questions', 'rb') as fp:
    questionList = pickle.load(fp)

# De-duplicate and sort. Iteration order of a set of strings differs between
# interpreter runs, while the saved indexes / document tags built below refer to
# questions by list position, so the order has to be stable across sessions.
questionList = sorted(set(questionList))

# Strip only the leading 'How to ' so that the phrase is kept when it occurs
# inside a title (str.replace would remove every occurrence).
_HOW_TO_PREFIX = 'How to '
questionList = [x[len(_HOW_TO_PREFIX):] if x.startswith(_HOW_TO_PREFIX) else x
                for x in questionList]

def save_pickle(file_name,file_save):
    """Pickle ``file_save`` into the file at ``file_name``.

    Args:
        file_name: Path of the file to (over)write; opened in binary mode.
        file_save: Any picklable object.

    The file is opened with a context manager so the handle is closed even
    when ``pickle.dump`` raises (e.g. for an unpicklable object).
    """
    with open(file_name, 'wb') as file:
        pickle.dump(file_save, file)

def token(question):
    """Split ``question`` into lower-cased word tokens without stop words.

    Tokens are the matches of the module-level ``words`` pattern. A token is
    dropped when it appears in ``stopword``.

    NOTE: the stop-word test is applied to the token as written, *before*
    lower-casing, so a capitalised stop word (e.g. "The") is kept.
    """
    # Stemming variant kept for reference (the [] is required, otherwise the for fails):
    #     lower = [stemmer.stem(i.lower()) for i in word_token if i not in stopword]
    kept = []
    for raw in words.findall(question):
        if raw in stopword:
            continue
        kept.append(raw.lower())
    return kept

def get_data_token():
    """Tokenize every entry of ``questionList``.

    Returns:
        A list of token lists, one per question and in the same order as
        ``questionList`` (so position ``i`` always refers to question ``i``).

    A question that yields no tokens contributes an empty list instead of
    raising ``IndexError``; it is still appended so that positions stay
    aligned with ``questionList``.
    """
    comparison_token = []
    for text in questionList:
        word_list = token(text)
        if word_list:
            # Trim stray whitespace on the final token.
            word_list[-1] = word_list[-1].strip()
        comparison_token.append(word_list)

    return comparison_token
    
comparison_token = get_data_token()
save_pickle('comparison_token',comparison_token)

################################## tfidf #######################################
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
dictionary = corpora.Dictionary(comparison_token)
dictionary.save("wikihow_dictionary.dic")   #save dic 

bow = dictionary.token2id
corpus = [dictionary.doc2bow(text) for text in comparison_token]

tfidf = models.TfidfModel(corpus)
tfidf.save("wikihow_TFIDF_model.mdl")    
tfidf_corpus = tfidf[corpus]
indexTfidf = similarities.MatrixSimilarity(tfidf_corpus)
indexTfidf.save("wikihow_TFIDF.idx")


# Reload the artefacts that were just written; the query helpers use
# `dictionary`, `tfidfModel` and `indexTfidf`.
dictionary = corpora.Dictionary.load("wikihow_dictionary.dic")
tfidfModel = models.TfidfModel.load("wikihow_TFIDF_model.mdl")
indexTfidf = similarities.MatrixSimilarity.load("wikihow_TFIDF.idx")
print('tfidf_model end')

################################## lsi run model ###############################
lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=50)
lsi_model.save("wikihow_Lsi50Topic.mdl")

# NOTE(review): the model is trained on the tf-idf corpus but the index is built
# from the raw bag-of-words corpus; lsi() queries with raw bag-of-words too, so
# index and queries agree with each other. Confirm this is intended.
index = similarities.MatrixSimilarity(lsi_model[corpus])
# Save the LSI index itself (previously the tf-idf index was written to this file).
index.save("wikihow_lsi.idx")
print('lsi_model end')

############################   kl-divergence    #################################
import abc
import collections
class LanguageModel(metaclass=abc.ABCMeta):
    """Abstract base for language models over a fixed vocabulary.

    Attributes are set by the constructor:
      vocab -- the vocabulary object passed in by the subclass
      order -- the n-gram order passed in by the subclass
    """

    def __init__(self, vocab, order):
        self.vocab, self.order = vocab, order

    @abc.abstractmethod
    def probability(self, word, *history):
        """Return the probability of ``word`` given the ``history`` words.

        Must be overridden; the base implementation returns ``None``.
        """
        return None

    
class CountLM(LanguageModel):
    """Language model whose probabilities are ratios of counts.

    Subclasses supply ``counts`` (numerator) and ``norm`` (denominator).
    """

    @abc.abstractmethod
    def counts(self, word_and_history):
        """Return the count stored for the tuple ``(word,) + history``."""
        return None

    @abc.abstractmethod
    def norm(self, history):
        """Return the normalizer stored for the ``history`` tuple."""
        return None

    def probability(self, word, *history):
        """Return counts / norm for ``word`` after the truncated ``history``.

        * 0.0 when ``word`` is not in the vocabulary.
        * ``1 / len(vocab)`` when the (truncated) history has a zero norm.
        Only the last ``order - 1`` history words are used; for order 1 the
        history is ignored.
        """
        if word not in self.vocab:
            return 0.0

        if self.order > 1:
            sub_history = tuple(history[-(self.order - 1):])
        else:
            sub_history = ()

        if self.norm(sub_history) == 0:
            return 1.0 / len(self.vocab)
        return self.counts((word,) + sub_history) / self.norm(sub_history)


class NGramLM(CountLM):
    """Count-based n-gram model estimated from a token sequence."""

    def __init__(self, train, order):
        """Collect n-gram counts from ``train``.

        Args:
            train: Sequence of tokens; its distinct items form the vocabulary.
            order: n-gram order (1 = unigram, 2 = bigram, ...).
        """
        super().__init__(set(train), order)
        self._counts = collections.defaultdict(float) 
        self._norm = collections.defaultdict(float)
        # The first position that has a full history of ``order - 1`` words is
        # index ``order - 1``. Starting at ``order`` (as before) skipped the
        # first n-gram, e.g. the first word of the sequence for a unigram model.
        for i in range(self.order - 1, len(train)):
            history = tuple(train[i - self.order + 1: i])
            word = train[i]
            self._counts[(word,) + history] += 1.0
            self._norm[history] += 1.0

    def counts(self, word_and_history):
        """Return the count of the ``(word,) + history`` tuple (0.0 if unseen)."""
        return self._counts[word_and_history]

    def norm(self, history):
        """Return how many times ``history`` was seen as a context (0.0 if unseen)."""
        return self._norm[history]
class InterpolatedLM(LanguageModel):
    """Linear mix of two language models.

    ``alpha`` weights ``main`` and ``1 - alpha`` weights ``backoff``; the
    vocabulary and order are taken from ``main``.
    """

    def __init__(self, main, backoff, alpha):
        super().__init__(main.vocab, main.order)
        self.main, self.backoff, self.alpha = main, backoff, alpha

    def probability(self, word, *history):
        """Return ``alpha * P_main + (1 - alpha) * P_backoff`` for ``word``."""
        main_term = self.alpha * self.main.probability(word, *history)
        backoff_term = (1.0 - self.alpha) * self.backoff.probability(word, *history)
        return main_term + backoff_term
def lm(token):
    """Build a bigram model interpolated with a unigram model from ``token``.

    The bigram model gets weight 0.6 and the unigram model 0.4.
    Note: the parameter name shadows the module-level ``token`` function
    inside this body only.
    """
    unigram_model = NGramLM(token, 1)
    bigram_model = NGramLM(token, 2)
    return InterpolatedLM(bigram_model, unigram_model, 0.6)

def KL_model(t,c): 
    """Score token list ``t`` against token list ``c``.

    A language model is built from each list; for every token occurrence in
    ``t`` whose probability is non-zero under both models the term
    ``-P(word) * log(Q(word))`` is added (P from ``t``, Q from ``c``).
    Probabilities are queried without history.

    NOTE: despite the name, the accumulated quantity is a cross-entropy style
    sum over the shared words, not ``sum P * log(P / Q)``.
    """
    model_t = lm(t)
    model_c = lm(c)
    total = 0.0
    for word in t:
        q = model_c.probability(word)
        p = model_t.probability(word)
        if q == 0 or p == 0:
            continue
        total += -(p * np.log(q))
    return total

tfidf_model end
lsi_model end


In [2]:
# Sanity check: number of questions left in questionList.
len(questionList)

121853

In [3]:
############################# w2v & d2v run model ###############################
def d2vec(sorted_d2v):
    """Print and return the questions for the first 10 entries of ``sorted_d2v``.

    Each entry must expose an ``id`` attribute that indexes ``questionList``.
    """
    print('d2v simi top:')
    top_questions = []
    for hit in sorted_d2v[:10]:
        question = questionList[hit.id]
        print(question)
        top_questions.append(question)
    return top_questions

def get_dataset():
    """Wrap every tokenized question in a tagged document for Doc2Vec.

    Returns:
        A list with one ``TaggededDocument`` per entry of ``questionList``;
        the tag of each document is its position in ``questionList``.

    NOTE: an identical ``get_dataset`` is defined again further down in this
    cell and replaces this one when the cell runs.
    A question without any tokens yields a document with an empty word list
    instead of raising ``IndexError``.
    """
    train = []
    for i, text in enumerate(questionList):
        word_list = token(text)
        if word_list:
            # Trim stray whitespace on the final token.
            word_list[-1] = word_list[-1].strip()
        document = TaggededDocument(word_list, tags=[i])
        train.append(document)

    return train

def w2v_train(token):  ## the saved model can simply be loaded in later steps
    """Train a Word2Vec model on ``token`` and save it as 'wikihow_w2vec_model'.

    Args:
        token: The training sentences handed to ``Word2Vec`` (the caller in
            this notebook passes the list of token lists).
    Returns:
        The trained model.
    """
#     token_wikihow =  pd.Series(token.tolist())
    w2v_options = dict(size=100, window=5, min_count=1, workers=4, sg=0)
    vec_wikihowall = Word2Vec(token, **w2v_options)
    vec_wikihowall.save('wikihow_w2vec_model') ## change names 
#     model_vec_wikihow = gensim.models.Word2Vec.load('wikihow_w2vec_model') 
    return vec_wikihowall

def represention(token,vec): ## token_train/test_head/body 
    """Return one averaged, count-scaled word-vector row per token list.

    For each token list, words missing from ``vec.wv.vocab`` are ignored; each
    remaining word vector is divided by that word's vocabulary count and the
    results are averaged. Lists with no known word keep an all-zero row.

    Returns:
        Array of shape ``(len(token), vec.vector_size)``.
    """
    matrix = np.zeros((len(token), vec.vector_size))
    for row, doc_tokens in enumerate(token):
        known = [t for t in doc_tokens if t in vec.wv.vocab]  # must be a list
        if not known:
            continue
        scaled = [vec.wv[t] / vec.wv.vocab[t].count for t in known]
        matrix[row, :] = np.mean(scaled, axis=0)
    return matrix

# Train Word2Vec on the tokenized questions; w2v_train also saves the model to disk.
vec_wikihowall = w2v_train(comparison_token)
# Reload the model that was just saved, replacing the in-memory instance.
vec_wikihowall = gensim.models.Word2Vec.load('wikihow_w2vec_model') 
# rep_comparison = represention(comparison_token,vec_wikihowall)
# np.savetxt('w2v_vec_wikihowall.txt',rep_comparison,delimiter=',')
print("w2v end") 

# Alias used by get_dataset(); note the spelling "Taggeded".
TaggededDocument = gensim.models.doc2vec.TaggedDocument

def d2v_train(x_train, size=200, epoch_num=1):
    """Train a Doc2Vec model on ``x_train`` and save it as 'wikihow_doc2vec_model'.

    Args:
        x_train: Tagged documents to train on.
        size: Passed to ``Doc2Vec`` as ``vector_size``.
        epoch_num: Accepted but NOT used -- the explicit ``train`` call below
            always runs with ``epochs=70``.
    Returns:
        The trained model.
    """
    doc2vec_options = dict(
        min_count=1,
        window=3,
        vector_size=size,
        sample=1e-3,
        negative=5,
        workers=4,
    )
    model_dm = Doc2Vec(x_train, **doc2vec_options)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)
    model_dm.save('wikihow_doc2vec_model')

    return model_dm

def get_dataset():
    """Wrap every tokenized question in a tagged document for Doc2Vec.

    Returns:
        A list with one ``TaggededDocument`` per entry of ``questionList``;
        the tag of each document is its position in ``questionList``.

    A question without any tokens yields a document with an empty word list
    instead of raising ``IndexError``.
    """
    train = []
    for i, text in enumerate(questionList):
        word_list = token(text)
        if word_list:
            # Trim stray whitespace on the final token.
            word_list[-1] = word_list[-1].strip()
        document = TaggededDocument(word_list, tags=[i])
        train.append(document)

    return train

# d2v_token = get_dataset()
# model_dm = d2v_train(d2v_token, size=200, epoch_num=1)
# print("d2v end") 

############################# w2v & d2v run model ###############################
# Path of the Word2Vec model saved by w2v_train().
model = "wikihow_w2vec_model" 
# model_d = 'wikihow_doc2vec_model'

# Word2Vec model used by the w2v() query helper.
model_w2v = Word2Vec.load(model)
# NOTE: with the next line commented out, `model_d2v` is never defined in this
# notebook, although w2v() refers to it.
# model_d2v = Doc2Vec.load(model_d)


w2v end


## all_model: w2v, tfidf, lsi, kl ## 

In [4]:
def w2v(ques):
    """Rank every question by Word2Vec similarity to ``ques``.

    Args:
        ques: Query text.
    Returns:
        List of result objects (attributes ``id``, ``score``, ``score_d2v``,
        ``text``), one per entry of ``comparison_token``, sorted by ``score``
        in descending order. ``id`` is the position in ``comparison_token``.

    Query words missing from the Word2Vec vocabulary are reported and left out
    of the query. When either token list is empty the score is 0.0. The
    doc2vec score is only computed when a global ``model_d2v`` exists
    (its loading is commented out in this notebook); otherwise it stays 0.
    """
    class ResultInfo(object):
        def __init__(self, index, score,score_d2v, text):
            self.id = index
            self.score = score
            self.score_d2v = score_d2v
            self.text = text    

    ###############################################################################################
    query = []
    for w in token(ques):
        if w in model_w2v.wv.vocab:
            query.append(w)
        else:
            # Actually skip the word, as the message says.
            print ("input word %s not in dict. skip this turn" % w)

    # Optional doc2vec model: absent unless the user loads it explicitly.
    d2v_model = globals().get('model_d2v')

    res = []
    for index, comparison in enumerate(comparison_token):
        comparable = bool(query) and bool(comparison)
        score = model_w2v.wv.n_similarity(query, comparison) if comparable else 0.0
        if comparable and d2v_model is not None:
            score_d2v = d2v_model.docvecs.similarity_unseen_docs(d2v_model, query, comparison)
        else:
            score_d2v = 0
        res.append(ResultInfo(index, score, score_d2v," ".join(comparison)))

    sorted_w2v = sorted(res, key=lambda info: info.score, reverse=True)
    print('sorted_w2v')
#     sorted_d2v = sorted(res, key=lambda ResultInfo:ResultInfo.score_d2v, reverse=True)
#     print('sorted_d2v')       
    return sorted_w2v

def w2v_model(ques=None):
    """Return the top Word2Vec matches for ``ques`` as question strings.

    Args:
        ques: Query text. It is required; the default only keeps the old
            zero-argument signature importable.
    Raises:
        TypeError: when ``ques`` is not supplied (the previous version always
            failed with a TypeError because it called ``w2v()`` without its
            argument).
    """
    if ques is None:
        raise TypeError("w2v_model() requires the query text as 'ques'")
    sorted_w2v = w2v(ques)
    # Use the Word2Vec formatter so the result matches tfidf()/lsi()/kl()
    # (list of up to 100 questions) rather than the doc2vec printer.
    return w2vec(sorted_w2v)

def w2vec(sorted_w2v): 
    """Return the questions for the first 100 entries of ``sorted_w2v``.

    Each entry must expose an ``id`` attribute that indexes ``questionList``.
    """
    print('w2v simi top:') 
    return [questionList[hit.id] for hit in sorted_w2v[:100]]

def tfidf(ques):
    """Return up to 100 questions ranked by tf-idf similarity to ``ques``.

    The query is tokenized, converted to bag-of-words with ``dictionary``,
    weighted by ``tfidfModel`` and looked up in ``indexTfidf``; results are
    ordered by descending score and mapped back through ``questionList``.
    """
    query_bow = dictionary.doc2bow(token(ques))
    weighted_query = tfidfModel[query_bow]
    scores = indexTfidf[weighted_query]

    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
    print ("TFIDF similary Top:") 
    return [questionList[doc_id] for doc_id, _score in ranked[:100]]

def lsi(ques):
    """Return up to 100 questions ranked by LSI similarity to ``ques``.

    The query's bag-of-words is projected with ``lsi_model`` and looked up in
    the module-level ``index``; results are ordered by descending score and
    mapped back through ``questionList``.
    """
    query_bow = dictionary.doc2bow(token(ques))
    projected = lsi_model[query_bow]

    scores = index[projected]

    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
    print ("lsi similary Top:")
    return [questionList[doc_id] for doc_id, _score in ranked[:100]]

def kl(ques):
    """Return up to 100 questions ranked by ``KL_model`` score against ``ques``.

    Every entry of ``comparison_token`` is scored with
    ``KL_model(query_tokens, entry)``; results are ordered by descending score.

    Returns:
        List of question strings taken from ``questionList`` -- the same form
        that ``tfidf`` and ``lsi`` return. (Previously the joined, lower-cased
        tokens were returned, so identical questions coming from the different
        rankers never compared equal when merged.) ``comparison_token`` is
        built from ``questionList`` in order, so positions line up.
    """
    class Resultkl(object):
        def __init__(self, index, kl_score, text):
            self.id = index
            self.kl_score = kl_score
            self.text = text

    query = token(ques) 
    ###############################################################################################
    res = [
        Resultkl(index, KL_model(query, comparison), " ".join(comparison))
        for index, comparison in enumerate(comparison_token)
    ]

    sorted_kl = sorted(res, key=lambda hit: hit.kl_score, reverse=True)
    print ("kl similary Top :")
    return [questionList[hit.id] for hit in sorted_kl[:100]]

## from simi --- topic model ##

In [None]:
def simi(ques):
    """Merge the tf-idf, LSI and KL rankings and return the 6 best candidates.

    Each ranker contributes a list of questions; a question's vote is the
    number of times it occurs in the concatenated list. Candidates are sorted
    by vote (descending); ties keep first-occurrence order. The number of
    distinct candidates is printed.
    """
    from collections import Counter

    TFIDF = tfidf(ques)
    LSI = lsi(ques)
    KL = kl(ques)
#     W2V = w2v_model()
    # Counter tallies in one pass (the old list.count loop was quadratic) and
    # keeps first-occurrence order; the stable sort preserves it for ties.
    votes = Counter(TFIDF + LSI + KL)
    ranked = sorted(votes.items(), key=lambda d: d[1], reverse=True)
    c = [question for question, _count in ranked]
    print(len(c))
    return c[:6]

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print each topic's top words and return each topic's top documents.

    Args:
        H: Iterable of per-topic weight arrays over the features.
        W: 2-D array indexed as ``W[:, topic_idx]`` giving per-document weights.
        feature_names: Names indexed by feature position.
        documents: Documents indexed by row position of ``W``.
        no_top_words: How many highest-weight words to print per topic.
        no_top_documents: How many highest-weight documents to keep per topic.
    Returns:
        A list (one item per topic, in topic order) of document lists.
    """
    docs_by_topic = {}
    for topic_idx, topic in enumerate(H):
#         print ("Topic %d:" % (topic_idx))
        top_word_ids = topic.argsort()[:-no_top_words - 1:-1]
        print (" ".join([feature_names[i] for i in top_word_ids]))
        top_doc_indices = np.argsort(W[:,topic_idx] )[::-1][0:no_top_documents]
        docs_by_topic[topic_idx] = [documents[doc_index] for doc_index in top_doc_indices]
    print(docs_by_topic)
    return list(docs_by_topic.values())

def topic_model(b_set):
    """Fit a 4-topic NMF model on ``b_set`` and return each topic's top documents.

    Args:
        b_set: List of document strings.
    Returns:
        The value of ``display_topics`` for the NMF factors: one list per
        topic holding its 2 top documents (the top 10 words are printed).

    Changes from the previous version: ``transform`` was called with an
    undefined ``**params`` (NameError); an LDA model was also fitted here but
    its result was never used, so that work has been removed (see
    ``lda_model`` for the LDA variant).
    """
    documents = b_set

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    no_topics = 4

    # Run NMF
    nmf_model = NMF(n_components=no_topics, random_state=None, alpha=.1, l1_ratio=.5, init='nndsvd',shuffle=False).fit(tfidf_matrix)
    nmf_W = nmf_model.transform(tfidf_matrix)
    nmf_H = nmf_model.components_
#     n_components=None, init=None, solver=’cd’, beta_loss=’frobenius’, tol=0.0001, max_iter=200, random_state=None, alpha=0.0, l1_ratio=0.0, verbose=0, shuffle=False

    no_top_words = 10
    no_top_documents = 2
    return display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, 
                   no_top_words, no_top_documents)

def lda_model(b_set):
    """Fit a 4-topic LDA model on ``b_set`` and return each topic's top document.

    Args:
        b_set: List of document strings.
    Returns:
        The value of ``display_topics`` for the LDA factors: one list per
        topic holding its single top document (the top 10 words are printed).
    """
    documents = b_set

    # LDA can only use raw term counts (how often each term occurs in the training text).
    count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    term_counts = count_vectorizer.fit_transform(documents)
    term_names = count_vectorizer.get_feature_names()

    no_topics = 4

    # Run LDA
    fitted = LatentDirichletAllocation(
        n_components=no_topics,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    ).fit(term_counts)
    doc_topic = fitted.transform(term_counts)
    topic_term = fitted.components_

    no_top_words = 10
    no_top_documents = 1
    return display_topics(topic_term, doc_topic, term_names, documents,
                          no_top_words, no_top_documents)

## Final model 1 - find similar questions (最终模型1 - 找相似问题) ##

In [6]:
def find_sim(ques):
    """Return the LDA-selected questions among the merged candidates for ``ques``.

    Candidates come from ``simi``; ``lda_model`` picks the top document per topic.
    """
#     nmf_questions = topic_model(b_set)
    candidates = simi(ques)
    return lda_model(candidates)

## Final model 2 - summarize the answer (最终模型2 - 归纳答案) ##

In [7]:
import urllib
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from urllib.request import urlopen

import pickle
import re
import random
from random import choice

from urllib.parse import quote

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a 'questions' file that comes from a trusted source.
with open('questions','rb') as fp:
    comparison = pickle.load(fp)
    comparisons = list(set(comparison))

# Pick one question at random and turn its title into a wikiHow URL path.
target = (choice(comparisons))

# Strip only the leading 'How to ' (str.replace would also remove the phrase
# from the middle of a title and produce a wrong article path).
how_to_prefix = 'How to '
target2 = target[len(how_to_prefix):] if target.startswith(how_to_prefix) else target
Target = target2.replace(" ", "-")
print (target) 

base_url = 'https://www.wikihow.com/'
# Percent-encode the path: titles may contain ':', '?', quotes or non-ASCII
# characters that would otherwise change how the URL is joined or requested.
url= urljoin(base_url,quote(Target))
print(url)
# Close the HTTP response as soon as the body has been read.
with urlopen(url) as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
# print(html)
##############################################################
# Collect the part headings: <span class="mw-headline"> inside <h3> elements.
all_href = soup.find_all('h3')
# print (all_href)
count = 0
partTitle = []
for l in all_href:
#     print('\n', all_href)
    jan = l.find_all('span', {"class": 'mw-headline'})
    for d in jan:
        count += 1
#         print("part ", count,": ", d.get_text())
        partTitle.append(d.get_text())
##############################################################
if partTitle != []:
    # Article with parts: print each part title followed by its bold step texts.
    href = soup.find_all(class_ = 'section_text')
    count = 1
    length = len(partTitle)
    for l in href:
        if(count == length+1):
            # No more part titles to pair with the remaining sections.
            break
        print("\nPart " + str(count) + ": " + partTitle[count-1])
        month = l.find_all('b', {"class": "whb"})
        for m in month:
            print(m.get_text())
        count += 1
    
else:
    # Article without parts: number the bold step texts consecutively.
    href = soup.find_all(class_ = 'section_text')
    count = 0
    for l in href:
        month = l.find_all('b', {"class": "whb"})
        for m in month:
            count += 1
            print("step ", count,": ", m.get_text())

How to Cope in a High Pressure Customer Service Role
https://www.wikihow.com/Cope-in-a-High-Pressure-Customer-Service-Role
step  1 :  Try to keep your cool when dealing with an irate customer.
step  2 :  Set achievable targets.
step  3 :  Don't let other staff members get you down.
step  4 :  Leave your work behind at the end of the day.


## tkinter ##

In [None]:
import tkinter as tk
from tkinter import scrolledtext

def find_ques(ques=""):
    """Button callback: look up questions similar to the text in the entry box.

    The ``ques`` argument is ignored; the query is always read from
    ``query_entry``. The answer box is cleared and the list box is refilled
    with the result of ``find_sim``.
    """
    ques = query_entry.get()
    ans_text.delete(1.0, tk.END)

    related = find_sim(ques)
    lb.delete(0, tk.END)
    for related_question in related:
        lb.insert(tk.END, related_question)

def get_ans(target): 
    """Fetch the wikiHow article for ``target`` and summarize its steps.

    Args:
        target: Question title without the 'How to ' prefix; spaces are
            replaced by '-' to form the article path.
    Returns:
        ``(url, text)`` where ``text`` lists the bold step texts found in the
        page, one per line -- grouped under "Part N: <title>" headings when
        the page has part headings, otherwise numbered "step N: ...".
        ``None`` when the page cannot be fetched (the error is printed).
    """
    from urllib.error import URLError
    from urllib.parse import quote

    Target = str(target).replace(" ", "-")

    base_url = 'https://www.wikihow.com/'
    # Percent-encode the path: titles may contain ':', '?', quotes or non-ASCII
    # characters that would otherwise change how the URL is joined or requested.
    url = urljoin(base_url, quote(Target))
    try:
        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
    except URLError as err:  # also covers HTTPError (e.g. 404 for an unknown title)
        print("could not fetch %s: %s" % (url, err))
        return None
    # soup = BeautifulSoup(html, features='lxml')
    soup = BeautifulSoup(html, "html.parser")

    ##############################################################
    # Part headings: <span class="mw-headline"> inside <h3> elements.
    partTitle = [
        span.get_text()
        for heading in soup.find_all('h3')
        for span in heading.find_all('span', {"class": 'mw-headline'})
    ]
    ##############################################################
    sections = soup.find_all(class_ = 'section_text')
    lines = []
    if partTitle:
        # Pair each part title with the section at the same position; stops at
        # the shorter of the two, like the original count/break loop.
        for count, (title, section) in enumerate(zip(partTitle, sections), 1):
            if lines:
                lines.append("")  # blank line between parts
            lines.append("Part " + str(count) + ": " + title)
            for m in section.find_all('b', {"class": "whb"}):
                lines.append(m.get_text())
    else:
        count = 0
        for section in sections:
            for m in section.find_all('b', {"class": "whb"}):
                count += 1
                lines.append("step " + str(count) + ": " + m.get_text())
    # One entry per line (previously the pieces were concatenated with no
    # separator, so consecutive steps ran together).
    return url, "\n".join(lines)
    
def show_ans(event):
    """List-box callback: show the answer for the currently selected question.

    Copies the selected question into the query entry, fetches its answer with
    ``get_ans`` and writes "<url>\\n<steps>" into the answer box. Nothing
    happens when no item is selected; the answer box is left empty when
    ``get_ans`` returns ``None``.
    """
    selection = lb.curselection()
    if not selection:
        return
    sim_q = lb.get(selection[0])

    query_entry.delete(0, tk.END)
    query_entry.insert(tk.END, sim_q)

    answer = get_ans(sim_q)
    ans_text.delete("1.0", tk.END)
    if answer is not None:
        # get_ans returns (url, text); inserting the tuple itself would show
        # Tcl's list rendering of it instead of readable text.
        url, steps = answer
        ans_text.insert(tk.END, url + "\n" + steps)

# def find_sim(ques):
#     b_set = simi(ques) 
# #     nmf_questions = topic_model(b_set)
#     lda_questions = lda_model(b_set)
# #     a = nmf_questions,lda_questions
#     return lda_questions

def find_sim(ques):
    """Return at most the first 6 merged candidates from ``simi`` for ``ques``.

    This definition replaces the earlier LDA-based ``find_sim`` once this cell runs.
    """
    candidates = simi(ques)
    top_six = candidates[:6]
    return top_six
    
# ---- Main window -------------------------------------------------------------
root=tk.Tk()
root.geometry('600x600')
root.title("Related 'How to' questions and answer finder")

# Row 0: query label, entry and "Find" button.
tk.Label(root,text="Enter your Query：",font=("Arial Bold", 15)).grid(row=0,column=0)

query_entry=tk.Entry(root,width=40)
query_entry.grid(row=0,column=1)

find_bt=tk.Button(root,text="Find",bg="red",command=find_ques)
find_bt.grid(row=0,column=2)

# Row 1: separator line.
canv=tk.Canvas(root,height=15,width=370)
canv.create_line(5,5,350,5,fill="blue")
canv.grid(row=1,columnspan=3)

# Row 7: answer box (created here because the callbacks refer to `ans_text`).
ans_text=scrolledtext.ScrolledText(root,height=20,width=80,bd=2,bg="PaleGoldenrod")

# ans_text.insert(tk.END,txt)
ans_text.grid(padx=5,row=7,columnspan=3)

# Row 5: second separator line.
canv=tk.Canvas(root,height=15,width= 370)
canv.create_line(5,5,350,5,fill="blue")
canv.grid(row=5,columnspan=3)

tk.Label(root,
         text='Related Questions box:',
         font=("Arial Bold", 12)).grid(row=2,column=0)

tk.Label(root,
         text='Double Click the related questions, the answer will display in the "Answer box" ',
         font=("bold italic", 12)).grid(row=4,columnspan=3)
tk.Label(root,
         text='Answer box:',
         font=("Arial Bold", 12)).grid(row=6,column=0)

# Row 3: list of related questions.
lb=tk.Listbox(root,height=8,width=60)
lb.grid(row=3,columnspan=3)
# Bind the double click promised by the label above. A plain <Button-1> binding
# runs before the list box updates its selection, so show_ans() saw the
# previously selected item (or none) on the first click.
lb.bind("<Double-Button-1>",show_ans)

root.mainloop()


TFIDF similary Top:
lsi similary Top:
kl similary Top :
271
TFIDF similary Top:
lsi similary Top:
kl similary Top :
271
TFIDF similary Top:
lsi similary Top:
kl similary Top :
291
TFIDF similary Top:
lsi similary Top:
kl similary Top :
285
