In [50]:
#encoding = utf8
import jieba
from scipy import spatial
from gensim.models.keyedvectors import KeyedVectors

def load_word2vec(vec_path = '/home/gwang3/workspace/chatbot/chinese_word2vec/cn.skipgram.bin'):
    return KeyedVectors.load_word2vec_format('/home/gwang3/workspace/chatbot/chinese_word2vec/cn.skipgram.bin', binary=True, unicode_errors='ignore')

def load_core_words(model, core_word_path = 'elec_core_word'):
    core_word_list = []
    infile = open(core_word_path)
    for line in infile:
        line = line.strip().decode('utf-8')
        if line in model:
            core_word_list.append(line)
    return core_word_list


#Core function to calculat sentence vector
def cal_sen_vec(sentence, core_word_list, model, debug=False):
    seg_org_list = jieba.cut(sentence)
    seg_list = []
    for sen_seg in seg_org_list:
        sen_seg = sen_seg.strip()
        if sen_seg == '':
            continue
        seg_list.append(sen_seg)
    if debug:
        print seg_list
        
    vec = []
    for n, item in enumerate(core_word_list):
        #print n
        max_simi = 0
        for sen_seg in seg_list:
            simi_tmp = 0
            try:
                simi_tmp = model.similarity(item, sen_seg)
            except Exception as e:
                #print e
                simi_tmp = 0
            if simi_tmp > max_simi:
                max_simi = simi_tmp
        vec.append(max_simi)
    if debug:
        print vec
    return vec


#Core QA Dic Format:
#core_qa_dic[question][0] = question_vector
#core_qa_dic[question][1] = answer
def load_core_qa(core_word_list, model, core_qa_path = 'elec_core_qa'):
    core_qa_dic = {}
    infile = open(core_qa_path)
    for line in infile:
        line = line.strip().split('|')
        question = line[0].strip().decode('utf-8')
        answer = line[1].strip().decode('utf-8')
        core_qa_dic[question] = []
        core_qa_dic[question].append(cal_sen_vec(question, core_word_list, model))
        core_qa_dic[question].append(answer)
    return core_qa_dic
    

def cal_cos_simi(vec1, vec2):
    return (1 - spatial.distance.cosine(vec1, vec2))

def find_best_answer(question, core_qa_dic, core_word_list, model):
    q_vec = cal_sen_vec(sentence = question, core_word_list = core_word_list, model = model, debug = False)
    #print q_vec
    max_simi = 0
    best_q = ''
    for item in core_qa_dic:
        tmp_simi = cal_cos_simi(q_vec, core_qa_dic[item][0])
        if tmp_simi > max_simi:
            max_simi = tmp_simi
            best_q = item
    print "Best Matched Question is:"
    print best_q
    print "Similarity is:"
    print max_simi
    return core_qa_dic[best_q][1]

In [None]:
model = load_word2vec()
core_word_list = load_core_words(model = model)
print len(core_word_list)
core_qa_dic = load_core_qa(core_word_list = core_word_list, model = model)

1769


In [49]:
user_question = u'我触电了怎么办'
answer = find_best_answer(user_question, core_qa_dic, core_word_list, model)
print answer

[u'\u6211', u'\u89e6\u7535', u'\u4e86', u'\u600e\u4e48\u529e']
[0.25574871228254303, 0.38151570784233357, 0.40472357315229746, 0.28122452895143635, 0.30144551534750541, 0.32420933603018287, 0.58880984478499931, 0.3562708796126009, 0.49575076946908103, 0.3490684364089176, 0.33453548081677043, 0.39414809496298242, 0.43062547794683081, 0.35768025963421962, 0.34897235182947423, 0.46656029194678428, 0.28750640892855456, 0.4542428598902043, 0.30655361660357894, 0.23429308071447874, 0.29693201199136143, 0.31809005383912592, 0.4422907621187222, 0.3031029310573643, 0.2873173980499466, 0.27816663747687986, 0.29444352455632206, 0.42595369536139399, 0.36699836920027029, 0.26327347040845228, 0.55443402933244279, 0.50852227197472966, 0.28675291388344593, 0.32899216167932144, 0.43915823693248857, 0.37183392254173109, 0.50796485370439581, 0.46547280293212745, 0.51581648893854082, 0.29895615152310395, 0.41123912479669861, 0.32954585384300128, 0.41511218773629543, 0.30545062993568373, 0.3569751244488662

KeyError: ''

In [19]:
sentence = u'我触电了怎么办'
seg_list = jieba.cut(sentence)
for item in seg_list:
    print item
    max_sim = 0
    closest_word = ''
    for itemm in core_word_list:
        similarity = model.similarity(itemm, item)
        if similarity > max_sim:
            max_sim = similarity
            closest_word = itemm
            
    print 'Closet_Word is'
    print closest_word
    print max_sim

我
Closet_Word is
我
1.0
触电
Closet_Word is
触电
1.0
了
Closet_Word is
已经
0.800819047499
怎么办
Closet_Word is
怎么办
1.0


In [14]:
word = u'触电'
for item in core_word_list:
    similarity = model.similarity(item, word)
    if similarity != 0:
        print item
        print similarity
    

统建
0.156730928336
商户
0.152544789149
换位
0.311919129935
专用
0.154569599764
燃油
0.167567170808
孕妇
0.182622394102
能否
0.299589113548
到场
0.286460882652
合理
0.182488144045
中心
0.139131915897
上限
0.170338573684
历史
0.161418663442
城市
0.141136000944
结算
0.183843742973
用于
0.131749466008
聊
0.159923967833
租赁
0.182871033558
内容
0.217691495841
试验
0.252393775291
电价
0.234293080714
代办
0.1600767935
大修
0.200676662607
停电
0.442290762119
准确度
0.190946554092
自助银行
0.172002377011
银联
0.130133532919
新能源
0.190428823103
电视
0.375402064007
信件
0.143491492863
大鹏
0.192748611633
抄表员
0.285651177052
配电箱
0.508522271975
广东
0.239877710198
输出
0.308765222369
着火
0.439158236932
养老院
0.2088124966
成本
0.206485733037
核心
0.189234225628
电流互感器
0.515816488939
材质
0.217178012637
感应
0.411239124797
收费
0.104558795174
提交
0.131939352299
容量
0.212736486126
白天
0.214163837389
登录
0.166000371756
隔音
0.214057739197
对账
0.130105566172
体验
0.270589255435
全天
0.163447989051
暂时
0.28607482854
分开
0.264319284443
付费
0.186174213243
转化
0.187102488786
入伙
0.125444182051
工作证
0.

In [None]:
core_qa_dic = load_core_qa(core_word_list = core_word_list, model = model)

In [19]:
cal_sen_vec(sentence = u"我触电了怎么办", core_word_list = core_word_list, model = model, debug = True)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [31]:
infile = open('elec_core_word')
for line in infile:
    print 'hahaha'
    line = line.strip()
    print line
    if line in model:
        print line
        core_word_list.append(line)
    print core_word_list

In [7]:
cal_sen_vec("今天天气不错，我准备去吃个包子。包子是大肉馅的，特别特别香！")

今天天气
不错
，
我
准备
去
吃
个
包子
。
包子
是
大
肉馅
的
，
特别
特别
香
！


In [18]:
from gensim.models.keyedvectors import KeyedVectors
model = KeyedVectors.load_word2vec_format('/home/gwang3/workspace/chatbot/chinese_word2vec/cn.skipgram.bin', binary=True, unicode_errors='ignore')

In [46]:
print model.similarity(u'电费', u'水费')

0.71662710525


In [49]:
from scipy import spatial

dataSetI = [3, 45, 7, 2]
dataSetII = [2, 54, 13, 15]
result = 1 - spatial.distance.cosine(dataSetI, dataSetII)
print result

0.972284251712


In [50]:
filename = raw_input('Enter a file name: ')

In [51]:
print filename

hahah
