In [5]:
from procDataSet import TrainingQuery
from bert_serving.client import BertClient
from multiprocessing import Pool
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim  as optim
import argparse
from keras.utils import to_categorical
import pandas as pd
import numpy as np
from torch.functional import F
import os
import jieba
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import clear_output

jieba.load_userdict(os.path.join('data', 'dict.txt.big'))
[jieba.add_word(i, freq=None, tag=None) for i in ['不支持','文林苑', '都更案','十八趴']]

[None, None, None, None]

In [6]:
top_num = 300

In [7]:
with open('data/stop_word.txt') as file:
    data = file.read()
    
stop_words = data.split('\n')
stop_words += ['「', '」', '，', '\n', '）', '（', ')', '(']

In [8]:
from sklearn.model_selection import train_test_split
folder = 'data/'
raw_training_data = pd.read_csv(os.path.join(folder,'TD.csv'))
news_urls = pd.read_csv(os.path.join(folder,'NC_1.csv'))
contents = pd.read_json(os.path.join(folder,'url2content.json'), typ=pd.Series)

## sort the contents by index
keys, content_list = contents.keys(), contents.values

In [9]:
import multiprocessing

NumberCPU = multiprocessing.cpu_count()
jieba.initialize()

def jbcut(x):
    if x is not None:
        sen = jieba.lcut(x, cut_all=False)
        sen = [i for i in sen if i not in stop_words]
        return sen
    else:
        return None
    
pool = multiprocessing.Pool(processes=NumberCPU)
sentenece_arr = pool.map(jbcut,content_list)
pool.close()
pool.join()

In [10]:
from sklearn.model_selection import train_test_split
batch_size = 100000
folder = 'news_data_1/'
test_query = np.array(pd.read_csv('./data/QS_1.csv').Query)

## word2vec model

In [382]:
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora.dictionary import Dictionary

def create_dictionaries(p_model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    w2vec = {word: w2v_model.wv[word] for word in w2indx.keys()}
    return w2indx, w2vec

cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=20,
                     size=100,
                     workers=cores-1)

w2v_model.build_vocab(sentenece_arr, progress_per=10000)
w2v_model.train(sentenece_arr, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)
index_dict, word_vectors= create_dictionaries(w2v_model.wv)

In [395]:
w2v_model.wv.similar_by_word('十八趴')[0]

('優存', 0.8194789290428162)

## okapi model

In [13]:
from gensim.summarization import bm25
bm25Model = bm25.BM25(sentenece_arr)

In [15]:
stands = ['支持', '應該', '反對', '贊成', '不'] 

In [459]:
train_query = list(set(raw_training_data.Query))

y_train = []
y_index = {}
for i  in train_query:
    index = np.where(raw_training_data.Query==i)
    data = raw_training_data.iloc[index]
    y = dict(zip(data.News_Index,data.Relevance))
    y_idx = [ (int(idx.split('_')[1])-1, rel )  for idx, rel in zip(data.News_Index, data.Relevance)]
    y_train.append(y)
    y_index[i] = [y_idx]

In [448]:
common_query = set(train_query) & set(test_query)
common_query = list(common_query)
common = [ train_query.index(i) for i in common_query]

In [445]:
[ len(list(y_train[i].values())) for i in common]

[139, 40, 230, 95, 230]

In [479]:
[ np.sum(list(y_train[i].values())) / 300 for i in range(20)]

[0.6933333333333334,
 0.2833333333333333,
 1.4466666666666668,
 2.296666666666667,
 2.6466666666666665,
 0.14,
 0.7066666666666667,
 1.3,
 2.7866666666666666,
 0.9433333333333334,
 0.23,
 0.21333333333333335,
 0.5766666666666667,
 1.3233333333333333,
 0.77,
 2.1933333333333334,
 0.24333333333333335,
 1.0533333333333332,
 1.3733333333333333,
 1.35]

In [437]:
[ np.sum(list(y_train[i].values())) / 300 for i in common]

[0.6933333333333334, 0.14, 0.9433333333333334, 0.23, 1.35]

In [496]:
total_scores = list()
for test_id , text_q in enumerate(train_query):
    text = jieba.lcut(text_q)
    text = [ t for t in text if t not in stop_words]
    scores = bm25Model.get_scores(text)
    top_query = np.argsort(scores)[::-1][:100]
    
    print(test_id ,text_q, text)
    print("top query num {}".format(len(top_query)))
    
#     print(len(text))
    sim_words = []
    for i in (text):
        if i in index_dict:
            sim_words += [p[0] for p in w2v_model.wv.similar_by_word(i)[:10]] 

    text += sim_words
#     print(len(text))
    
    for query in top_query:
        all_words = [ (sentenece_arr[query][cnt], bm25Model.get_score(sentenece_arr[query], cnt))
                     for cnt, i in enumerate(sentenece_arr[query])]

        all_words = sorted(all_words,key=lambda x:(x[1]))[::-1]
        top_words = all_words[:20]
        top_words = [x[0] for x in top_words]
        text += top_words
    
    scores = bm25Model.get_scores(text)
    for idx, rel in (y_index[text_q][0]):
        scores[idx] += np.mean(scores) * idx
        
    keys = pd.DataFrame(np.argsort(scores)[::-1][:top_num])
    ans = keys[0].apply(lambda x: 'news_{:06d}'.format(x+1))
    
    validation_score = 0
    for a in ans:
        if a in y_train[test_id].keys():
            validation_score += y_train[test_id][a]
            
    total_scores.append(validation_score/top_num)
    print("{} / {} ".format(validation_score,(np.sum(list(y_train[test_id].values())))))

0 反對旺旺中時併購中嘉 ['反對', '旺旺', '中時', '併購', '中嘉']
top query num 100
208 / 208 
1 贊成文林苑都更案可依法拆除王家 ['贊成', '文林苑', '都更案', '依法', '拆除', '王家']
top query num 100
85 / 85 
2 贊成流浪動物零撲殺 ['贊成', '流浪', '動物', '零', '撲殺']
top query num 100
434 / 434 
3 贊同課綱微調 ['贊同', '課綱', '微調']
top query num 100


KeyboardInterrupt: 

$${\displaystyle k_{1}\in [1.2,2.0]}, {\displaystyle b=0.75}$$

In [509]:
total_scores = list()
for test_id , text_q in enumerate(test_query):
    text = jieba.lcut(text_q)
    text = [ t for t in text if t not in stop_words]
    scores = bm25Model.get_scores(text)
    top_query = np.argsort(scores)[::-1][:100]
    
    print(test_id ,text_q, text)
    print("top query num {}".format(len(top_query)))
    
    for query in top_query:
        all_words = [ (sentenece_arr[query][cnt], bm25Model.get_score(sentenece_arr[query], cnt))
                     for cnt, i in enumerate(sentenece_arr[query])]

        all_words = sorted(all_words,key=lambda x:(x[1]))[::-1]
        top_words = all_words[:30]
        top_words = [x[0] for x in top_words]
        text += top_words
    
    scores = bm25Model.get_scores(text)
    delta = np.max(scores) 
    if text_q in y_index.keys():
        for idx, rel in (y_index[text_q][0]):
            scores[idx] += delta * idx
            
    total_scores.append(scores)

0 通姦在刑法上應該除罪化 ['通姦', '刑法', '應該', '除罪', '化']
top query num 100
1 應該取消機車強制二段式左轉(待轉) ['應該', '取消', '機車', '強制', '二段式', '左轉', '待轉']
top query num 100
2 支持博弈特區在台灣合法化 ['支持', '博弈', '特區', '台灣', '合法化']
top query num 100
3 中華航空空服員罷工是合理的 ['中華', '航空', '空服員', '罷工', '合理']
top query num 100
4 性交易應該合法化 ['性交易', '應該', '合法化']
top query num 100
5 ECFA早收清單可（有）達到其預期成效 ['ECFA', '早收', '清單', '達到', '預期', '成效']
top query num 100
6 應該減免證所稅 ['應該', '減', '免證', '所稅']
top query num 100
7 贊成中油在觀塘興建第三天然氣接收站 ['贊成', '中油', '觀塘', '興建', '第三', '天然氣', '接收站']
top query num 100
8 支持中國學生納入健保 ['支持', '中國', '學生', '納入', '健保']
top query num 100
9 支持臺灣中小學（含高職、專科）服儀規定（含髮、襪、鞋）給予學生自主 ['支持', '臺灣', '中小學', '含', '高職', '專科', '服儀', '規定', '含', '髮', '襪', '鞋', '給予', '學生', '自主']
top query num 100
10 不支持使用加密貨幣 ['不支持', '使用', '加密', '貨幣']
top query num 100
11 不支持學雜費調漲 ['不支持', '學雜費', '調漲']
top query num 100
12 同意政府舉債發展前瞻建設計畫 ['同意', '政府', '舉債', '發展', '前瞻', '建', '設計', '畫']
top query num 100
13 支持電競列入體育競技 ['支持', '電競', '列入', '體育競技']
top query num 100
14 反對台鐵東

## Test the model

In [510]:
top_num = 300
search_result = np.zeros((20,top_num))
for cnt,i in enumerate(total_scores):
    keys = np.argsort(i)[::-1][:top_num]
    search_result[cnt] += keys
    
search_result = search_result.astype(np.int)

df = pd.DataFrame()
df['Query_Index'] = ['q_{:02d}'.format(i+1) for i in range(20)]

for i in range(top_num):
    df['Rank_{:03d}'.format(i+1)] = search_result[:, i] + 1

for i, row in df.iterrows():
    df.iloc[i, 1:] = df.iloc[i, 1:].apply(lambda x: 'news_{:06d}'.format(x))

In [499]:
import os
sorted(os.listdir('output/'))

['simple.csv',
 'simple10.csv',
 'simple11.csv',
 'simple12.csv',
 'simple13.csv',
 'simple14.csv',
 'simple15.csv',
 'simple2.csv',
 'simple3.csv',
 'simple4.csv',
 'simple5.csv',
 'simple8.csv',
 'simple9.csv',
 'top2000.csv']

In [511]:
fname = 'simple18.csv'
df.to_csv('output/' + fname, index=False)

In [504]:
i = 0
print(test_query[i])
idx = search_result[i][0]
print(idx)
content_list[idx]

通姦在刑法上應該除罪化
54325


'【綜合報導】司改國是會議第五分組昨決議，應廢除《刑法》通姦罪，若無法馬上廢止也要修法規定不能單獨對配偶撤告。此決議昨引爆熱議，司改委員林志潔指出，除罪化能避免遭性侵者憂慮挨告通姦而不敢舉發，避免再有女作家林奕含事件；委員紀惠容也希望「女人不要為難女人」；但台灣女人連線發言人黃淑英痛批，此決議「與民眾期望背道而馳！」通姦罪涉及對家庭的承諾，「外遇的人等同對家庭背信！」\n通姦是否除罪，法務部四年前曾網路民調，百分之八十五的民眾反對廢除通姦罪，日前輕生的女作家林奕含事件，外界質疑此案可能因擔心師母提告通姦，當年才未舉發老師。而按我《刑法》規定，已婚者與人通姦，跟相姦者均最重處一年徒刑，《刑事訴訟法》另有例外規定，為維護婚姻和諧，可在通姦案中單獨對配偶撤告，僅告相姦者，也就是撤告配偶「只告小三小王條款」。\n廢除單獨撤告配偶\n昨通姦除罪化議題成為司改分組討論焦點，與會委員指南韓、日本等國家都陸續廢除通姦罪，委員、勵馨基金會理事長紀惠容指出，根據法務部統計從二○○八年至二○一五年十月間，因通姦和相姦被起訴的女性人數高達二四○九人，佔五十二％，男性則佔四十七％，而女性遭判決有罪者達五十八點四％，因大老婆常對丈夫撤告，委員、律師賴芳玉也指出，此條文「傷害所有的女性」。\n該分組認為，國內已有相關法律維護婦女遭家暴、保障夫妻財產和婚姻權益，通姦罪的存在愈趨薄弱，且單獨對配偶撤告的但書，導致女性遭處罰者高於男性，另為避免遭受性侵害者因擔心被追究通姦罪，而不敢舉發，因此決議廢除通姦罪，若無法立即廢除，也應該刪除可單獨對配偶撤告的但書規定。法務部昨表示，將研議如何修法，但也提醒目前多數民意支持通姦罪，未來如何說服立委恐是難題。\n李秀環：結婚無意義\n《蘋果》昨試訪曾爆出軌的名人看法。先前陷入毆妻卻對新歡邱惠美告白的北市議員童仲彥，昨晚表示「沒想法，歹勢」，已跟童簽字離婚的李秀環受訪則反對通姦除罪化，她表示，若除罪化，結婚已無意義，「配偶今天可以小三、明天可以跟小四在一起，那就不需要對婚姻承諾了。」至於曾有助理小三的北市議員王世堅、傳出過和熟女上摩鐵的名廚阿基師等，昨均未聯繫上，立委吳育昇也不評論此事。\n婦女新知基金會表示，一向支持通姦除罪，希望立法院盡速修法通過。兩性專家吳娟瑜也認為，除罪化是先進國家趨勢，未來女人不能再依賴別人（指法律）管老公，對此決議她表示「恭喜、支持！」但

# search_result[0]