In [1]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

In [2]:
def select_nodes_table(database_name, table_name, columns, conditions, params=()):
    """Run ``SELECT <columns> FROM <table_name> WHERE <conditions>`` and return all rows.

    Parameters
    ----------
    database_name : str
        Path of the SQLite database file handed to ``sqlite3.connect``.
    table_name : str
        Table to read. It is interpolated into the SQL text, so it must come
        from trusted code, never from external input.
    columns : str
        Comma-separated column list (e.g. ``'NewsIndex,NewsTitle'``); also
        interpolated into the SQL text.
    conditions : str
        Body of the WHERE clause. May contain ``?`` placeholders.
    params : sequence, optional
        Values bound to the ``?`` placeholders in ``conditions``. Binding
        values this way avoids quoting problems and SQL injection; the default
        (no parameters) keeps existing calls working unchanged.

    Returns
    -------
    list of tuple
        One tuple per matching row, in the order SQLite returns them.

    Raises
    ------
    sqlite3.Error
        Propagated from SQLite (e.g. unknown table or malformed condition).
    """
    query = 'SELECT {1} FROM {0} WHERE {2}'.format(table_name, columns, conditions)
    conn = sqlite3.connect(database_name)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        # Close the connection even when the query fails, so a bad condition
        # string does not leak an open database handle.
        conn.close()

In [3]:
def select_edges_table(database_name, table_name, columns, conditions, params=()):
    """Run ``SELECT <columns> FROM <table_name> WHERE <conditions>`` on an edge table.

    Behaves the same as ``select_nodes_table``; it is kept as a separate name
    so call sites read as "edges" versus "nodes".

    Parameters
    ----------
    database_name : str
        Path of the SQLite database file handed to ``sqlite3.connect``.
    table_name : str
        Table to read. Interpolated into the SQL text, so it must be trusted.
    columns : str
        Comma-separated column list (e.g. ``'News1,News2'``); also
        interpolated into the SQL text.
    conditions : str
        Body of the WHERE clause. May contain ``?`` placeholders.
    params : sequence, optional
        Values bound to the ``?`` placeholders in ``conditions``. Defaults to
        no parameters, which keeps existing calls working unchanged.

    Returns
    -------
    list of tuple
        One tuple per matching row, in the order SQLite returns them.

    Raises
    ------
    sqlite3.Error
        Propagated from SQLite (e.g. unknown table or malformed condition).
    """
    query = 'SELECT {1} FROM {0} WHERE {2}'.format(table_name, columns, conditions)
    conn = sqlite3.connect(database_name)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        # Always release the database handle, including on query errors.
        conn.close()

In [4]:
# Notebook configuration.
# NOTE(review): both paths are absolute and machine-specific; adjust them (or
# derive them from a configurable base directory) before running elsewhere.

# SQLite database queried for the '<MEDIA_NAME>connection' and '<MEDIA_NAME>news' tables.
NEWS_DB_PATH = '/home/u4839782/NRS/GE-NRS/DailyNews/NewsNetwork_ch.db'
# SQLite database queried for the user's browsing history ('NODES' table).
USER_DB_PATH = '/home/u4839782/NRS/GE-NRS/UserLogs/user2.db'
# Media code used as table-name prefix and file-name prefix; the URL matching
# code below only knows 'CHT' and 'LBT'.
MEDIA_NAME = 'LBT'
# Date tag embedded in the edgelist / embedding file names.
DATE = '2020-06-30'

## 1. Read DailyNewsNetwork from database

In [5]:
# Load every (News1, News2) edge with ParseDate on or after 2020-04-01 for the
# selected media; the last line displays how many edges were read.
MEDIAconnection = select_edges_table(NEWS_DB_PATH, MEDIA_NAME+'connection', 'News1,News2', "ParseDate>='2020-04-01'")
len(MEDIAconnection)

54846

## 2. Calculate *topology similarity score* and *document similarity score*

### 2-1. Graph embedding (using node2vec)

In [12]:
def output_MEDIAconnection_to_edgelist_file(MEDIAconnection):
    """Write the edge rows to the edgelist file for ``MEDIA_NAME`` / ``DATE``.

    Each row of ``MEDIAconnection`` becomes one line with its fields separated
    by a single space; no header row and no index column are written. The
    target path is built from the module-level ``MEDIA_NAME`` and ``DATE``.
    """
    edgelist_path = './GraphEmbedding/data/news/%snews_%s.edgelist' % (MEDIA_NAME, DATE)
    edges = pd.DataFrame(MEDIAconnection)
    edges.to_csv(edgelist_path, sep=' ', index=None, header=None)

In [13]:
# Persist the loaded edges to the edgelist file that the node2vec step reads back.
output_MEDIAconnection_to_edgelist_file(MEDIAconnection)

In [14]:
def Node2Vec_Network_Embedding():
    """Train node2vec on the saved edgelist and return the learned embeddings.

    The edgelist file for the module-level ``MEDIA_NAME`` / ``DATE`` is loaded
    as a directed graph, a ``Node2Vec`` model from the local ``GraphEmbedding``
    package is trained on it, and the result of ``get_embeddings()`` is
    returned unchanged.
    """
    import networkx as nx
    # NOTE(review): read_node_label and Classifier are not used below; the
    # import is kept because importing the module may have side effects.
    from GraphEmbedding.ge.classify import read_node_label,Classifier
    from GraphEmbedding.ge import Node2Vec

    edgelist_path = './GraphEmbedding/data/news/%snews_%s.edgelist' % (MEDIA_NAME, DATE)

    # Read the graph as a DiGraph; an optional third column is parsed as an
    # integer 'weight' attribute.
    graph = nx.read_edgelist(
        edgelist_path,
        create_using=nx.DiGraph(),
        nodetype=None,
        data=[('weight', int)],
    )

    # Initialise the random-walk model, then train the embedding vectors.
    walker = Node2Vec(graph, walk_length=10, num_walks=80, p=0.25, q=4, workers=1)
    walker.train(window_size=5, iter=3)

    return walker.get_embeddings()

In [16]:
# Train node2vec on the saved edgelist (slow step; its log output follows).
node2vec_embeddings = Node2Vec_Network_Embedding()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Preprocess transition probs...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.0s finished


Learning embedding vectors...
Learning embedding vectors done!


In [17]:
def output_Embeddings_to_file(embeddings):
    """Save the embeddings to the ``.emb`` file for ``MEDIA_NAME`` / ``DATE``.

    ``embeddings`` is turned into a table with one row per key, the rows are
    sorted by key, and the table is written space-separated with the key in
    the first column and no header row.
    """
    emb_path = './GraphEmbedding/emb/%snews_%s.emb' % (MEDIA_NAME, DATE)
    table = pd.DataFrame(embeddings).T.sort_index()
    table.to_csv(emb_path, sep=' ', index=True, header=None)

In [18]:
# Cache the trained embeddings on disk so later runs can skip the training step.
output_Embeddings_to_file(node2vec_embeddings)

In [8]:
def read_Embeddings_from_file(date=None):
    """Load embeddings previously saved for ``MEDIA_NAME`` and the given date.

    Parameters
    ----------
    date : str, optional
        Date tag in the ``.emb`` file name. Defaults to the module-level
        ``DATE`` so the reader always targets the same file the writer
        produced, instead of a second hard-coded copy of the date.

    Returns
    -------
    dict
        Maps each value of the file's first column to the list of remaining
        values on that row, in file order.
    """
    if date is None:
        date = DATE
    emb_path = './GraphEmbedding/emb/%snews_%s.emb' % (MEDIA_NAME, date)
    df_embeddings = pd.read_csv(emb_path, sep=' ', index_col=0, header=None)

    return dict(zip(df_embeddings.index.tolist(), df_embeddings.values.tolist()))

In [9]:
# Reload the embeddings from the cached file; this replaces the in-memory result of training.
node2vec_embeddings = read_Embeddings_from_file()

### 2-2. topology sim score matrix

In [10]:
def construct_topology_cs_matrix(node2vec_embeddings):
    """Compute pairwise cosine similarity between the node embeddings.

    Parameters
    ----------
    node2vec_embeddings : dict
        Maps a node key to its embedding vector.

    Returns
    -------
    cs_index_list : list
        The node keys in sorted order.
    topology_cs_matrix : array
        Square matrix where entry ``[i][j]`` is the cosine similarity between
        the embeddings of ``cs_index_list[i]`` and ``cs_index_list[j]``.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    cs_index_list = sorted(node2vec_embeddings)
    # Stack the vectors in the same (sorted) order as the index list. Using
    # dict.values() directly would follow insertion order and misalign the
    # rows whenever the dict was not built in sorted-key order.
    ordered_vectors = [node2vec_embeddings[key] for key in cs_index_list]
    topology_cs_matrix = cosine_similarity(ordered_vectors)

    return cs_index_list, topology_cs_matrix

In [11]:
# Sorted node keys plus the embedding-based (topology) similarity matrix.
cs_index_list, topology_cs_matrix = construct_topology_cs_matrix(node2vec_embeddings)

### 2-3. doc sim score matrix

In [12]:
def construct_document_cs_matrix(node2vec_embeddings, MEDIA_NAME):
    '''
    Build the bag-of-words cosine-similarity matrix for the embedded news.

    step 1. Read (NewsIndex, NewsContext, NewsTitle) from the database.
            ----> Selecting news one NewsIndex at a time is time consuming,
            ----> so all news with ParseDate >= '2020-04-01' are read in one
                  query and then filtered down to the embedded NewsIndex values.
    step 2. Segment every NewsContext with jieba, remove stop words and
            newline tokens, and count the remaining tokens.
    step 3. Compute the cosine similarity between the count vectors.

    Only NewsContext is scored; NewsTitle is fetched but not used.

    Rows and columns follow ``cs_index_list`` exactly: duplicate NewsIndex
    rows in the database are collapsed to the first one, and an embedded
    NewsIndex with no database row is scored as an empty document (a warning
    is printed). This keeps the matrix the same shape and order as the index
    list it is returned with.

    Parameters
    ----------
    node2vec_embeddings : dict
        Only its keys are used; they are the NewsIndex values to score.
    MEDIA_NAME : str
        Table-name prefix; news are read from ``MEDIA_NAME + 'news'``.

    Returns
    -------
    cs_index_list : list
        Sorted keys of ``node2vec_embeddings``.
    document_cs_matrix : array
        Square cosine-similarity matrix ordered like ``cs_index_list``.
    '''
    import os
    import jieba
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def open_stopWords_from_file(path):
        # Read the stop-word file (one word per line) from directory `path`.
        # A set is returned so the per-token membership test is O(1).
        stop_words = set()
        with open(os.path.join(path, 'stopWords.txt'), 'r', encoding='UTF-8') as file:
            for data in file:
                stop_words.add(data.strip())
        return stop_words

    def construct_cs_matrix(context_or_title, News):
        # `context_or_title` is the tuple position to score:
        # 1 = NewsContext, 2 = NewsTitle.
        corpus = []
        for k in tqdm(News):
            # Treat a NULL/None text as an empty document instead of crashing.
            text = k[context_or_title] or ''
            # Chinese word segmentation with jieba.
            segments = jieba.cut(text, cut_all=False)
            # Drop stop words and newline tokens.
            remainderWords = [a for a in segments if a not in stopWords and a != '\n']
            corpus.append(' '.join(remainderWords))
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        return cosine_similarity(X)

    ### read stopWords from file
    stopWords = open_stopWords_from_file(path='./')
    ### step 1
    cs_index_list = sorted(node2vec_embeddings)
    MEDIA_nodes = select_nodes_table(NEWS_DB_PATH, MEDIA_NAME+'news', 'NewsIndex,NewsContext,NewsTitle', "ParseDate>='2020-04-01' ")
    print('Selecting news details from all news...')
    wanted = set(cs_index_list)
    news_by_index = {}
    for k in MEDIA_nodes:
        # Keep the first row seen for each wanted NewsIndex.
        if k[0] in wanted and k[0] not in news_by_index:
            news_by_index[k[0]] = k
    missing_count = sum(1 for news_index in cs_index_list if news_index not in news_by_index)
    if missing_count:
        print('Warning: %d embedded news not found in the database; they are scored as empty documents.' % missing_count)
    # One entry per index, in index order, so matrix rows line up with cs_index_list.
    News = [news_by_index.get(news_index, (news_index, '', '')) for news_index in cs_index_list]
    ### step 2 and 3 (context only)
    document_cs_matrix = construct_cs_matrix(1, News)

    return cs_index_list, document_cs_matrix

In [13]:
# Text-based similarity for the same news keys (slow: segments every article, see the progress bar below).
cs_index_list, document_cs_matrix = construct_document_cs_matrix(node2vec_embeddings, MEDIA_NAME)

Selecting news details from all news...


  0%|          | 0/34622 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.827 seconds.
Prefix dict has been built succesfully.
100%|██████████| 34622/34622 [04:29<00:00, 128.54it/s]


### 2-4. final_cs_matrix (alpha x topology sim score + (1-alpha) x doc sim score)

In [14]:
def construct_final_cs_matrix(topology_cs_matrix, document_cs_matrix, alpha):
    """Blend the two similarity matrices into one score matrix.

    The result is ``alpha * topology_cs_matrix + (1 - alpha) * document_cs_matrix``,
    i.e. ``alpha`` is the weight of the topology score and ``1 - alpha`` the
    weight of the document score.
    """
    topology_part = topology_cs_matrix * alpha
    document_part = document_cs_matrix * (1 - alpha)
    return topology_part + document_part

In [15]:
# alpha = 0.1 -> 0.1 * topology similarity + 0.9 * document similarity.
final_cs_matrix = construct_final_cs_matrix(topology_cs_matrix, document_cs_matrix, 0.1)

In [16]:
# Sanity check: display the three shapes; they should all be identical.
topology_cs_matrix.shape, document_cs_matrix.shape, final_cs_matrix.shape

((34622, 34622), (34622, 34622), (34622, 34622))

In [16]:
# topology_cs_matrix, document_cs_matrix, final_cs_matrix

### 2-5. output final_cs_matrix to h5 file (optional)

In [24]:
def output_final_cs_matirx_to_h5_file(final_cs_matrix):
    """Store ``final_cs_matrix`` in ``./final_cs_matrix.h5``.

    The file is opened in ``'w'`` mode and the matrix is written as a single
    dataset named ``'final_cs_matrix'``.
    """
    import h5py

    h5_path = './final_cs_matrix.h5'
    with h5py.File(h5_path, 'w') as h5_file:
        h5_file.create_dataset('final_cs_matrix', data=final_cs_matrix)
        
# Optional: dump the blended matrix to disk.
output_final_cs_matirx_to_h5_file(final_cs_matrix)

## 3. Recommendation

### 3-1. Get user browse history from database

In [26]:
# Browsing-history rows (NewsIndex, NewsTitle, NewsURL, NewsContext) with
# ParseDate on or after 2020-06-20; the last line displays how many were read.
USERnews = select_nodes_table(USER_DB_PATH, 'NODES', 'NewsIndex,NewsTitle,NewsURL,NewsContext', "ParseDate>='2020-06-20'")
len(USERnews)

35

In [27]:
def get_NewsIndex_from_DailyNewsNetwork(USERnews, MEDIA_NAME):
    """Map browsing-history rows to NewsIndex values of the news database.

    Parameters
    ----------
    USERnews : iterable of tuple
        Rows of ``(NewsIndex, NewsTitle, NewsURL, NewsContext)`` from the user
        database.
    MEDIA_NAME : str
        ``'CHT'`` or ``'LBT'``; selects both the URL domain to accept and the
        ``MEDIA_NAME + 'news'`` table to search.

    Returns
    -------
    list of list
        ``[matched NewsIndex, NewsTitle, NewsURL]`` for every row whose own
        index contains ``'user'``, whose context is not the empty string,
        whose URL contains the media domain, and whose URL is found in the
        news table. Rows failing the first three tests are reported with
        ``print``; rows with no database match are skipped silently.

    Raises
    ------
    ValueError
        If ``MEDIA_NAME`` is not a supported media code.
    """
    media_domains = {'CHT': 'chinatimes.com', 'LBT': 'ltn.com.tw'}
    if MEDIA_NAME not in media_domains:
        # Fail with a clear message instead of an unbound-variable error later.
        raise ValueError('Unsupported MEDIA_NAME: %r (expected one of %s)'
                         % (MEDIA_NAME, sorted(media_domains)))
    term = media_domains[MEDIA_NAME]
    table_name = MEDIA_NAME+'news'

    def like_literal(text):
        # Make `text` safe inside a quoted LIKE pattern: escape the LIKE
        # wildcards (URLs routinely contain '%' and '_') and double any
        # single quote so it cannot terminate the SQL string.
        for special in ('\\', '%', '_'):
            text = text.replace(special, '\\' + special)
        return text.replace("'", "''")

    USERnews_NewsIndex = []
    for k in USERnews:
        if 'user' in k[0] and k[3]!='' and term in k[2]:
            condition = "NewsURL LIKE '%{}%' ESCAPE '\\'".format(like_literal(k[2]))
            NewsNetwork_NewsIndex = select_nodes_table(NEWS_DB_PATH, table_name, 'NewsIndex', condition)
            if NewsNetwork_NewsIndex:
                # Use the first matching news row.
                USERnews_NewsIndex.append([NewsNetwork_NewsIndex[0][0],k[1],k[2]])
        else:
            print('Not NewsURL or Not in %s -> %s' %(MEDIA_NAME, k[2]) )
    return USERnews_NewsIndex

In [28]:
# Resolve the browsing history to NewsIndex values; rejected rows are printed below.
USERnews_NewsIndex = get_NewsIndex_from_DailyNewsNetwork(USERnews, MEDIA_NAME)

Not NewsURL or Not in LBT -> https://www.chinatimes.com/
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200622001567-260405
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200622002688-260405
Not NewsURL or Not in LBT -> https://news.ltn.com.tw/list/breakingnews
Not NewsURL or Not in LBT -> https://sports.ltn.com.tw/
Not NewsURL or Not in LBT -> https://sports.ltn.com.tw/news/paper/1381483
Not NewsURL or Not in LBT -> https://sports.ltn.com.tw/news/paper/1381484
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200624003302-260409
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200624002599-260409
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200624000070-260409
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/
Not NewsURL or Not in LBT -> https://covid-19.chinatimes.com/%E6%96%B0%E5%86%A0%E8%82%BA%E7%82%8E,%E5%8F%B0%E7%81%A3
Not NewsURL or Not in LBT 

### 3-2. Recommend news based on user browse history

In [38]:
def Recommed_News(NewsIndex, MEDIA_NAME, top_n):
    """Print and return the ``top_n`` news most similar to ``NewsIndex``.

    Similarities come from the module-level ``final_cs_matrix``, whose rows
    and columns are assumed to follow ``cs_index_list``. The queried news is
    excluded before the top ``top_n`` are taken, so up to ``top_n``
    recommendations are produced (fewer only when there are not enough
    candidates or a candidate has no row in the news table).

    Parameters
    ----------
    NewsIndex : str
        Index of the news to find neighbours for; must be in ``cs_index_list``.
    MEDIA_NAME : str
        Table-name prefix; details are read from ``MEDIA_NAME + 'news'``.
    top_n : int
        Maximum number of recommendations.

    Returns
    -------
    list of list
        ``[NewsIndex, NewsTitle, NewsURL, score]`` per recommendation, best
        score first.

    Raises
    ------
    ValueError
        If ``NewsIndex`` is not in ``cs_index_list``.
    """
    if NewsIndex not in cs_index_list:
        raise ValueError('%r is not in cs_index_list' % (NewsIndex,))
    matrix_index = cs_index_list.index(NewsIndex)
    score_row = final_cs_matrix[matrix_index]

    # Drop the queried news first; otherwise it occupies one of the top_n slots.
    scores = [[candidate, score_row[k]]
              for k, candidate in enumerate(cs_index_list)
              if candidate != NewsIndex]
    results = sorted(scores, key=lambda x:x[1], reverse=True)

    print('Recommed News:')
    recommend_News = []
    # Slicing tolerates top_n larger than the number of candidates.
    for recommend_NewsIndex, score in results[:max(top_n, 0)]:
        recommend_NewsInfo = select_nodes_table(NEWS_DB_PATH,MEDIA_NAME+'news','NewsIndex,NewsTitle,NewsURL',"NewsIndex='%s' " %(recommend_NewsIndex))
        if not recommend_NewsInfo:
            # No detail row for this index: report it instead of raising IndexError.
            print('-'*40,'\n%s (details not found in database) \n%s' %(recommend_NewsIndex, score))
            continue
        print('-'*40,'\n%s %s %s \n%s' %(recommend_NewsInfo[0][0],recommend_NewsInfo[0][1],recommend_NewsInfo[0][2], score))
        recommend_News.append([recommend_NewsInfo[0][0],recommend_NewsInfo[0][1],recommend_NewsInfo[0][2],score])
    return recommend_News

In [39]:
# List the matched browsing-history items: NewsIndex, then title and URL.
for n in USERnews_NewsIndex:
    print('User browsing history News: %s\n%s %s' %(n[0],n[1],n[2]))

User browsing history News: LBT_20200624_1
日女確診恐本土個案？ 陳時中：「待釐清」暫不列案號 https://news.ltn.com.tw/news/life/breakingnews/3208237
User browsing history News: LBT_20200624_22
日本女學生染疫有台灣旅遊史 陳時中︰在台接觸者至少140人 https://news.ltn.com.tw/news/life/breakingnews/3208155
User browsing history News: LBT_20200620_62
秋天解禁？ 日本首波開放入境國家沒台灣 日媒曝關鍵原因 https://news.ltn.com.tw/news/world/breakingnews/3203519
User browsing history News: LBT_20200617_46
武漢肺炎》北京耳語滿天飛！傳醫院爆滿、染疫屍體天天燒 https://news.ltn.com.tw/news/world/breakingnews/3199993
User browsing history News: LBT_20200615_134
武漢肺炎》北京警戒再提升！ 多地列為中、高風險地區 https://news.ltn.com.tw/news/world/breakingnews/3197617
User browsing history News: LBT_20200614_70
武漢肺炎》北京進入非常時期！ 瘋狂「搶菜」人龍超誇張 https://news.ltn.com.tw/news/world/breakingnews/3197566


In [40]:
# Recommend news for every matched browsing-history item.
for n in USERnews_NewsIndex:
    # The history is matched against the whole news table, so an item may be
    # absent from the similarity matrices; skip it rather than letting the
    # index lookup inside Recommed_News abort the whole loop.
    if n[0] not in cs_index_list:
        print('Cant find User browsed news in DailyNewsNetwork ->\n',n)
        continue
    print('-'*100)
    print('User browsing history News: %s\n%s %s' %(n[0],n[1],n[2]))
    Recommed_News(n[0], MEDIA_NAME, 10)

----------------------------------------------------------------------------------------------------
User browsing history News: LBT_20200624_1
日女確診恐本土個案？ 陳時中：「待釐清」暫不列案號 https://news.ltn.com.tw/news/life/breakingnews/3208237
Recommed News:
---------------------------------------- 
LBT_20200624_22 日本女學生染疫有台灣旅遊史 陳時中︰在台接觸者至少140人 https://news.ltn.com.tw/news/life/breakingnews/3208155 
0.7177685555251646
---------------------------------------- 
LBT_20200529_38 新增1例境外 30多歲女性自英返台初採陰、二採確診 https://news.ltn.com.tw/news/life/breakingnews/3156745 
0.5388568321564133
---------------------------------------- 
LBT_20200607_408 新增3確診 1本土病例連假發病找不到感染源 https://news.ltn.com.tw/news/life/breakingnews/3126911 
0.5328816919544352
---------------------------------------- 
LBT_20200524_435 敦睦艦隊磐石艦再增1例 累計已31人確診 https://news.ltn.com.tw/news/life/breakingnews/3145442 
0.5279085181348717
---------------------------------------- 
LBT_20200507_28 新增1例境外！20多歲女卡達確診治療 、二採陰返台又確診 https://news.ltn.com.tw/news/life/breaki

---------------------------------------- 
LBT_20200617_522 5天106例 北京16區逾半淪陷 https://news.ltn.com.tw/news/world/paper/1380341 
0.3704604938328483
----------------------------------------------------------------------------------------------------
User browsing history News: LBT_20200614_70
武漢肺炎》北京進入非常時期！ 瘋狂「搶菜」人龍超誇張 https://news.ltn.com.tw/news/world/breakingnews/3197566
Recommed News:
---------------------------------------- 
LBT_20200614_128 武肺疫情復燃！ 北京市昨單日新增36例確診 https://news.ltn.com.tw/news/world/breakingnews/3197254 
0.6037800992953176
---------------------------------------- 
LBT_20200614_15 武肺疫情再起！北京召開防疫會議「已進入非常時期」 https://news.ltn.com.tw/news/world/breakingnews/3197212 
0.5685329252258362
---------------------------------------- 
LBT_20200613_245 武漢肺炎》北京爆二波疫情？2天增3本土病例 六大市場急休市 https://news.ltn.com.tw/news/world/breakingnews/3196444 
0.5071749579582174
---------------------------------------- 
LBT_20200613_132 武漢肺炎》北京批發市場「切魚砧板」驗出病毒 三大超市連夜下架 https://news.ltn.com.tw/news/world/breaki

#### challenges: 
1. node2vec 方法改進
2. recommended news 是過去的，應該要推薦最新的新聞