In [1]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

In [2]:
def select_nodes_table(database_name, table_name, columns, conditions):
    conn = sqlite3.connect(database_name)
    cur = conn.cursor()
    myresult = cur.execute('''SELECT {1}
                            FROM {0} WHERE {2}'''
                           .format(table_name,columns,conditions) )
    result = []
    for row in myresult:
        result.append(row)
    conn.close()    
    return result

In [3]:
def select_edges_table(database_name, table_name, columns, conditions):
    conn = sqlite3.connect(database_name)
    cur = conn.cursor()
    myresult = cur.execute('''SELECT {1}
                            FROM {0} WHERE {2}'''
                           .format(table_name,columns,conditions) )
    result = []
    for row in myresult:
        result.append(row)
    conn.close()        
    return result

## Read DailyNewsNetwork from database

In [4]:
NEWS_DB_PATH = '/Users/gtingyou/Documents/NewsNetwork/Recommendation System/NetworkEmbedding/DailyNews/NewsNetwork_ch.db'
USER_DB_PATH = '/Users/gtingyou/Documents/NewsNetwork/Recommendation System/NetworkEmbedding/UserLogs/user2.db'
MEDIA_NAME = 'LBT'
DATE = '2020-06-30'

In [5]:
MEDIAconnection = select_edges_table(NEWS_DB_PATH, MEDIA_NAME+'connection', 'News1,News2', "ParseDate>='2020-04-01'")
len(MEDIAconnection)

54846

## Graph embedding (using node2vec)

In [6]:
def output_MEDIAconnection_to_edgelist_file(MEDIAconnection):
    df_edgelist = pd.DataFrame(MEDIAconnection)
    df_edgelist.to_csv('./GraphEmbedding/data/news/%snews_%s.edgelist' %(MEDIA_NAME, DATE),sep=' ',index=None,header=None)

In [7]:
output_MEDIAconnection_to_edgelist_file(MEDIAconnection)

In [8]:
def Node2Vec_Network_Embedding():
    import networkx as nx
    from GraphEmbedding.ge.classify import read_node_label,Classifier
    from GraphEmbedding.ge import Node2Vec
    
    G = nx.read_edgelist('./GraphEmbedding/data/news/%snews_%s.edgelist' %(MEDIA_NAME, DATE),
                        create_using = nx.DiGraph(), nodetype = None, data = [('weight', int)])#read graph

    model = Node2Vec(G, walk_length = 10, num_walks = 80,p = 0.25, q = 4, workers = 1)#init model
    model.train(window_size = 5, iter = 3)# train model
    embeddings = model.get_embeddings()# get embedding vectors

    return embeddings

In [9]:
node2vec_embeddings = Node2Vec_Network_Embedding()

Preprocess transition probs...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.1s finished


Learning embedding vectors...
Learning embedding vectors done!


In [10]:
def output_Embeddings_to_file(embeddings):
    df_embeddings = pd.DataFrame(embeddings).T
    df_embeddings= df_embeddings.sort_index()

    df_embeddings.to_csv('./GraphEmbedding/emb/%snews_%s.emb' %(MEDIA_NAME, DATE),sep=' ',index=True,header=None)

In [11]:
output_Embeddings_to_file(node2vec_embeddings)

## calculate *topology similarity score* and *document similarity score*

In [12]:
# def read_Embeddings_from_file():
#     DATE = '2020-06-30'
#     df_embeddings = pd.read_csv('./GraphEmbedding/emb/news_%s.emb' %(DATE),sep=' ',index_col=0,header=None)
    
#     index = df_embeddings.index.tolist()
#     values = df_embeddings.values.tolist()
#     node2vec_embeddings = {index[i]:values[i] for i in range(len(index))}
#     return node2vec_embeddings

In [13]:
# node2vec_embeddings = read_Embeddings_from_file()

#### topology sim score matrix

In [14]:
def construct_topology_cs_matrix(node2vec_embeddings):
    from sklearn.metrics.pairwise import cosine_similarity
    
    cs_index_list = sorted(list(node2vec_embeddings.keys()))
    topology_cs_matrix = cosine_similarity(list(node2vec_embeddings.values()))
                
    return cs_index_list, topology_cs_matrix

In [15]:
cs_index_list, topology_cs_matrix = construct_topology_cs_matrix(node2vec_embeddings)

#### doc sim score matrix

In [16]:
def construct_document_cs_matrix(node2vec_embeddings, MEDIA_NAME):
    '''
    step 1. get NewsDetail(include NewsIndex,NewsContext,NewsTitle) from database 
            ----> Due to selecting news with NewsIndex(Select * from CHTnews WHERE NewsIndex=='') is time consuming
            ----> Hence, we dump all the news from CHTnews(Select * from CHTnews WHERE ParseDate>=''), then filter the news
    step 2. construct cosine similarity matix of context and title respectively
    step 3. compare 0:context and 1:title doc sim score, return the higher score
    return: index of the matrix(list), cosine similarity matrix(list)
    '''
    import jieba
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import time
    ### read stopWords from file
    def open_stopWords_from_file(path):
        stopWords=[]
        # 讀入停用詞檔
        with open('stopWords.txt', 'r', encoding='UTF-8') as file:
            for data in file.readlines():
                data = data.strip()
                stopWords.append(data)
        return stopWords
    ### construct cs matrix
    def construct_cs_matrix(context_or_title, News): # 算 context_or_title 的詞頻矩陣 ---> 1: context , 2: title
        corpus = []
        for i, k in enumerate( tqdm(News) ):
            # 結巴中文斷詞
            segments = jieba.cut(k[context_or_title], cut_all=False)
            # 移除停用詞及跳行符號
            remainderWords = list(filter(lambda a: a not in stopWords and a != '\n', segments))
            corpus.append(' '.join(remainderWords))
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        cs = cosine_similarity(X)
        return cs 
    
    ### read stopWords from file
    stopWords = open_stopWords_from_file(path='./')
    
    ### step 1
    cs_index_list = sorted(list(node2vec_embeddings.keys()))
    MEDIA_nodes = select_nodes_table(NEWS_DB_PATH, MEDIA_NAME+'news', 'NewsIndex,NewsContext,NewsTitle', "ParseDate>='2020-04-01' ")
    News = []
    print('Selecting news details from all news...')
    for k in MEDIA_nodes:
        if k[0] in cs_index_list:
            News.append(k)
    News = sorted(News, key=lambda x:x[0], reverse=False)
        
    ### step 2 
    cs1 = construct_cs_matrix(1, News)
#     print('End constructing cosine similarity matrix for NewsContext ---> ', time.strftime("%H:%M:%S", time.localtime()) )
#     cs2 = construct_cs_matrix(2, News)
#     print('End constructing cosine similarity matrix for NewsTitle ---> ', time.strftime("%H:%M:%S", time.localtime()) )
    
    ### step 3
    document_cs_matrix = cs1
#     document_cs_matrix = np.zeros([len(cs_index_list),len(cs_index_list)])
#     for i in range(len(cs1)):
#         for j in range(len(cs1)):
#             if cs1[i][j]>=cs2[i][j]:
#                 document_cs_matrix[i][j] = cs1[i][j]
#             else:
#                 document_cs_matrix[i][j] = cs2[i][j]

    return cs_index_list, document_cs_matrix

In [17]:
cs_index_list, document_cs_matrix = construct_document_cs_matrix(node2vec_embeddings, MEDIA_NAME)

  0%|          | 0/34622 [00:00<?, ?it/s]Building prefix dict from /anaconda3/envs/spider/lib/python3.6/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/23/ffjh42551yd7bdv2qzvs0t680000gn/T/jieba.cache
Loading model cost 0.9726390838623047 seconds.
Prefix dict has been built succesfully.
100%|██████████| 34622/34622 [05:02<00:00, 114.46it/s]


#### final_cs_matrix (alpha x doc sim score + (1-alpha) x topology sim score)

In [18]:
def construct_final_cs_matrix(topology_cs_matrix, document_cs_matrix, alpha):
    final_cs_matrix = topology_cs_matrix*alpha+document_cs_matrix*(1-alpha)
    return final_cs_matrix

In [None]:
final_cs_matrix = construct_final_cs_matrix(topology_cs_matrix, document_cs_matrix, 0.1)

In [None]:
topology_cs_matrix.shape, document_cs_matrix.shape, final_cs_matrix.shape

In [16]:
# topology_cs_matrix, document_cs_matrix, final_cs_matrix

## Recommendation

### Get user browse history from database

In [27]:
USERnews = select_nodes_table(USER_DB_PATH, 'NODES', 'NewsIndex,NewsTitle,NewsURL,NewsContext', "ParseDate>='2020-06-20'")
len(USERnews)

35

In [41]:
def get_NewsIndex_from_DailyNewsNetwork(USERnews, MEDIA_NAME):
    USERnews_NewsIndex = []
    for k in USERnews:
        if MEDIA_NAME=='CHT':
            term = 'chinatimes.com'
        elif MEDIA_NAME=='LBT':
            term = 'ltn.com.tw'
            
        if 'user' in k[0] and k[3]!='' and term in k[2]:
            url = ('%'+k[2]+'%')
            table_name = MEDIA_NAME+'news'
            NewsNetwork_NewsIndex = select_nodes_table(NEWS_DB_PATH, table_name, 'NewsIndex', "NewsURL like '%s'" %url)
            if NewsNetwork_NewsIndex!=[]:
                USERnews_NewsIndex.append([NewsNetwork_NewsIndex[0][0],k[1],k[2]])
        else:
            print('Not NewsURL or Not in %s -> %s' %(MEDIA_NAME, k[2]) )
            continue
    return USERnews_NewsIndex

In [42]:
USERnews_NewsIndex = get_NewsIndex_from_DailyNewsNetwork(USERnews, MEDIA_NAME)

Not NewsURL or Not in LBT -> https://www.chinatimes.com/
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200622001567-260405
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200622002688-260405
Not NewsURL or Not in LBT -> https://news.ltn.com.tw/list/breakingnews
Not NewsURL or Not in LBT -> https://sports.ltn.com.tw/
Not NewsURL or Not in LBT -> https://sports.ltn.com.tw/news/paper/1381483
Not NewsURL or Not in LBT -> https://sports.ltn.com.tw/news/paper/1381484
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200624003302-260409
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200624002599-260409
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/20200624000070-260409
Not NewsURL or Not in LBT -> https://www.chinatimes.com/realtimenews/
Not NewsURL or Not in LBT -> https://covid-19.chinatimes.com/%E6%96%B0%E5%86%A0%E8%82%BA%E7%82%8E,%E5%8F%B0%E7%81%A3
Not NewsURL or Not in LBT 

### Recommend news based on user browse history

In [46]:
def Recommed_News(NewsIndex, MEDIA_NAME, top_n):
    matrix_index = cs_index_list.index(NewsIndex)
    scores = [[cs_index_list[k], final_cs_matrix[matrix_index][k]] for k in range(len(cs_index_list))]
    results = sorted(scores, key=lambda x:x[1], reverse=True)
    
    print('Recommed News:')
    recommend_News = []
    for i in range(top_n):
        recommend_NewsIndex = results[i][0]
        recommend_NewsInfo = select_nodes_table(NEWS_DB_PATH,MEDIA_NAME+'news','NewsIndex,NewsTitle,NewsURL',"NewsIndex='%s' " %(recommend_NewsIndex))
        print('-'*40)
        print(recommend_NewsInfo[0][0],recommend_NewsInfo[0][1],recommend_NewsInfo[0][2],results[i][1])
#         recommend_News.append([recommend_NewsInfo[0][0],recommend_NewsInfo[0][1],recommend_NewsInfo[0][2],results[i][1]])
        
#     return recommend_News

In [48]:
for n in USERnews_NewsIndex:
    if n[0] not in cs_index_list:
        print('Cant find User browsed news in DailyNewsNetwork ->\n',n)
    print('-'*100)
    print('User browsing history News:\n', n)
    Recommed_News(n[0], MEDIA_NAME, 10)

----------------------------------------------------------------------------------------------------
User browsing history News:
 ['LBT_20200624_1', '日女確診恐本土個案？ 陳時中：「待釐清」暫不列案號', 'https://news.ltn.com.tw/news/life/breakingnews/3208237']
Recommed News:
----------------------------------------
LBT_20200624_1 日女確診恐本土個案？ 陳時中：「待釐清」暫不列案號 https://news.ltn.com.tw/news/life/breakingnews/3208237 0.9999999940395368
----------------------------------------
LBT_20200624_22 日本女學生染疫有台灣旅遊史 陳時中︰在台接觸者至少140人 https://news.ltn.com.tw/news/life/breakingnews/3208155 0.7533423360618162
----------------------------------------
LBT_20200626_801 武漢肺炎》增1例境外移入 有瓜地馬拉活動史 https://news.ltn.com.tw/news/life/breakingnews/3208936 0.5461183244126168
----------------------------------------
LBT_20200624_123 日本染疫20多歲女曾來台 指揮中心17:30加開臨時記者會說明 https://news.ltn.com.tw/news/life/breakingnews/3207990 0.5364213527395174
----------------------------------------
LBT_20200626_584 再增1例境外移入！ 指揮中心14:00召開臨時記者會 https://news.ltn.com.tw/news/

ValueError: 'LBT_20200615_134' is not in list