## 使用networkx计算PageRank

In [1]:
import networkx as nx

# 创建一个有向图
G = nx.DiGraph()

### 读取mysql数据库

In [2]:
import pymysql
cnx = pymysql.connect(host='localhost', user='root', password='123qwe12')
cursor = cnx.cursor()
cnx.select_db('IR_db')

In [3]:
def get_douban_url_as_node():
    # 创建SQL查询
    sql = "SELECT url FROM douban"
    # 执行查询
    cursor.execute(sql)
    # 获取所有的结果
    results = cursor.fetchall()
    # 将结果从元组列表转换为普通列表，并添加到图中作为节点
    for result in results:
        G.add_node(result[0])


In [4]:

def get_ids_from_same_title():
    # 创建SQL查询
    sql = "SELECT page_id FROM same_title"
    # 执行查询
    cursor.execute(sql)
    # 获取所有的结果
    results = cursor.fetchall()
    # 将结果从元组列表转换为普通列表
    ids_same_title = [result[0] for result in results]
    return ids_same_title

def get_page_url_as_node():
    global ids_page
    # 创建SQL查询
    sql = "SELECT id,url FROM page"
    # 执行查询
    cursor.execute(sql)
    # 获取所有的结果
    results = cursor.fetchall()
    # 将结果从元组列表转换为普通列表
    ids = [result[0] for result in results]
    urls = [result[1] for result in results]
    # 获取相同title的id
    ids_same_title = get_ids_from_same_title()
    print(len(ids_same_title))
    # 去除相同title的id
    for id,url in zip(ids,urls):
        if id not in ids_same_title:
            G.add_node(url)

    # 从same_title中按title分组，获取每组的第一个id
    sql = "SELECT MIN(page_id) FROM same_title GROUP BY title"
    cursor.execute(sql)
    results = cursor.fetchall()
    ids = [result[0] for result in results]
    print(len(ids))
    # 找到这些id对应的url
    for id in ids:
        sql = "SELECT url FROM page WHERE id = %d" % id
        cursor.execute(sql)
        results = cursor.fetchall()
        url = results[0]
        G.add_node(url)

### 创建节点 × 没必要了，创建边的时候会自动创建节点

In [5]:
get_douban_url_as_node()

In [6]:
get_page_url_as_node()

1435
610


### 创建节点之间的边

In [4]:
def get_douban_edges():
    # 从数据库中获取所有的记录
    sql = "SELECT url, links FROM douban"
    cursor.execute(sql)
    results = cursor.fetchall()
     # 对于每一条记录
    for url, links in results:
        # 将links按照'\n'解析成一个列表
        links = links.split('\n')
        # 对于每一个link
        for link in links:
           G.add_edge(url, link)

In [5]:
get_douban_edges()

In [6]:
def create_url_to_url_mapping():
    # 创建一个空的映射
    url_to_url = {}
    # 从same_title表中按title分组，获取每组的第一个URL
    sql = "SELECT MIN(page_id), title FROM same_title GROUP BY title"
    cursor.execute(sql)
    results = cursor.fetchall()
    # 将每个title的所有URL映射到对应的节点
    for page_id, title in results:
        # 获取该title的第一个URL
        sql = "SELECT url FROM same_title WHERE page_id = %d" % page_id
        cursor.execute(sql)
        results = cursor.fetchall()
        target_url = results[0]
        # 获取该title的所有URL
        sql = "SELECT url FROM same_title WHERE title = %s"
        cursor.execute(sql,(title,))
        urls = cursor.fetchall()
        for url in urls:
            url_to_url[url] = target_url
    return url_to_url

In [7]:
# 拿到同title的url映射
url_to_url = create_url_to_url_mapping()

In [8]:
def get_page_edges():
    # 从数据库中获取所有的记录
    sql = "SELECT url, links FROM page"
    cursor.execute(sql)
    results = cursor.fetchall()
    # 对于每一条记录
    for url, links in results:
        # 如果URL在映射中，使用映射中的值替换URL
        url = url_to_url.get(url, url)
        # 将links按照'\n'解析成一个列表
        links = links.split('\n')
        # 对于每一个link
        for link in links:
            # 如果link在映射中，使用映射中的值替换link
            link = url_to_url.get(link, link)
            # 在图G中添加一条从url到link的边
            G.add_edge(url, link)

In [9]:
get_page_edges()

### 计算PageRank

In [10]:
def cal_page_rank():
    # 使用networkx中的pagerank函数计算PR值
    pr = nx.pagerank(G)
    return pr

### 更新es索引

In [11]:
from elasticsearch import Elasticsearch

# 实例化es
es = Elasticsearch(hosts="http://localhost:9200")

In [39]:
# 使用URL搜索文档
res = es.search(index="web", body={"query": {"match": {"url": 'https://www.runoob.com/'}}})
# 查看page_id
page_id = res['hits']['hits'][0]['_source']['pageRank']
print(page_id)

0.0021668640962373142


In [23]:
from elasticsearch.helpers import scan

def add_page_rank_to_documents(pr):
    # 遍历web索引中的所有文档
    for doc in scan(es, index='web'):
        # 获取文档的URL
        url = doc['_source']['url']
        # 在pr字典中查询对应的PageRank值
        page_rank = pr.get(url)
        # 如果找到了PageRank值
        if page_rank is not None:
            # 更新文档，添加pageRank字段
            es.update(index='web', id=doc['_id'], body={'doc': {'pageRank': page_rank}})
        else :
            # 否则，设置pageRank字段为0
            es.update(index='web', id=doc['_id'], body={'doc': {'pageRank': 0}})

In [21]:
pr = cal_page_rank()

In [24]:
add_page_rank_to_documents(pr)

In [26]:
## 打印PageRank值
# 获取web索引中的前一百条文档
res = es.search(index="web", body={"query": {"match_all": {}}}, size=100)

# 遍历这些文档
for doc in res['hits']['hits']:
    # 获取并打印文档的title和pageRank字段
    title = doc['_source'].get('title', 'Not available')
    page_rank = doc['_source'].get('pageRank', 'Not available')
    print(f"Title: {title}, PageRank: {page_rank}")

Title: 浪漫传奇——《贫民窟的百万富翁》, PageRank: 3.1680108217425975e-05
Title: 需要夢想又害怕夢想, PageRank: 3.1680108217425975e-05
Title: 贫民窟的百万富翁 Slumdog Millionaire, PageRank: 0.0004328589715619791
Title: 唱一首歌, PageRank: 4.1406828371657615e-05
Title: 从教育心理学的角度来扯几句, PageRank: 4.1406828371657615e-05
Title: 嘴角漾笑, PageRank: 4.1406828371657615e-05
Title: 傻孩子的星期六, PageRank: 4.1406828371657615e-05
Title: 一个自私孩子的美梦, PageRank: 4.1406828371657615e-05
Title: 蒙丹烧了学校？老师到底带孩子们去了哪里？, PageRank: 4.1406828371657615e-05
Title: 第三次看《放牛班》，觉得蒙丹并非我们以为的坏孩子, PageRank: 4.1406828371657615e-05
Title: 《放牛班的春天》是如何治愈你的？, PageRank: 4.1406828371657615e-05
Title: “池塘之底”有天使, PageRank: 4.1406828371657615e-05
Title: 有关马修老师和莫杭治的母亲, PageRank: 4.1406828371657615e-05
Title: 放牛班的春天 Les choristes, PageRank: 0.0006807693122078392
Title: 这个世界是异类的, PageRank: 3.8854484497569625e-05
Title: 死亡诗社里的一段话，四首诗, PageRank: 3.8854484497569625e-05
Title: 《死亡诗社》经典台词欣赏（转）, PageRank: 3.8854484497569625e-05
Title: 《死亡诗社》：这一刻，我开始懂得自己, PageRank: 3.8854484497569625e-05


### 测试pageRank影响排序结果

In [145]:
def deal_bool_query(query):
    # 执行搜索
    response = es.search(
        index="web",
        body={
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "must": [
                                {
                                    "multi_match": {
                                        "query": query,
                                        "fields": ["title", "content"]
                                    }
                                }
                            ]
                        }
                    },
                    "script_score": {
                        "script": {
                            "source": "Math.log1p(doc['pageRank'].value * params.factor)",
                            "params": {
                                "factor": 100000
                            }
                        }
                    },
                    "boost_mode": "sum"
                }
            },
            "explain": True  # 添加这一行
        }
    )
    # 获取搜索结果
    hits = response["hits"]["hits"]
    # 格式化搜索结果
    results = [
        {
            "id": hit["_id"],
            "title": hit["_source"]["title"],
            "content": hit["_source"]["content"],
            "url": hit["_source"]["url"],
            "type": hit["_source"]["type"],
            "pageRank": hit["_source"]["pageRank"],
            "explanation": hit["_explanation"]  # 添加这一行
        }
        for hit in hits
    ]
    return results

In [146]:
results = deal_bool_query('肖申克的救赎')

for result in results:
    print(result['title'], result['url'], result['type'])
    print(result['explanation']['value'])
    

肖申克的救赎 https://book.douban.com/subject/1829226/ douban
40.591507
肖申克的救赎 The Shawshank Redemption https://movie.douban.com/subject/1292052/ douban
36.22301
十年·肖申克的救赎 https://movie.douban.com/review/1000369/ douban
35.161785
《肖申克的救赎》到底“救赎”了什么？ https://movie.douban.com/review/10350620/ douban
33.33816
肖申克的救赎，读书笔记 https://book.douban.com/review/1336253/ douban
32.47394
《肖申克的救赎》的一些幕后花絮 https://movie.douban.com/review/1062920/ douban
29.472248
为何《肖申克的救赎》在IMDb和豆瓣都能排第一？ https://movie.douban.com/review/9259304/ douban
29.19319
汲汲而生，汲汲而死 https://book.douban.com/review/1597365/ douban
28.38151
《肖申克的救赎》：1994—2007，希望就是现实 https://movie.douban.com/review/1127585/ douban
27.998638
关于“救赎” https://movie.douban.com/review/8848890/ douban
27.522339


In [139]:
import math

print(math.log1p(0.00039118477103319317*100000))
print(math.log1p(3.9509503613941765e-05*100000))

print(math.log1p(0.00039118477103319317*10000000))
print(math.log1p(3.9509503613941765e-05*10000000))

3.691837003831898
1.5995795503532988
8.272020702549892
5.9816541727124815
