In [1]:
import requests
import arrow
import pprint
import json
from urllib.parse import urlencode
from functools import reduce
from tqdm import tqdm

In [15]:
import logging
from utils.log import config_log

# Configure logging for this notebook through the project helper.
# NOTE(review): the two positional arguments look like a logger name and a
# log-file prefix — confirm against utils.log.config_log.
config_log(
    "notion_api", "DBtexts",
    log_root='./logs',
    print_terminal=True,
    enable_monitor=False,
)

## 环境变量

In [2]:
# Get a token at: https://www.notion.so/my-integrations/
# The token is read from a local file so it never appears in the notebook.
# `with` closes the handle deterministically, and the value is stripped because
# a trailing newline would otherwise become part of the Authorization header.
with open("./NOTION_TOKEN", "r") as token_file:
    token = token_file.readline().strip()
if not token:
    raise ValueError("./NOTION_TOKEN is empty: expected the integration token on the first line")
# notion_version =  "2021-08-16"
notion_version = "2022-06-28"

# Headers sent with every Notion API request.
notion_header = {
    "Authorization": f"Bearer {token}",
    "Notion-Version": notion_version,
    "Content-Type": "application/json",
}

In [3]:
# ID of the Notion database to read.
database_id = 'a2594f51053a47b3a58a171017ea0435'

# Query filter. "Label" is a property of the database above; the filter keeps
# only the pages whose "Label" multi-select is not empty.
extra_data = {
    "filter": {
        "and": [
            {
                "property": "Label",
                "multi_select": {"is_not_empty": True},
            },
        ],
    },
}

## 读取数据库

In [50]:
from typing import List
class NotionDBText:
    """
    Read all rich-text content from the pages of a Notion database.

    After `read()` has run:
      * `total_pages`  holds the page objects returned by the database query,
      * `total_blocks` holds one list of child blocks per page,
      * `total_texts`  holds one list of plain-text strings per page.
    """
    def __init__(self, notion_header: dict, database_id: str, extra_data: dict = None):
        """
        Args:
            notion_header: HTTP headers sent with every request.
            database_id: ID of the database to query.
            extra_data: optional JSON body for the database query (for example
                a filter). Defaults to an empty body.
        """
        self.header = notion_header
        self.database_id = database_id
        # A fresh dict per instance: a mutable default would be shared by all instances.
        self.extra_data = extra_data if extra_data is not None else {}
        # Three independent lists (`[[]] * 3` would bind one list to all three names).
        self.total_texts, self.total_blocks, self.total_pages = [], [], []
        self.block_types = ["paragraph", "bulleted_list_item", "numbered_list_item",
                            "toggle", "to_do", "quote",
                            "callout", "synced_block", "template",
                            "column", "child_page", "child_database", "table",
                            "heading_1", "heading_2", "heading_3"]

    def read(self):
        """Fetch the pages, then their blocks, then the texts; results are stored on the instance."""
        self.total_pages = self.read_pages()
        self.total_blocks = self.read_blocks(self.total_pages)
        self.total_texts = self.read_rich_text(self.total_blocks)

    def read_pages(self):
        """
        Read every page of the database, following the pagination cursor.

        Returns:
            list: the page objects of all result pages, concatenated.

        Raises:
            requests.HTTPError: when the API answers with an error status.
        """
        total_pages = []
        next_cursor = None
        while True:
            # Build the body per request from the instance's own query, so that
            # neither the caller's dict nor any notebook global is modified.
            payload = dict(self.extra_data)
            if next_cursor:
                payload['start_cursor'] = next_cursor
            r_database = requests.post(
                url=f"https://api.notion.com/v1/databases/{self.database_id}/query",
                headers=self.header,
                data=json.dumps(payload),
            )
            # Fail with the HTTP error instead of a KeyError on "results".
            r_database.raise_for_status()
            respond = json.loads(r_database.text)
            total_pages.extend(respond["results"])
            next_cursor = respond.get('next_cursor')
            # Stop when there is no further page; also stop when no cursor is
            # given, otherwise the same first page would be requested forever.
            if not respond.get('has_more') or not next_cursor:
                break
        logging.info(f'{len(total_pages)} pages when {arrow.now()}')
        return total_pages

    def read_blocks(self, pages: List):
        """
        Read the child blocks of every page.

        Args:
            pages: page objects, each with an "id" key.

        Returns:
            list: one list of block objects per page, in the order of `pages`.
        """
        total_blocks = []
        for page in tqdm(pages, desc='read blocks'):
            total_blocks.append(self._read_block_children(page["id"]))
        return total_blocks

    def _read_block_children(self, block_id: str) -> list:
        """Return all children of one block, following the pagination cursor."""
        children = []
        cursor = None
        while True:
            r_page = requests.get(
                url=f"https://api.notion.com/v1/blocks/{block_id}/children",
                headers=self.header,
                params={"start_cursor": cursor} if cursor else None,
            )
            respond = json.loads(r_page.text)
            if "results" not in respond:
                # Best effort, as before: a failing page yields no blocks, but
                # the failure is now visible in the log.
                logging.warning(f'no results for block {block_id}: ' + json.dumps(respond))
            children.extend(respond.get("results", []))
            cursor = respond.get("next_cursor")
            if not respond.get("has_more") or not cursor:
                return children

    def read_rich_text(self, blocks: List):
        """
        Extract the plain text of every rich-text item in the given blocks.

        Args:
            blocks: one list of block objects per page.

        Returns:
            list: one list of plain-text strings per page. Blocks whose type is
            not in `self.block_types` are skipped with a warning; blocks without
            a usable "rich_text" list are skipped with an error log.
        """
        total_texts = []
        for page_blocks in blocks:
            page_texts = []
            for block in page_blocks:
                block_type = block['type']
                if block_type not in self.block_types:
                    logging.warning(block_type + ' not in type list')
                    continue
                try:
                    page_texts.extend([x['plain_text'] for x in block[block_type]['rich_text']])
                except (KeyError, TypeError):
                    # `.get` keeps the logging itself from raising when the
                    # block has no payload under its type name.
                    logging.error(block_type + '|' + json.dumps(block.get(block_type)))
            total_texts.append(page_texts)
        return total_texts

In [51]:
# Reader for the configured database; nothing is fetched until read() is called.
notion_db = NotionDBText(notion_header, database_id, extra_data)

In [52]:
# Fetch pages, then their blocks, then the rich text; results land in
# notion_db.total_pages / total_blocks / total_texts.
notion_db.read()

[2023-01-29 05:26:14.616] [INFO] [298233] [410687785.py] [42] [67 pages when 2023-01-29T05:26:14.616211+00:00]
read blocks: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:26<00:00,  2.52it/s]


In [53]:
# Peek at the texts extracted from the first page.
notion_db.total_texts[0]

['外部的数据大家都可以用，怎样能把MP的独有数据利用起来？或者设计独有的数据收集？', '数据是否能够按照博弈论，分类成外部公开、半公开数据、独家数据？']

## 分析结果

In [73]:
# !pip install scikit-learn jieba -q

import pandas as pd
import jieba

# Punctuation: every Unicode character whose general category starts with "P".
import sys
from unicodedata import category
codepoints = range(sys.maxunicode + 1)
punctuation = set()
for c in map(chr, codepoints):
    if category(c).startswith("P"):
        punctuation.add(c)

# Stop words: the union of all lines of every ./stopwords/*stopwords.txt file.
from glob import glob


def _read_stopword_file(path):
    """Return the set of stripped lines of one stop-word file."""
    # `with` closes the handle; the encoding is pinned so the result does not
    # depend on the platform default.
    # NOTE(review): assumes the stop-word lists are UTF-8 — confirm.
    with open(path, "r", encoding="utf-8") as handle:
        return {line.strip() for line in handle}


stopfiles = glob("./stopwords/*stopwords.txt")
# set().union(*...) also works when no file matches; reduce() without an
# initial value raises TypeError on an empty list.
stopwords = set().union(*(_read_stopword_file(file) for file in stopfiles))

In [74]:
def check_stopwords(word):
    """Return True when `word` is a stop word, a punctuation character or all digits."""
    if word in stopwords:
        return True
    if word in punctuation:
        return True
    return word.isdigit()

### 分词、清洗、建立映射

In [75]:
from functional import seq
# Flatten the per-page text lists into one flat list of text snippets.
text_list = [snippet for page_texts in notion_db.total_texts for snippet in page_texts]
# text_list = notion_db.total_texts
# Tokenise every snippet with jieba.
split_text_list = [jieba.lcut(snippet, HMM=True) for snippet in text_list]
# Drop stop words, punctuation and pure-digit tokens from every token list.
sequence = seq(split_text_list).map(
    lambda tokens: list(filter(lambda token: not check_stopwords(token), tokens))
)

# Vocabulary: the union of the remaining tokens over all snippets.
uniqueWords = sequence.map(set).reduce(set.union)

In [76]:
# Lookup table: lower-cased word --> set of texts that contain the word.
word2sents = {}
for word in uniqueWords:
    word2sents[word.lower()] = set()

for text in text_list:
    for word in uniqueWords:
        # Substring match on the word in its original casing.
        if word not in text:
            continue
        word2sents[word.lower()].add(text)

### 使用标准tf-idf工具来分析

**TODO**

目前把所有句子混在同一个列表里计算 tf-idf，这样并不合理；应当按文档（page）分组后再计算。

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Each cleaned token list is joined with spaces so the vectorizer can treat it
# as one document.
# NOTE(review): with its default token pattern TfidfVectorizer keeps only
# tokens of two or more word characters and lower-cases them, so
# single-character words do not show up as features — confirm this is intended.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(
    sequence.map(" ".join).to_list()
)
feature_names = vectorizer.get_feature_names_out()
# Densify the matrix: one row per text, one column per feature name.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(
    denselist,
    columns=feature_names,
)

In [78]:
# Write the result to a file: the maximum tf-idf score of every term over all
# texts, sorted from the highest score to the lowest.
df.max(axis=0).sort_values(key=lambda x: -x).to_csv("./tf_idf_topic.csv")

In [79]:
# Inspect the three top-scoring terms together with the texts they occur in.
for word in df.max(axis=0).sort_values(key=lambda x: -x).head(3).index:
    print(word)
    # The vectorizer tokenises the joined text again, so a feature name is not
    # guaranteed to be a key of word2sents; fall back to an empty set instead
    # of raising KeyError.
    print(word2sents.get(word, set()))
    print("-" * 10)

untitled
{'Untitled'}
----------
节奏
{'进入节奏'}
----------
分享
{'分享'}
----------


### 自定义(不是tf*idf)

In [47]:
# Vocabulary: the union of the cleaned tokens over all texts.
token_sets = sequence.map(set)
uniqueWords = token_sets.reduce(set.union)

In [49]:
def computeTF(wordDict, bagOfWords):
    """
    Compute the term frequency of every word of one document.

    Args:
        wordDict: mapping word -> number of occurrences in the document.
        bagOfWords: the tokens of the document; only its length is used.

    Returns:
        dict: word -> count / len(bagOfWords). For an empty bag (for example a
        text that consisted only of stop words) every frequency is 0.0 instead
        of raising ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

In [50]:
def computeIDF(documents):
    """
    Compute the inverse document frequency of every word.

    Args:
        documents: list of mappings word -> count, one mapping per document.

    Returns:
        dict: word -> log(N / document frequency), where N is the number of
        documents and the document frequency is the number of documents in
        which the word has a count greater than zero. Words from every
        document are covered, not only those of the first one. A word that
        occurs in no document gets 0.0 instead of raising ZeroDivisionError,
        and an empty `documents` list yields an empty dict.
    """
    import math

    N = len(documents)
    if N == 0:
        return {}

    # Document frequency per word; setdefault registers words that are missing
    # from the first document and words whose count is 0 everywhere.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    return {
        word: math.log(N / float(seen)) if seen > 0 else 0.0
        for word, seen in docFreq.items()
    }

## test