In [1]:
import requests
import arrow
import pprint
import json
from urllib.parse import urlencode
from functools import reduce
from tqdm import tqdm

In [2]:
import logging
from utils.log import config_log

# Configure logging through the project-local helper `utils.log.config_log`.
# NOTE(review): the meaning of each argument is defined in utils/log.py, which is
# not visible from this notebook. Presumably the two positional strings name the
# project/module, `log_root` is the directory log files are written to, and
# `print_terminal=True` echoes records to the notebook output — TODO confirm.
config_log(
        "notion_api",
        "DBtexts",
        log_root='./logs',
        print_terminal=True,
        enable_monitor=False,
    )

## 环境变量

In [3]:
# Integration tokens are created at https://www.notion.so/my-integrations/
# The token is read from the first line of a local file. It is stripped because a
# trailing newline would end up inside the Authorization header, and the file is
# opened in a `with` block so the handle is closed.
with open("./NOTION_TOKEN", "r") as token_file:
    token = token_file.readline().strip()
if not token:
    raise ValueError("./NOTION_TOKEN is empty: expected the integration token on the first line")

# API version sent with every request (an earlier revision used "2021-08-16").
# NOTE(review): the block parser in this notebook reads the `rich_text` key, which
# presumably requires a recent API version — confirm before downgrading.
notion_version = "2022-06-28"

# Headers shared by every Notion API request made in this notebook.
notion_header = {
    "Authorization": f"Bearer {token}",
    "Notion-Version": notion_version,
    "Content-Type": "application/json",
}

In [4]:
# ID of the Notion database to read.
database_id = 'a2594f51053a47b3a58a171017ea0435'

# Query filter. "Label" is a multi-select property of the database above; a page
# must satisfy both conditions to be returned.
_label_not_empty = {"property": "Label", "multi_select": {"is_not_empty": True}}
_label_contains = {"property": "Label", "multi_select": {"contains": "思考"}}

extra_data = {"filter": {"and": [_label_not_empty, _label_contains]}}

## 读取数据库

In [5]:
from typing import List
class NotionDBText:
    """
    Read the plain text of every rich-text block of every page in a Notion database.

    After calling ``read()`` the results are available as:

    * ``total_pages``  - page objects returned by the database query,
    * ``total_blocks`` - one list of child blocks per page,
    * ``total_texts``  - one list of plain-text strings per page.
    """
    def __init__(self, notion_header: dict, database_id: str, extra_data: dict = None):
        """
        Args:
            notion_header: HTTP headers sent with every request.
            database_id: ID of the database to query.
            extra_data: optional JSON body for the database query (e.g. a filter).
                It is never modified by this class.
        """
        self.header = notion_header
        self.database_id = database_id
        # A new dict per instance: a mutable default argument would be shared
        # between every instance created without `extra_data`.
        self.extra_data = {} if extra_data is None else extra_data
        # Three independent lists (`[[]] * 3` would bind all three names to one list).
        self.total_texts, self.total_blocks, self.total_pages = [], [], []
        # Block types whose payload is inspected for a `rich_text` list.
        self.block_types = ["paragraph", "bulleted_list_item", "numbered_list_item",
                            "toggle", "to_do", "quote",
                            "callout", "synced_block", "template",
                            "column", "child_page", "child_database", "table",
                            "heading_1", "heading_2", "heading_3"]

    def read(self):
        """Run the whole pipeline: pages -> blocks -> texts, storing each stage on the instance."""
        self.total_pages = self.read_pages()
        self.total_blocks = self.read_blocks(self.total_pages)
        self.total_texts = self.read_rich_text(self.total_blocks)

    def read_pages(self):
        """
        Query the database and return every page, following pagination cursors.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        total_pages = []
        next_cursor = None
        while True:
            # Build the body per request, so the cursor never leaks into
            # self.extra_data (a stale cursor would make the next run start mid-way).
            payload = dict(self.extra_data)
            if next_cursor:
                payload['start_cursor'] = next_cursor
            r_database = requests.post(
                url=f"https://api.notion.com/v1/databases/{self.database_id}/query",
                headers=self.header,
                data=json.dumps(payload),
            )
            # Fail with the HTTP error instead of a KeyError on 'results' below.
            r_database.raise_for_status()
            respond = json.loads(r_database.text)
            total_pages.extend(respond["results"])
            next_cursor = respond.get('next_cursor')
            # Stop when there is no further page; the cursor check prevents an
            # endless loop if `has_more` is set without a cursor.
            if not respond.get('has_more') or not next_cursor:
                break
        logging.info(f'{len(total_pages)} pages when {arrow.now()}')
        return total_pages

    def read_blocks(self, pages: List):
        """
        Return, for every page, the list of its direct child blocks.

        Pagination is followed so pages with more blocks than fit in one response
        are read completely. A response without "results" (e.g. an error object)
        contributes no blocks, and a warning is logged.
        """
        total_blocks = []
        for page in tqdm(pages, desc='read blocks'):
            page_id = page["id"]
            page_blocks = []
            next_cursor = None
            while True:
                params = {"start_cursor": next_cursor} if next_cursor else None
                r_page = requests.get(
                    url=f"https://api.notion.com/v1/blocks/{page_id}/children",
                    headers=self.header,
                    params=params,
                )
                respond = json.loads(r_page.text)
                if "results" not in respond:
                    logging.warning(f'no blocks returned for page {page_id}')
                page_blocks.extend(respond.get("results", []))
                next_cursor = respond.get("next_cursor")
                if not respond.get("has_more") or not next_cursor:
                    break
            total_blocks.append(page_blocks)
        return total_blocks

    def read_rich_text(self, blocks: List):
        """
        Extract the ``plain_text`` of every rich-text item, one list per page.

        Blocks whose type is not in ``self.block_types`` are skipped with a warning.
        Blocks of a known type that carry no usable ``rich_text`` list are logged
        as errors and skipped.
        """
        total_texts = []
        for page_blocks in blocks:
            page_texts = []
            for block in page_blocks:
                block_type = block['type']
                if block_type not in self.block_types:
                    logging.warning(block_type + ' not in type list')
                    continue
                try:
                    # Build the list first so a malformed item adds nothing at all.
                    texts = [x['plain_text'] for x in block[block_type]['rich_text']]
                except (KeyError, TypeError):
                    logging.error(block_type + '|' + json.dumps(block.get(block_type)))
                    continue
                page_texts.extend(texts)
            total_texts.append(page_texts)
        return total_texts

In [6]:
# Build the reader for the configured database; no request is sent until read() is called.
notion_db = NotionDBText(notion_header, database_id, extra_data)

In [7]:
# Fetch pages, then their child blocks, then extract the plain text (one HTTP request per page for blocks).
notion_db.read()

[2023-01-29 15:36:48.323] [INFO] [339943] [410687785.py] [42] [49 pages when 2023-01-29T15:36:48.323805+00:00]
read blocks: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:14<00:00,  3.29it/s]
[2023-01-29 15:37:03.246] [ERROR] [339943] [410687785.py] [73] [synced_block|{"synced_from": null}]


In [8]:
# Spot-check: show the texts extracted from the last page that was read.
notion_db.total_texts[-1]

['不对，要削减的是中产，奢侈品与底层关系不大',
 '关于社会核心本质的思考',
 '控制、稳定',
 '奢侈品是为了小成本消耗生产力，金钱，让底层资产消耗掉，继续劳作，保持社会结构稳定']

## 分析结果

In [9]:
# !pip install -U pkuseg

In [10]:
# !wget https://github.com/lancopku/pkuseg-python/releases/download/v0.0.25/default_v2.zip

In [11]:
# import pkuseg
# from pathlib import Path

# seg = pkuseg.pkuseg(model_name=Path('./pkuseg_model'))  # 程序会自动下载所对应的细领域模型

# text = seg.cut('我爱北京天安门')              # 进行分词
# print(text)

In [12]:
# !pip install scikit-learn jieba -q

import pandas as pd
import jieba

# Punctuation: every Unicode character whose general category starts with "P".
import sys
from unicodedata import category
codepoints = range(sys.maxunicode + 1)
punctuation = {char for char in map(chr, codepoints) if category(char).startswith("P")}

# Stopwords: union of all lines of every matching file under ./stopwords.
from glob import glob
stopfiles = glob("./stopwords/*stopwords.txt")
if not stopfiles:
    # An empty set is a valid (if useless) result; make the situation visible.
    logging.warning("no stopword file matched ./stopwords/*stopwords.txt")
stopwords = set()
for stopfile in stopfiles:
    # Explicit encoding: the lists contain Chinese text and the platform default
    # is not UTF-8 everywhere. The `with` block closes each file.
    with open(stopfile, "r", encoding="utf-8") as stopword_file:
        stopwords.update(line.strip() for line in stopword_file)

In [13]:
def check_stopwords(word: str):
    """
    Tell whether a token should be discarded.

    Returns True when the token is a known stopword, consists only of digits,
    is a single punctuation character, or is empty / whitespace-only.
    """
    if word in stopwords:
        return True
    if word.isdigit():
        return True
    if word in punctuation:
        return True
    return len(word.strip()) == 0

def check_sentence_available(text: str):
    """
    Tell whether a sentence should be kept for analysis.

    Sentences starting with "#" are rejected because they were probably entered
    as tags; everything else is accepted.
    """
    is_tag_line = text.startswith("#")
    return not is_tag_line

### 分词、清洗、建立映射

In [14]:
from functional import seq
# Flatten the per-page texts into one list, keeping only the usable sentences.
text_list = []
for item in notion_db.total_texts:
    text_list.extend(sentence for sentence in item if check_sentence_available(sentence))

# Tokenize every sentence with jieba.
split_text_list = list(map(lambda sentence: jieba.lcut(sentence, HMM=True), text_list))

# Remove stopwords from each tokenized sentence.
sequence = seq(split_text_list).map(
    lambda sent: list(filter(lambda word: not check_stopwords(word), sent))
)

# Vocabulary: the union of the tokens of all sentences.
uniqueWords = sequence.map(set).reduce(lambda seen, words: seen | words)

Building prefix dict from the default dictionary ...
[2023-01-29 15:37:04.181] [DEBUG] [339943] [__init__.py] [113] [Building prefix dict from the default dictionary ...]
Loading model from cache /tmp/jieba.cache
[2023-01-29 15:37:04.190] [DEBUG] [339943] [__init__.py] [132] [Loading model from cache /tmp/jieba.cache]
Loading model cost 0.825 seconds.
[2023-01-29 15:37:05.015] [DEBUG] [339943] [__init__.py] [164] [Loading model cost 0.825 seconds.]
Prefix dict has been built successfully.
[2023-01-29 15:37:05.022] [DEBUG] [339943] [__init__.py] [166] [Prefix dict has been built successfully.]


In [15]:
# Lookup table: lower-cased word -> set of sentences that contain the word.
word2sents = dict()
for word in uniqueWords:
    word2sents[word.lower()] = set()

for word in uniqueWords:
    matching_sents = word2sents[word.lower()]
    for text in text_list:
        # Substring match on the raw sentence, using the word in its original case.
        if word in text:
            matching_sents.add(text)

### 使用标准tf-idf工具来分析

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each filtered token list with spaces so the vectorizer can split it again.
documents = sequence.map(" ".join).to_list()

# Fit tf-idf over the sentences and lay the scores out as a sentences x words table.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
denselist = vectors.todense().tolist()
df = pd.DataFrame(data=denselist, columns=feature_names)

#### 按不同统计方法逆序输出所有词的tf-idf

In [17]:
from pathlib import Path

# Output directory for the result files; created on first run, reused afterwards.
directory = Path('./results')
directory.mkdir(exist_ok=True)

In [18]:
# Mean tf-idf per word after discarding each column's maximum and minimum values.
# The comparisons are strict, so every value equal to the column minimum or
# maximum is masked out (an inclusive range check would keep all values and
# reproduce the plain mean).
is_interior = df.gt(df.min()) & df.lt(df.max())
df_drop_maxmin = df.where(is_interior)
# mean() skips the masked (NaN) cells. A word with no value strictly between its
# extremes gets NaN, which sort_values places at the end.
df_drop_maxmin.mean().sort_values(ascending=False).to_csv(directory / "tf_idf_top.drop_maxmin.csv")

In [19]:
# Highest tf-idf score each word reaches in any sentence, best first.
column_max = df.max(axis=0)
column_max.sort_values(ascending=False).to_csv(directory / "tf_idf_top.max.csv")

# Total tf-idf score of each word over all sentences, best first.
column_sum = df.sum(axis=0)
column_sum.sort_values(ascending=False).to_csv(directory / "tf_idf_top.sum.csv")

In [20]:
# Inspect the 20 words with the highest summed tf-idf: write each word as a
# markdown heading followed by the sentences that contain it, with the word in bold.
import re

top_words = df.sum(axis=0).sort_values(ascending=False).head(20).index
# Explicit encoding: the sentences are Chinese text and the platform default
# encoding is not UTF-8 everywhere.
with open(directory / "top_word_with_sents.md", "w", encoding="utf-8") as f:
    for word in top_words:
        # The vectorizer tokenizes on its own, so a feature name is not guaranteed
        # to be a key of word2sents; fall back to "no sentences" instead of KeyError.
        # Sorting makes the file identical between runs (set order is not stable).
        sents = sorted(word2sents.get(word, ()))
        # Feature names are lower-cased, so highlight case-insensitively and keep
        # the spelling found in the sentence.
        highlight = re.compile(re.escape(word), flags=re.IGNORECASE)
        f.write('### '+ word + '\n\n')
        f.write('\n\n'.join(
            highlight.sub(lambda match: f'**{match.group(0)}**', sent.replace("\n", " "))
            for sent in sents
        ) + '\n\n')

### 自定义(不是tf*idf)

In [21]:
def computeTF(wordDict, bagOfWords):
    """
    Compute the term frequency of every word of one document.

    Args:
        wordDict: mapping word -> number of occurrences in the document.
        bagOfWords: the document's tokens; only its length is used.

    Returns:
        dict mapping each word of `wordDict` to count / len(bagOfWords).
        For an empty document every frequency is 0.0, so a sentence that lost
        all its tokens to stopword removal does not raise ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

In [22]:
def computeIDF(documents):
    """
    Compute the inverse document frequency of every word.

    Args:
        documents: list of dicts, each mapping word -> count for one document.

    Returns:
        dict mapping word -> log(N / document_frequency), where N is the number
        of documents and document_frequency the number of documents in which the
        word's count is positive. Edge cases:

        * no documents: an empty dict is returned;
        * a word that appears in no document (frequency 0): its idf is 0.0
          rather than a division by zero;
        * a word missing from the first document is still counted.
    """
    import math
    N = len(documents)
    if N == 0:
        return {}

    # Seed with the first document's vocabulary so words that never occur are
    # still present in the result.
    docFreq = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                docFreq[word] = docFreq.get(word, 0) + 1

    return {
        word: math.log(N / float(freq)) if freq else 0.0
        for word, freq in docFreq.items()
    }

## test