In [1]:
import requests
import arrow
import pprint
import json
from urllib.parse import urlencode
from functools import reduce

In [10]:
# Read the Notion integration token from a local file so it is never
# hard-coded in the notebook. Only the first line is used; surrounding
# whitespace (notably the trailing newline) is stripped because the value is
# later concatenated into the "Authorization" HTTP header.
with open("./NOTION_TOKEN", "r") as token_file:
    token = token_file.readline().strip()
if not token:
    raise ValueError("./NOTION_TOKEN is empty; expected the Notion API token on the first line")

# Value sent in the "Notion-Version" header of every request below.
notion_version = "2021-08-16"

In [11]:
# Body of the database query: a compound "and" filter with a single
# condition, requiring the multi-select property "标签" ("tags") to be
# non-empty.
extra_data = {
    "filter": {
        "and": [
            {
                "property": "标签",
                "multi_select": {"is_not_empty": True},
            },
        ],
    },
}

In [12]:
# POST the filter to the database query endpoint. This issues a single
# request; if the API reports further result pages they are not fetched here.
r_database = requests.post(
    url="https://api.notion.com/v1/databases/cecf4bb039dc46bca130a29a9db58906/query",
    headers={"Authorization": "Bearer " + token,
             "Notion-Version": notion_version,
             "Content-Type": "application/json",
             },
    data=json.dumps(extra_data),
    # Without a timeout a stalled connection would block the notebook forever.
    timeout=30,
)
# Surface HTTP errors (bad token, wrong database id, ...) right here instead
# of as a KeyError when the response body is indexed in a later cell.
r_database.raise_for_status()

In [13]:
# Decode the JSON body of the database query response into a dict.
respond = r_database.json()

In [14]:
def take_page_plain_text(respond: dict):
    """Yield the plain text of every text fragment found on the queried pages.

    For each entry in ``respond["results"]`` the page id is taken from the
    last dash-separated component of the entry's ``url``, and the page's
    child blocks are fetched from the Notion blocks endpoint. All pages of
    the child listing are read by following ``has_more`` / ``next_cursor``.

    Within each block, every dict-valued field that contains a ``"text"``
    key is treated as a list of text objects, and each object's
    ``"plain_text"`` is yielded. Only direct children are read; nested
    blocks are not traversed.

    Args:
        respond: Decoded database query response; must contain a
            ``"results"`` list whose items each have a ``"url"`` string.

    Yields:
        str: One ``plain_text`` value per text object, in response order.

    Note:
        A response body without a ``"results"`` key (for example an error
        payload) contributes nothing and does not raise.
    """
    for result in respond["results"]:
        page_id = result["url"].split("/")[-1].split("-")[-1]
        children_url = f"https://api.notion.com/v1/blocks/{page_id}/children"
        headers = {"Authorization": f"Bearer {token}",
                   "Notion-Version": notion_version,
                   "Content-Type": "application/json",
                   }
        params = None  # first request carries no cursor
        while True:
            r_page = requests.get(
                url=children_url,
                headers=headers,
                params=params,
                timeout=30,  # avoid hanging forever on a stalled connection
            )
            payload = json.loads(r_page.text)
            for block in payload.get("results", []):
                for key in block:
                    if not isinstance(block[key], dict):
                        continue
                    if "text" not in block[key]:
                        continue
                    for text in block[key]["text"]:
                        yield text["plain_text"]
            # Keep going only while the API says there is another page and
            # actually hands back a cursor to request it with.
            next_cursor = payload.get("next_cursor")
            if not payload.get("has_more") or not next_cursor:
                break
            params = {"start_cursor": next_cursor}

In [15]:
# Drain the generator so every text fragment is held in memory as a list.
text_list = [*take_page_plain_text(respond)]

In [16]:
# Preview the first three extracted text fragments.
text_list[:3]

['别听他说什么，要看他做什么：关于总理和掏粪工只是分工不同的反驳，医院里只有高干病房，掏粪工进不去', '要点', '加了爱尔兰']

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

In [18]:
import sys
from unicodedata import category
# Every code point the interpreter can represent.
codepoints = range(sys.maxunicode + 1)

# Collect each character whose Unicode general category starts with "P"
# (the punctuation categories); used later to drop punctuation tokens.
punctuation = set()
for codepoint in codepoints:
    c = chr(codepoint)
    if category(c).startswith("P"):
        punctuation.add(c)

In [19]:
from functional import seq
# Segment every text fragment into a list of tokens with jieba (HMM enabled).
split_text_list = list(map(lambda fragment: jieba.lcut(fragment, HMM=True), text_list))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.491 seconds.
Prefix dict has been built successfully.


In [20]:
from glob import glob

# All stopword lists shipped next to the notebook.
stopfiles = glob("./stopwords/*stopwords.txt")

# Union of the stripped lines of every stopword file. Starting from an empty
# set means "no files matched" yields an empty stopword set instead of an
# error, and the context manager closes each file promptly.
stopwords = set()
for stopfile in stopfiles:
    # NOTE(review): assumes the stopword files are UTF-8 encoded — confirm.
    with open(stopfile, "r", encoding="utf-8") as stopword_lines:
        stopwords.update(line.strip() for line in stopword_lines)

In [21]:
def check_stopwords(word):
    """Return True when ``word`` should be discarded from the analysis.

    A word is discarded if it appears in the module-level ``stopwords`` set,
    appears in the module-level ``punctuation`` set, or consists solely of
    digit characters.
    """
    if word in stopwords:
        return True
    if word in punctuation:
        return True
    return word.isdigit()

In [22]:
# Per fragment, keep only the tokens that check_stopwords() does not reject.
sequence = seq(split_text_list).map(
    lambda tokens: list(filter(lambda token: not check_stopwords(token), tokens))
)

In [23]:
# Vocabulary: the union of the distinct tokens of every filtered fragment.
uniqueWords = sequence.map(set).reduce(lambda merged, words: merged | words)

In [24]:
# One (initially empty) set of source fragments per lower-cased vocabulary word.
word2sents = dict((word.lower(), set()) for word in uniqueWords)

In [25]:
# Record, for every vocabulary word, each fragment that contains it as a
# substring (substring match, not token match). Keys are lower-cased.
for text in text_list:
    for word in filter(text.__contains__, uniqueWords):
        word2sents[word.lower()].add(text)

## 现有库

In [26]:
# Fit scikit-learn's TF-IDF on the filtered fragments. Each fragment's tokens
# are joined with spaces so the vectorizer can split them again itself.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(sequence.map(lambda x: " ".join(x)).to_list())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# Dense copy of the sparse matrix: one row per fragment, one column per feature.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)



In [27]:
# For every feature take its highest score across all fragments, order the
# features from highest to lowest score, and write the result to CSV.
(
    df.max(axis=0)
    .sort_values(key=lambda scores: -scores)
    .to_csv("./tf_idf_topic.csv")
)

In [30]:
# Show the three highest-scoring features together with the fragments that
# contain them.
for word in df.max(axis=0).sort_values(key=lambda x: -x).head(3).index:
    print(word)
    # The vectorizer tokenizes the joined text on its own, so a feature is not
    # guaranteed to be one of the jieba tokens that key word2sents; fall back
    # to an empty set instead of raising KeyError.
    print(word2sents.get(word, set()))
    print("-" * 10)

0x320646e7b37d5a31f5dcef9ccff9180eeb63b004
{'0x320646e7b37d5a31f5dcef9ccff9180eeb63b004'}
----------
补充
{'CV补充', '补充'}
----------
分散
{'分散>集中', '注意力分散（大脑很难同时专注于2件事，其中一件事会倾向欲望和直觉）'}
----------


## 自定义(不是tf*idf)

In [47]:
# Vocabulary for the hand-written scoring below: the union of the distinct
# tokens of every filtered fragment.
uniqueWords = sequence.map(set).reduce(lambda merged, words: merged | words)

In [49]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of each word in one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            denominator.

    Returns:
        dict: word -> ``count / len(bagOfWords)`` as a float. When
        ``bagOfWords`` is empty every word gets ``0.0`` rather than raising
        ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # A document whose tokens were all filtered out has no term frequencies.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

In [50]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word.

    Args:
        documents: Sequence of mappings word -> count, one per document.

    Returns:
        dict: word -> ``log(N / df)``, where ``N`` is the number of documents
        and ``df`` is the number of documents in which the word's count is
        greater than zero. Words are collected from all documents, not only
        the first. A word that never has a positive count gets ``0.0``
        instead of causing a division by zero. An empty ``documents``
        yields an empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per word. Seeding with .get() lets words that are
    # absent from the first document be counted as well.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency[word] = docFrequency.get(word, 0) + (1 if val > 0 else 0)

    return {
        word: math.log(N / float(hits)) if hits else 0.0
        for word, hits in docFrequency.items()
    }