In [None]:
import bertopic as bt
import dotenv
import hydra
import openai
from sentence_transformers import SentenceTransformer
import umap

import bookmarks_topics._common as C
from bookmarks_topics.topics import truncate_doc

In [107]:
hydra.core.global_hydra.GlobalHydra.instance().clear()
hydra.initialize(config_path='../conf', version_base="1.3")
cfg = hydra.compose(config_name='config')

In [None]:
dotenv.load_dotenv()

In [None]:
websites = C.from_pickle(cfg.topics.input_path)
titles = [w.title for w in websites]
content = [truncate_doc(x, cfg.topics.truncate) for x in websites]
docs = [a + b for a, b in zip(titles, content)]

In [None]:
embedding_model = SentenceTransformer(
    model_name_or_path="thenlper/gte-small",
    trust_remote_code=True,
)

umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=10,
    metric="cosine",
)

ctfidf_model = bt.vectorizers.ClassTfidfTransformer(
    reduce_frequent_words=True,
)

keybert_model = bt.representation.KeyBERTInspired(
    top_n_words=10,
    nr_repr_docs=10,
    nr_samples=500,
    nr_candidate_words=100,
    random_state=cfg.seed,
)

prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a 1-2 word topic label in the following format:
topic: <topic label>
"""


client = openai.OpenAI(api_key=os.environ["OPENAI_KEY"])
openai_model = bt.representation.OpenAI(
    client, 
    model="gpt-3.5-turbo", 
    delay_in_seconds=0.2, 
    chat=True,
    nr_docs=10,
    prompt=prompt,
)

representation_model = [
    keybert_model,
    openai_model,
]

topic_model = bt.BERTopic(
    top_n_words=10,
    n_gram_range=(1,1),
    min_topic_size=5,
    embedding_model=embedding_model,
    umap_model=umap_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

In [None]:
topic_model.fit(docs)

In [None]:
topics = topic_model.get_topics()
topics

In [None]:
topic_names = [topics[t][0][0] for t in topic_model.topics_]

In [None]:
bookmark_topics = [
    C.Topic(*a, b) for a, b in zip(websites, topic_names)
]

In [None]:
result = [
    {
        "url": t.url,
        "title": t.title,
        "topic": t.topic,
    }
    for t in bookmark_topics
]

In [None]:
C.to_json(
    result,
    "bookmark_topics_20241118-1700.json",
)

In [None]:
embedding_model = SentenceTransformer(
    model_name_or_path="thenlper/gte-small",
    trust_remote_code=True,
)

umap_model = umap.UMAP(
    n_neighbors=5,
    n_components=10,
    metric="cosine",
)

ctfidf_model = bt.vectorizers.ClassTfidfTransformer(
    reduce_frequent_words=True,
)

keybert_model = bt.representation.KeyBERTInspired(
    top_n_words=10,
    nr_repr_docs=10,
    nr_samples=500,
    nr_candidate_words=100,
    random_state=cfg.seed,
)

prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a 1-2 word topic label in the following format:
topic: <topic label>

Note that the topic label should be formatted in Title Case.
"""


client = openai.OpenAI(api_key=os.environ["OPENAI_KEY"])
openai_model = bt.representation.OpenAI(
    client, 
    model="gpt-3.5-turbo", 
    delay_in_seconds=0.2, 
    chat=True,
    nr_docs=10,
    prompt=prompt,
)

representation_model = [
    keybert_model,
    openai_model,
]

topic_model = bt.BERTopic(
    top_n_words=10,
    n_gram_range=(1,1),
    min_topic_size=5,
    embedding_model=embedding_model,
    umap_model=umap_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

In [None]:
topic_model.fit(docs)

In [None]:
topics = topic_model.get_topics()
topics

In [None]:
topic_names = [topics[t][0][0] for t in topic_model.topics_]

bookmark_topics = [
    C.Topic(*a, b) for a, b in zip(websites, topic_names)
]

result = [
    {
        "url": t.url,
        "title": t.title,
        "topic": t.topic,
    }
    for t in bookmark_topics
]

C.to_json(
    result,
    "bookmark_topics_20241118-1701.json",
)

In [None]:
def to_html(bookmarks, output_file="new_bookmarks.html"):
    # Start HTML structure for bookmarks file
    html_content = """<!DOCTYPE NETSCAPE-Bookmark-file-1>
    <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
    <TITLE>Bookmarks</TITLE>
    <H1>Bookmarks</H1>
    <DL><p>\n"""

    # Organize bookmarks by folder (topic)
    folders = {}
    for bookmark in bookmarks:
        folder = bookmark.get("topic", "Miscellaneous")
        if folder not in folders:
            folders[folder] = []
        folders[folder].append(bookmark)

    # Convert folders and bookmarks to HTML format
    for folder, bookmarks in folders.items():
        html_content += f"<DT><H3>{folder}</H3>\n<DL><p>\n"
        for bookmark in bookmarks:
            url = bookmark["url"]
            title = bookmark["title"]
            html_content += f'    <DT><A HREF="{url}">{title}</A>\n'
        html_content += "</DL><p>\n"

    html_content += "</DL><p>"

    # Write HTML content to output file
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(html_content)

In [None]:
to_html(result)

In [None]:
for key, value in topics.items()

In [None]:
len([x for x in bookmark_topics if x.topic.lower() == "data analysis"])

In [103]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,329,-1_Python Libraries,[Python Libraries],[lightning — lightning dev documentationlightn...
1,0,57,0_Data Visualization,[Data Visualization],[Force layout | D3 in DepthD3 Force layout D3 ...
2,1,45,1_Stock Market,[Stock Market],[SPDR - ETF HomepageSPDR Exchange Traded Funds...
3,2,42,2_Documentation Pages,[Documentation Pages],[Plotting — hvPlot 0.8.5a1 documentationPage n...
4,3,37,3_Docker Containers,[Docker Containers],[How To Install Anaconda on Ubuntu 18.04 [Quic...
...,...,...,...,...,...
124,123,6,123_Static Type Checking,[Static Type Checking],[MonkeyType — MonkeyType 19.11.3.dev1 document...
125,124,6,124_Jupyter Notebook Conversion,[Jupyter Notebook Conversion],[Using as a command line tool — nbconvert 7.16...
126,125,6,125_Time Series Forecasting,[Time Series Forecasting],[Prophet | Prophet is a forecasting procedure ...
127,126,6,126_Shell Scripting,[Shell Scripting],[Usage | TaskUsage | Task Create a file called...


In [None]:
topic_model.visualize_topics([-1])

In [105]:
topic_model.save('fit_topic_model', "pytorch")

In [106]:
print(prompt)


I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a 1-2 word topic label in the following format:
topic: <topic label>

Note that the topic label should be formatted in Title Case.



In [112]:
openai_model.prompts_

["\nI have a topic that contains the following documents: \n- openpyxl - A Python library to read/write Excel 2010 xlsx/xlsm files — openpyxl 2.5.12 documentationopenpyxl - A Python library to read/write Excel 2010 xlsx/xlsm files — openpyxl 3.1.3 documentation Eric Gazoni, Charlie Clark https://foss.heptapod.net/openpyxl/openpyxl https://foss.heptapod.net/openpyxl/openpyxl/-/issues May 29, 2024 MIT/Expat 3.1.3 openpyxl is a Python library to read/write Excel 2010 xlsx/xlsm/xltx/xltm files. It was born from lack of existing library to read/write natively from Python the Office Open XML format. All kudos to the PHPExcel team as openpyxl was initially base\n- Welcome to the SHAP documentation — SHAP latest documentationWelcome to the SHAP documentation — SHAP latest documentation Introduction Examples Reference Development SHAP (SHapley Additive exPlanations) is a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local

In [113]:
with open('prompt.txt', 'w') as fp:
    fp.write(prompt)