In [None]:
import sys

# Make the parent directory importable. This has to run before the
# `bunkatopics` import below (presumably so the local checkout of the
# package is picked up -- confirm against the repository layout).
sys.path.append("../")

from bunkatopics import Bunka
from bunkatopics.functions.clean_text import clean_tweet
from langchain.embeddings import HuggingFaceEmbeddings
import random
from datasets import load_dataset
import os
from dotenv import load_dotenv

# Load settings from a .env file into the environment; later cells read
# OPEN_AI_KEY and MODEL_PATH through os.getenv.
load_dotenv()

In [None]:
# Sampling / filtering parameters, gathered here so they are easy to tune.
SAMPLE_SIZE = 3000      # maximum number of tweets to keep for the demo
SAMPLE_SEED = 42        # fixed seed so "Restart & Run All" gives the same corpus
MIN_DOC_LENGTH = 50     # cleaned tweets must be longer than this many characters

# Load the training split and materialize the text column as a plain list so
# that random sampling gets a real sequence.
dataset = load_dataset("rguo123/trump_tweets")["train"]
raw_docs = list(dataset["content"])

# Use a dedicated, seeded generator: the sample is reproducible and the global
# `random` state is left untouched. Clamp the size so a dataset with fewer
# than SAMPLE_SIZE rows does not raise ValueError.
sampler = random.Random(SAMPLE_SEED)
sampled_docs = sampler.sample(raw_docs, min(SAMPLE_SIZE, len(raw_docs)))

# Clean every tweet, then keep only those strictly longer than MIN_DOC_LENGTH.
cleaned_docs = [clean_tweet(doc) for doc in sampled_docs]
full_docs = [doc for doc in cleaned_docs if len(doc) > MIN_DOC_LENGTH]

In [None]:
# Name of the Hugging Face embedding model handed to Bunka.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)
bunka = Bunka(
    embedding_model=embedding_model,
)

Fit the model and extract topics

In [None]:
# Number of clusters requested from the topic model.
N_TOPIC_CLUSTERS = 10

# Fit Bunka on the prepared documents, then request the topics.
bunka.fit(full_docs)
df_topics = bunka.get_topics(
    n_clusters=N_TOPIC_CLUSTERS,
)


Choose a Generative AI Model

In [None]:
# Option 1: hosted OpenAI model. The key is read from the OPEN_AI_KEY
# environment variable and passed through unchanged.
from langchain.llms import OpenAI

open_ai_generative_model = OpenAI(openai_api_key=os.getenv("OPEN_AI_KEY"))

# Option 2: local llama.cpp model whose weights file is named by MODEL_PATH.
from langchain.llms import LlamaCpp

llama_model_path = os.getenv("MODEL_PATH")
if not llama_model_path:
    # Fail fast with an actionable message instead of handing None to
    # LlamaCpp and getting an opaque error from inside the library.
    raise EnvironmentError(
        "MODEL_PATH is not set: define it in the environment or in the .env "
        "file so it points at the local llama.cpp model file."
    )

generative_model = LlamaCpp(
    model_path=llama_model_path,
    n_ctx=2048,
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    verbose=False,
)
# Also switch off verbosity on the wrapped client object.
generative_model.client.verbose = False

In [None]:
# Topic modeling with cleaned names: pass the OpenAI model to Bunka and keep
# the returned topics table.
df_topics = bunka.get_clean_topic_name(
    generative_model=open_ai_generative_model,
)

# Build the 800x800 topic figure and display it.
topic_fig_clean = bunka.visualize_topics(
    width=800,
    height=800,
)
topic_fig_clean.show()

In [None]:
# Word lists passed as the two ends of the one-dimensional axis.
left_words = ["negative", "bad"]
right_words = ["positive"]

fig_solo = bunka.visualize_bourdieu_one_dimension(
    left=left_words,
    right=right_words,
    width=600,
    height=600,
    explainer=False,
)
fig_solo.show()

In [None]:
# Words defining the two axes of the Bourdieu map.
axis_words = dict(
    x_left_words=["war"],
    x_right_words=["peace"],
    y_top_words=["men"],
    y_bottom_words=["women"],
)

# Figure size and labelling options.
layout_options = dict(
    height=800,
    width=800,
    label_size_ratio_label=50,
    display_percent=True,
)

# Clustering / topic options forwarded to the same call.
topic_options = dict(
    clustering=True,
    topic_n_clusters=10,
    topic_terms=5,
    topic_top_terms_overall=500,
    topic_gen_name=True,
)

bourdieu_fig = bunka.visualize_bourdieu(
    generative_model=open_ai_generative_model,
    **axis_words,
    **layout_options,
    **topic_options,
)
bourdieu_fig.show()

In [None]:
# Question sent to the RAG query; the local LlamaCpp model is the one passed in.
rag_question = "Who is Donald Trump"

res = bunka.rag_query(
    query=rag_question,
    generative_model=generative_model,
    top_doc=5,
)

In [None]:
# Print the "result" entry of the RAG response.
rag_answer = res["result"]
print(rag_answer)

In [None]:
# Print the whitespace-stripped content of each source document in the response.
for source_doc in res["source_documents"]:
    print(source_doc.page_content.strip())