In [12]:
import sys

sys.path.append("../")

from bunkatopics import Bunka
from bunkatopics.functions.clean_text import clean_tweet
from langchain.embeddings import HuggingFaceEmbeddings
import random
from datasets import load_dataset
import os
from dotenv import load_dotenv

load_dotenv()

True

In [20]:
# Load the tweet texts and draw a reproducible random sample for topic modeling.
SAMPLE_SIZE = 5000  # upper bound on the number of tweets sampled
MIN_DOC_CHARS = 50  # cleaned tweets of this length or shorter are dropped
RANDOM_SEED = 42  # fixed so Restart & Run All selects the same tweets

dataset = load_dataset("rguo123/trump_tweets")["train"]["content"]

# A dedicated, seeded generator keeps the sample stable across runs without
# touching the global `random` state; capping k avoids a ValueError should the
# dataset hold fewer than SAMPLE_SIZE rows.
rng = random.Random(RANDOM_SEED)
tweets_raw = list(dataset)
sampled_tweets = rng.sample(tweets_raw, min(SAMPLE_SIZE, len(tweets_raw)))

# Clean each tweet, then keep only those with more than MIN_DOC_CHARS characters.
cleaned_tweets = [clean_tweet(tweet) for tweet in sampled_tweets]
full_docs = [tweet for tweet in cleaned_tweets if len(tweet) > MIN_DOC_CHARS]

In [22]:
# Sentence-embedding backend handed to Bunka as its embedding model.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Bunka instance used by every later cell (fit, topics, Bourdieu maps, RAG).
bunka = Bunka(
    embedding_model=embedding_model,
)

Choose a Generative AI Model

In [23]:
# Fit Bunka on the cleaned tweets. The log output below reports term extraction,
# document embedding and dimensionality reduction during this call.
bunka.fit(full_docs)

# Topic Modeling
# Request 10 clusters, then build and display the topic figure at 800x800.
df_topics = bunka.get_topics(n_clusters=10)
topic_fig = bunka.visualize_topics(width=800, height=800)
topic_fig.show()

[1mINFO      [0m|[33m2023-10-02 11:01:16[0m|[35m{}[0m|[34mfit[0m|[1mExtracting Terms[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████| 4081/4081 [00:24<00:00, 163.73it/s]
[1mINFO      [0m|[33m2023-10-02 11:01:42[0m|[35m{}[0m|[34mfit[0m|[1mEmbedding Documents, this may take few minutes[0m
[1mINFO      [0m|[33m2023-10-02 11:01:53[0m|[35m{}[0m|[34mfit[0m|[1mReducing Dimensions[0m


Choose a Generative AI Model

In [24]:
from langchain.llms import OpenAI

# OpenAI-backed LLM wrapper; the key comes from the OPEN_AI_KEY environment
# variable (os.getenv yields None when that variable is unset).
open_ai_generative_model = OpenAI(
    openai_api_key=os.getenv("OPEN_AI_KEY"),
)

In [25]:
from langchain.llms import LlamaCpp

# Settings for the local llama.cpp model; the weights file location is read
# from the MODEL_PATH environment variable.
llama_settings = dict(
    model_path=os.getenv("MODEL_PATH"),
    n_ctx=2048,
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    verbose=False,
)
generative_model = LlamaCpp(**llama_settings)

# Also switch off verbosity on the underlying client object.
generative_model.client.verbose = False

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Users/charlesdedampierre/Desktop/llama.cpp/models/Mistral/ggml-model-q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn

In [26]:
# Topic Modeling Clean
# Pass the OpenAI-backed model to get_clean_topic_name (the progress bar below
# reads "Creating new labels for clusters"), then rebuild and show the topic figure.
# NOTE(review): this overwrites the df_topics assigned in the earlier topic-modeling cell.
df_topics = bunka.get_clean_topic_name(generative_model=open_ai_generative_model)
topic_fig_clean = bunka.visualize_topics(width=800, height=800)
topic_fig_clean.show()

Creating new labels for clusters: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.25it/s]


In [27]:
# One-dimensional Bourdieu figure between a "negative"/"bad" pole and a
# "positive" pole (presumably documents are positioned by similarity to each
# word list — confirm against the bunkatopics documentation).
fig_solo = bunka.visualize_bourdieu_one_dimension(
    right=["positive"],
    left=["negative", "bad"],
    explainer=False,
    height=600,
    width=600,
)
fig_solo.show()

In [29]:
# Two-axis Bourdieu map: "war" vs "peace" on the x axis, "women" vs "men" on
# the y axis. Clustering is enabled with 10 topic clusters, and the
# OpenAI-backed model is supplied together with topic_gen_name=True
# (presumably to generate the cluster labels — confirm).
bourdieu_fig = bunka.visualize_bourdieu(
    # Axis definitions
    x_left_words=["war"],
    x_right_words=["peace"],
    y_bottom_words=["women"],
    y_top_words=["men"],
    # Topic clustering and naming
    clustering=True,
    topic_n_clusters=10,
    topic_terms=5,
    topic_top_terms_overall=500,
    topic_gen_name=True,
    generative_model=open_ai_generative_model,
    # Figure layout
    width=800,
    height=800,
    label_size_ratio_label=50,
    display_percent=True,
)
bourdieu_fig.show()

Creating new labels for clusters: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.26it/s]


In [41]:
# Question answering over the fitted documents with the local LlamaCpp model.
# Later cells read res["result"] and res["source_documents"]; with top_doc=5
# the source-documents cell prints five passages.
res = bunka.rag_query(
    query="What are the main fight of Donald Trump ?",
    generative_model=generative_model,
    top_doc=5,
)

[1mINFO      [0m|[33m2023-10-02 11:20:21[0m|[35m{}[0m|[34mrag_query[0m|[1mAnswering your query, please wait a few seconds[0m


In [49]:
# Show the generated answer stored under the "result" key of the response.
answer = res["result"]
print(answer)

 The main fight of Donald Trump in the presidential elections of 2016 was against Hillary Clinton. He believed he was the best candidate for president and was able to beat many other candidates in the field due to his fame and political opinions.


In [55]:
# Print the text of every source document returned with the answer, trimmed of
# leading and trailing whitespace.
for source_doc in res["source_documents"]:
    print(source_doc.page_content.strip())

what do you say donald  run for president
why only donald trump can beat hillary
via    donald trump on who he likes for president  donald trump
if the 2016  presidential field is so deep  why is donaldtrump beating so many of their  stars
donald trump is a respected businessman with insightful political opinions
