In [1]:
import sys

# Add the parent directory to the import path before the bunkatopics imports below
# (presumably so the notebook picks up the local checkout of the package — confirm).
sys.path.append("../")

from bunkatopics import Bunka
from bunkatopics.functions.clean_text import clean_tweet
from langchain.embeddings import HuggingFaceEmbeddings
import random
from datasets import load_dataset
import os
from dotenv import load_dotenv
# NOTE(review): duplicate of the clean_tweet import a few lines up; harmless but redundant.
from bunkatopics.functions.clean_text import clean_tweet

# Load variables from a .env file into the process environment; the OpenAI key is
# read from the environment further down in the notebook.
load_dotenv()

True

In [2]:
# Load the tweets, draw a reproducible random sample, clean it, and keep only the
# tweets that are still long enough after cleaning.
SEED = 42            # fixed so a fresh kernel run draws the same sample
SAMPLE_SIZE = 2000   # upper bound on the number of tweets drawn
MIN_DOC_LENGTH = 50  # cleaned tweets must be strictly longer than this (characters)

dataset = load_dataset("rguo123/trump_tweets")["train"]

# Materialise the column as a plain list: random.sample needs a real sequence.
raw_docs = list(dataset["content"])

# A dedicated Random instance makes the draw reproducible without touching the global
# random state; min() avoids the ValueError random.sample raises when the population
# is smaller than the requested sample.
sampled_docs = random.Random(SEED).sample(raw_docs, min(SAMPLE_SIZE, len(raw_docs)))

cleaned_docs = [clean_tweet(doc) for doc in sampled_docs]
full_docs = [doc for doc in cleaned_docs if len(doc) > MIN_DOC_LENGTH]

In [3]:
%%time

#embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

bunka = Bunka(embedding_model=embedding_model)
bunka.fit(full_docs)

[1mINFO      [0m|[33m2023-10-28 19:37:44[0m|[35m{}[0m|[34mfit[0m|[1mExtracting Terms[0m
100%|███████████████████████████████████████████████████████| 1615/1615 [00:09<00:00, 163.71it/s]
[1mINFO      [0m|[33m2023-10-28 19:37:55[0m|[35m{}[0m|[34mfit[0m|[1mEmbedding Documents, this may take few minutes[0m
[1mINFO      [0m|[33m2023-10-28 19:38:10[0m|[35m{}[0m|[34mfit[0m|[1mReducing Dimensions[0m
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


CPU times: user 1min 1s, sys: 8.57 s, total: 1min 9s
Wall time: 31.7 s


In [4]:
import pandas as pd

# One row per extracted term (lemma, count, id), with exact duplicate rows dropped.
term_fields = ("lemma", "count_terms", "term_id")
test = pd.DataFrame(
    {field: [getattr(term, field) for term in bunka.terms] for field in term_fields}
).drop_duplicates()


In [5]:
# Topic Modeling
# Settings are collected in one mapping and forwarded as keyword arguments.
# NOTE(review): "name_lenght" is spelled exactly as in the original call; presumably it
# matches the library's keyword — check the signature before "correcting" it.
topic_settings = {
    "n_clusters": 20,
    "name_lenght": 5,
    "ngrams": [1, 2],
    "top_terms_overall": 2000,
    "min_count_terms": 10,
}
df_topics = bunka.get_topics(**topic_settings)

In [6]:
# Distinct topic names (set() drops repeats; the resulting order is arbitrary).
list(set(df_topics["name"]))

['unemployment | jobs | economy | cuts | tax',
 'endorsement | vets | crime | military | border',
 'attack | government | politicians | obama | mess',
 'disaster | care | cuts | tax | obama',
 'news | hillary clinton | politicians | story | person',
 'apprentice | celebrity | season | pm | tonight',
 'president | leader | country | office | vote',
 'poll | cruz | governor | obama | candidate',
 'com | honor | family | friends | today',
 'fbi | hunt | collusion | witch | information',
 'golf | hotel | course | tower | trump',
 'trump | president | donald | words | class',
 'dems | immigration | house | gop | election',
 'interview | rod | money | ratings | person',
 'ratings | women | rod | time | attack',
 'crowd | tomorrow | com | tonight | morning',
 'trade | countries | market | u | farmers',
 'thanks | book | fan | success | man',
 'champion | thanks | twitter | honor | com',
 'entrepreneurs | success | deal | work | deals']

In [7]:
from langchain.llms import OpenAI

# Resolve the API key up front so a missing key fails here with a clear message instead
# of surfacing later as an opaque client error. OPEN_AI_KEY is this notebook's variable
# name; OPENAI_API_KEY is accepted as a fallback.
openai_api_key = os.getenv("OPEN_AI_KEY") or os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "No OpenAI API key found: set OPEN_AI_KEY (or OPENAI_API_KEY) in the "
        "environment or in the .env file loaded at the top of the notebook."
    )

llm = OpenAI(openai_api_key=openai_api_key)

In [8]:
# Ask the generative model for a readable label per cluster; use_doc=False is kept
# exactly as in the original call.
df_topics = bunka.get_clean_topic_name(
    generative_model=llm,
    use_doc=False,
)
# Optional map of the relabelled topics (left disabled):
# topic_fig_clean = bunka.visualize_topics(width=800, height=800)

Creating new labels for clusters: 100%|██████████████████████████| 20/20 [00:10<00:00,  1.97it/s]


In [9]:
def clean_topic(topic_text: str) -> str:
    """Normalise a generated topic label.

    Surrounding whitespace is stripped. If what remains starts and ends with a
    double quote, the quotes are removed and the text between them is stripped
    again, so '  " Trade Policy "  ' becomes 'Trade Policy'.

    Args:
        topic_text: Raw label text.

    Returns:
        The label without surrounding whitespace or enclosing double quotes.
        A quote present on one side only is left untouched.
    """
    cleaned_string = topic_text.strip()

    # Unwrap only when both ends carry a double quote.
    if cleaned_string.startswith('"') and cleaned_string.endswith('"'):
        # Strip again: padding inside the quotes would otherwise survive.
        cleaned_string = cleaned_string[1:-1].strip()

    return cleaned_string

In [10]:
# Run every topic name through clean_topic and write the result back onto the topic.
for topic in bunka.topics:
    raw_name = topic.name
    topic.name = clean_topic(raw_name)

In [11]:
import pandas as pd

# Rebuild the topics table from the topic objects, one row per topic.dict().
topic_records = [topic.dict() for topic in bunka.topics]
df_topics = pd.DataFrame(topic_records)

In [12]:
# Distinct topic names after relabelling (set() drops repeats; order is arbitrary).
list(set(df_topics["name"]))

['Appreciation and Acknowledgement',
 'Donald Trump Presidency',
 'Political News and Figures',
 'Political Investigation',
 'Political Criticism of Obama Administration',
 "Women's Ratings Over Time",
 'Interviews and Ratings',
 'The Crowd of Tomorrow',
 'Television Programming',
 'Election Campaigns',
 'International Trade Markets',
 'Golfing at a Luxury Hotel',
 'Economic Challenges',
 'Entrepreneurial Success in Deal-Making',
 'Military and Border Security',
 'Politcal Impact of Disasters and Cuts to Care and Taxation',
 'Achieving Success',
 'Relationships and Connections',
 'Politics and Immigration',
 'Political Leadership']

In [None]:
# Not executed in this saved run (no execution count, no recorded output).
# Presumably starts the local web front end for the fitted topics, like the
# start_server_bourdieu() call further down — confirm against the library.
bunka.start_server()

In [19]:
from bunkatopics.datamodel import BourdieuQuery

# Radius used both when building the query and when drawing the map; defined once so
# the two values cannot drift apart.
RADIUS_SIZE = 0.5

# Custom axis labels.
# NOTE(review): these labels are NOT passed to visualize_bourdieu below
# (manual_axis_name=None), and they do not describe the query sentences
# (positive/negative, women/men). Align them with the query before enabling them.
manual_axis_name = {
    'x_left_name': 'hate',
    'x_right_name': 'peace',
    'y_top_name': 'past',
    'y_bottom_name': 'future',
}

# Sentences defining the two axes of the map. They are handed to the model as-is
# (presumably embedded to place the axes — confirm), so the stray words of the earlier
# wording ("about about", "a about") were removed.
boudieu_query = BourdieuQuery(
    x_left_words=["this is about positive content"],
    x_right_words=["this is about negative content"],
    y_top_words=["this is about women"],
    y_bottom_words=["this is about men"],
    radius_size=RADIUS_SIZE,
)

# Draw the map; cluster labels are generated with the LLM (topic_gen_name=True).
bourdieu_fig = bunka.visualize_bourdieu(
    generative_model=llm,
    x_left_words=boudieu_query.x_left_words,
    x_right_words=boudieu_query.x_right_words,
    y_top_words=boudieu_query.y_top_words,
    y_bottom_words=boudieu_query.y_bottom_words,
    height=1000,
    width=1000,
    display_percent=False,
    clustering=True,
    topic_n_clusters=15,
    topic_terms=5,
    topic_top_terms_overall=500,
    topic_gen_name=True,
    convex_hull=True,
    radius_size=RADIUS_SIZE,
    manual_axis_name=None,
)

# Uncomment to render the figure inline:
# bourdieu_fig.show()

Creating new labels for clusters: 100%|██████████████████████████| 15/15 [00:08<00:00,  1.76it/s]


In [20]:
# Presumably serves the Bourdieu map through the local front end. In the recorded run
# this killed the server already listening on port 3000 and started an
# npm/react-scripts development server at http://localhost:3000.
bunka.start_server_bourdieu()

Server on port 3000 is already running. Killing it...
NPM server started.

> json-display-app@0.1.0 start
> react-scripts start





[36mStarting the development server...[39m
[36m[39m
[32mCompiled successfully![39m

You can now view [1mjson-display-app[22m in the browser.

  [1mLocal:[22m            http://localhost:[1m3000[22m
  [1mOn Your Network:[22m  http://172.23.1.164:[1m3000[22m

Note that the development build is not optimized.
To create a production build, use [36mnpm run build[39m.

webpack compiled [1m[32msuccessfully[39m[22m
