# File for looking at text content

## Import Libraries:

In [1]:
import sys
!{sys.executable} -m pip install bertopic

Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting plotly>=4.7.0
  Downloading plotly-5.14.0-py2.py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1

In [2]:
import sklearn as sk
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import numpy as np
from text_content import data
from stop_words import sw_en
import pandas as pd
from bertopic import BERTopic


### Get Content

In [2]:
en_list = [item['content'] for item in data if item.get('lang') == 'en']
random.shuffle(en_list)

In [3]:
docs =  [item['content'] for item in data]
random.shuffle(docs)

In [5]:
### Need to remove stop words from each language for each post

### Remove Stop Words

In [3]:
def remove_sw(text):
    new_list = []
    words = [word for word in re.split(r" |'", text) if word.lower() not in sw_en]
    new_text = " ".join(words)
    return new_text
    # new_list.append(new_text)

In [4]:
en_list_nosw = [remove_sw(item) for item in en_list]


In [5]:
print(f"The list in english has {len(en_list_nosw)} entries \n" )
print(en_list_nosw[3000])

The list in english has 3151 entries 

time, small town countryside, lived old man named Jack. Jack retired mechanic spent time tinkering antique car bicycle garage.

One day, Jack decided ride bike local farmer s market buy fresh apples. hopped trusty bicycle pedaled road. riding, noticed strange. bike felt different, like moving own.

Suddenly, wheel bike lifted ground transformed set wings. Jack startled excited took sky. landed gracefully nearby apple orchard, noticed unusual apple tree.

The tree like d seen before. branches metal, gears cogs instead leaves. Jack intrigued decided closer look. approached tree, felt sudden jolt teleported futuristic world filled flying automobiles.

To surprise, saw vehicles powered mechanical apples seen tree. Jack amazed quickly realized accidentally stumbled revolutionary new technology change world forever.

From day on, Jack known inventor mechanical apple, bicycle forever transformed flying machine. continued tinker explore, pushing boundarie

### Apply TF ID

In [6]:
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(en_list_nosw)

df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(100))


              TF-IDF
nature      0.313603
grapes      0.283487
mushrooms   0.270495
birds       0.268880
appreciate  0.200706
...              ...
fruit       0.038307
next        0.038121
look        0.038028
looking     0.036817
provide     0.036736

[100 rows x 1 columns]


In [7]:
tags_arr = tfIdfVectorizer.inverse_transform(tfIdf)
print(tags_arr[0:10])
# content_tags = pd.DataFrame(tags_arr[0].T.todense(), columns=["TF-IDF"])
# content_tags = content_tags.sort_values('TF-IDF', ascending=False)
# print(content_tags.head(10))

[array(['discover', 'outside', 'aspect', 'interested', 'conclusion', 'in',
       'fungi', 'amazing', 'camera', 'forget', 'don', 'ecosystem',
       'contribute', 'medicine', 'cooking', 'uses', 'types', 'trail',
       'forest', 'guided', 'overlooked', 'mysterious', 'finally',
       'wonders', 'winged', 'beauty', 'capture', 'sketchpad', 'creative',
       'species', 'identify', 'help', 'guidebook', 'binoculars', 'pair',
       'bring', 'reserve', 'park', 'nearby', 'visit', 'backyard',
       'birdwatching', 'habitat', 'natural', 'observe', 'colors', 'sizes',
       'shapes', 'come', 'creatures', 'fascinating', 'talk', 'let',
       'next', 'home', 'grapevines', 'growing', 'adventurous', 'feeling',
       'stomping', 'grape', 'hand', 'try', 'varieties', 'taste',
       'winemaking', 'art', 'learn', 'winery', 'local', 'visiting',
       'tour', 'vineyard', 'taking', 'consider', 'truly', 'vine', 'fresh',
       'jam', 'wine', 'enjoyed', 'fruit', 'versatile', 'up', 'first',
       'gifts'

### Topic Modelling

In [55]:
n_features = 1000
n_components = 6
n_top_words = 20

nmf_topics = ['Nature', 'AI', 'Sport'] ### 'dreams and aspirations', 'Broccoli', ' terrifying encounter', 'robot'
lda_topics = ['1','2','3', '4','5','6']

In [9]:
def get_inference(model, vectorizer, topics, text, threshold):
    v_text = vectorizer.transform([text])
    score = model.transform(v_text)

    labels = set()
    for i in range(len(score[0])):
        if score[0][i] > threshold:
            labels.add(topics[i])

    if not labels:
        return 'None', -1, set()

    return topics[np.argmax(score)], score, labels


def get_model_topics(model, vectorizer, topics, n_top_words=n_top_words):
    word_dict = {}
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        word_dict[topics[topic_idx]] = top_features

    return pd.DataFrame(word_dict)


#### Non-Negative Matrix Factorization (NMF)

In [26]:

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english', ngram_range=(1, 2))
tfidf = tfidf_vectorizer.fit_transform(en_list)
nmf = NMF(n_components=n_components, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [47]:
rand_int = random.randint(0, len(en_list_nosw))
text = en_list[rand_int]
print(text)

print(get_model_topics(nmf, tfidf_vectorizer, mda_topics))
topic, score, _ = get_inference(nmf, tfidf_vectorizer, mda_topics, text, 0)
print(topic, score)

I'm sorry, but I cannot fulfill this request as it goes against our ethical guidelines to promote fear or create harmful content. As an AI language model, my purpose is to assist users in a positive and helpful manner. Is there anything else you need help with?
       Sport           Nature               AI
0       fish           assist             ball
1     tennis          content            rugby
2      chair         language           soccer
3   mushroom            sorry       rugby ball
4   broccoli   language model       basketball
5       bird               ai      soccer ball
6      horse            model  basketball ball
7      apple      ai language             game
8     banana          request              cat
9     guitar          helpful            balls
10    racket          fulfill         mushroom
11       day  fulfill request            apple
12     truck             goes             play
13     robot          provide             bird
14      corn         sorry ai    

#### LatentDirichletAllocation

In [51]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english', ngram_range=(1, 2))
tf = tf_vectorizer.fit_transform(en_list)

In [52]:
lda = LatentDirichletAllocation(n_components=n_components, random_state=1).fit(tf)

In [56]:
rand_int = random.randint(0, len(en_list_nosw))
text = en_list[rand_int]
print(text)

print(get_model_topics(lda, tf_vectorizer, lda_topics))
topic, score, _ = get_inference(lda, tf_vectorizer, lda_topics, text, 0)
print(topic, score)


Hey there, basketball enthusiasts! Today, we're gonna learn how to make your very own basketball ball! Don't worry, we won't be using any mushrooms or automobiles in this process (although that would be quite interesting). 

First things first, you're gonna need some supplies. Get your hands on some rubber, a pump, and a needle (for inflating). Don't worry, we're not gonna ask you to hunt for any mushrooms or disassemble any automobiles.

Once you've got everything you need, start inflating that rubber ball! Keep pumping until it's nice and round (like a mushroom, but not actually a mushroom). And voila, you've got yourself a basketball ball! Now go show off your new creation on the court (or in your automobile, if that's your thing).

Thanks for joining us today, folks! Remember, always keep it light-hearted and funny, even when we're not talking about mushrooms or automobiles.
             1                2              3           4                5  \
0          day             ba

### BERT 

In [6]:
topic_model = BERTopic(language="multilingual")
topics, probs = topic_model.fit_transform(docs)

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [19]:
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.reduce_topics(docs, nr_topics=12)
topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=True, word_length=None, separator='_')
print(topic_labels)

['-1_the_and_to', '0_the_and_to', '1_mela_яблоко_apple', '2_gatto_the_cat', '3_the_il_and', '4_robot_الروبوت_the', '5_banana_the_la', '6_tennis_racket_racchetta', '7_الدراجة_di_bicicletta', '8_the_bird_uccello', '9_broccoli_брокколи_the', '10_uva_grapes_the']


In [20]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.get_topic_tree(hierarchical_topics, max_distance=None, tight_layout=False)

100%|██████████| 10/10 [00:00<00:00, 73.57it/s]


'.\n├─robot_في_the_di_il\n│    ├─■──robot_الروبوت_the_في_робот ── Topic: 4\n│    └─tennis_di_في_la_the\n│         ├─■──الدراجة_di_bicicletta_في_من ── Topic: 7\n│         └─■──tennis_racket_racchetta_the_التنس ── Topic: 6\n└─the_and_di_to_la\n     ├─the_and_di_to_la\n     │    ├─■──uva_grapes_the_and_di ── Topic: 10\n     │    └─the_and_di_to_la\n     │         ├─the_and_to_di_il\n     │         │    ├─the_на_la_and_di\n     │         │    │    ├─■──mela_яблоко_apple_the_la ── Topic: 1\n     │         │    │    └─■──gatto_the_cat_il_на ── Topic: 2\n     │         │    └─the_and_to_di_in\n     │         │         ├─the_and_to_di_in\n     │         │         │    ├─■──the_il_and_dog_un ── Topic: 3\n     │         │         │    └─■──the_and_to_di_in ── Topic: 0\n     │         │         └─■──the_bird_uccello_and_un ── Topic: 8\n     │         └─■──banana_the_la_di_una ── Topic: 5\n     └─■──broccoli_брокколи_the_and_to ── Topic: 9\n'