# File for looking at text content

## Import Libraries:

In [1]:
import sklearn as sk
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import numpy as np
from text_content import data
from stop_words import sw_en
import pandas as pd

### Get English Content

In [2]:
# Keep only the documents tagged as English, then shuffle them.
# A dedicated fixed-seed generator is used (instead of the unseeded global
# `random.shuffle`) so the document order -- and therefore every index-based
# lookup into this list -- is the same after a kernel restart, without
# touching the state of the global `random` generator.
en_list = [item['content'] for item in data if item.get('lang') == 'en']
random.Random(42).shuffle(en_list)

### Remove Stop Words

In [3]:
def remove_sw(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on single spaces and apostrophes only, so a token that
    is glued to a newline or punctuation mark (e.g. ``"the,"``) is compared
    as-is and will not match a bare stop word. Matching is case-insensitive
    on the token side: each token is lower-cased before the membership test.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : container of str, optional
        Words to drop. Defaults to the module-level ``sw_en`` collection.

    Returns
    -------
    str
        The surviving tokens, in their original case and order, joined by
        single spaces.
    """
    if stop_words is None:
        stop_words = sw_en
    kept = [
        word
        for word in re.split(r" |'", text)
        if word.lower() not in stop_words
    ]
    return " ".join(kept)

In [4]:
# Stop-word-filtered copy of every English document, in the same order as en_list.
en_list_nosw = list(map(remove_sw, en_list))


In [5]:
# Sanity check: report the corpus size and show one filtered document.
# NOTE(review): index 3000 is hard-coded and requires at least 3001 entries.
entry_count = len(en_list_nosw)
print(f"The list in english has {entry_count} entries \n")
print(en_list_nosw[3000])

The list in english has 3151 entries 

time, small town countryside, lived old man named Jack. Jack retired mechanic spent time tinkering antique car bicycle garage.

One day, Jack decided ride bike local farmer s market buy fresh apples. hopped trusty bicycle pedaled road. riding, noticed strange. bike felt different, like moving own.

Suddenly, wheel bike lifted ground transformed set wings. Jack startled excited took sky. landed gracefully nearby apple orchard, noticed unusual apple tree.

The tree like d seen before. branches metal, gears cogs instead leaves. Jack intrigued decided closer look. approached tree, felt sudden jolt teleported futuristic world filled flying automobiles.

To surprise, saw vehicles powered mechanical apples seen tree. Jack amazed quickly realized accidentally stumbled revolutionary new technology change world forever.

From day on, Jack known inventor mechanical apple, bicycle forever transformed flying machine. continued tinker explore, pushing boundarie

### Apply TF-IDF

In [6]:
# Fit a TF-IDF model on the stop-word-filtered documents.
tfIdfVectorizer = TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(en_list_nosw)

# Inspect the first document only: one row per vocabulary term, ranked by
# that term's TF-IDF weight in document 0 (highest first).
df = pd.DataFrame(
    tfIdf[0].T.todense(),
    index=tfIdfVectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
).sort_values('TF-IDF', ascending=False)
print(df.head(100))


              TF-IDF
nature      0.313603
grapes      0.283487
mushrooms   0.270495
birds       0.268880
appreciate  0.200706
...              ...
fruit       0.038307
next        0.038121
look        0.038028
looking     0.036817
provide     0.036736

[100 rows x 1 columns]


In [7]:
# For every document, recover the vocabulary terms that have a non-zero
# entry in the TF-IDF matrix, then preview the first ten documents.
# Each element of tags_arr is an array of term strings (no weights attached).
tags_arr = tfIdfVectorizer.inverse_transform(tfIdf)
print(tags_arr[:10])

[array(['discover', 'outside', 'aspect', 'interested', 'conclusion', 'in',
       'fungi', 'amazing', 'camera', 'forget', 'don', 'ecosystem',
       'contribute', 'medicine', 'cooking', 'uses', 'types', 'trail',
       'forest', 'guided', 'overlooked', 'mysterious', 'finally',
       'wonders', 'winged', 'beauty', 'capture', 'sketchpad', 'creative',
       'species', 'identify', 'help', 'guidebook', 'binoculars', 'pair',
       'bring', 'reserve', 'park', 'nearby', 'visit', 'backyard',
       'birdwatching', 'habitat', 'natural', 'observe', 'colors', 'sizes',
       'shapes', 'come', 'creatures', 'fascinating', 'talk', 'let',
       'next', 'home', 'grapevines', 'growing', 'adventurous', 'feeling',
       'stomping', 'grape', 'hand', 'try', 'varieties', 'taste',
       'winemaking', 'art', 'learn', 'winery', 'local', 'visiting',
       'tour', 'vineyard', 'taking', 'consider', 'truly', 'vine', 'fresh',
       'jam', 'wine', 'enjoyed', 'fruit', 'versatile', 'up', 'first',
       'gifts'

### Topic Modelling

#### Non-Negative Matrix Factorization (NMF)

In [26]:
# Topic-model settings.
n_features = 1000    # cap on the TF-IDF vocabulary size (max_features)
n_components = 3     # number of NMF components (topics) to extract
n_top_words = 20     # default number of terms listed per topic

# Hand-assigned display names, matched to NMF components by position
# (nmf_topics[i] labels component i).
# Other labels noted by the author:
# 'dreams and aspirations', 'Broccoli', ' terrifying encounter', 'robot'
nmf_topics = ['Nature', 'AI', 'Sport']

# This vectorizer is fit on the raw English text (en_list) and relies on
# scikit-learn's built-in English stop-word list; it also extracts bigrams.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words='english',
    ngram_range=(1, 2),
)
tfidf = tfidf_vectorizer.fit_transform(en_list)

nmf = NMF(
    n_components=n_components,
    random_state=1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

In [9]:
def get_inference(model, vectorizer, topics, text, threshold):
    """Score a single text against a fitted topic model.

    The text is vectorized with ``vectorizer.transform`` and projected with
    ``model.transform``; every topic whose score in the first (only) row is
    strictly greater than ``threshold`` is collected as a label.

    Returns
    -------
    tuple
        ``(best_topic, score, labels)`` where ``best_topic`` is the entry of
        ``topics`` at the arg-max of the score matrix, ``score`` is the raw
        output of ``model.transform`` and ``labels`` is the set of topics
        above the threshold. When no topic exceeds the threshold the
        sentinel ``('None', -1, set())`` is returned instead.
    """
    weights = model.transform(vectorizer.transform([text]))
    doc_weights = weights[0]

    matched = {
        topics[idx]
        for idx, weight in enumerate(doc_weights)
        if weight > threshold
    }

    if len(matched) == 0:
        return 'None', -1, set()

    best_idx = np.argmax(weights)
    return topics[best_idx], weights, matched


def get_model_topics(model, vectorizer, topics, n_top_words=n_top_words):
    """Tabulate the highest-weighted vocabulary terms of each model component.

    For every row of ``model.components_`` the ``n_top_words`` terms with the
    largest weights are looked up in ``vectorizer.get_feature_names_out()``,
    ordered from highest to lowest weight.

    Returns
    -------
    pandas.DataFrame
        One column per component, named by the entry of ``topics`` at the
        same position; each column lists that component's top terms.
    """
    vocabulary = vectorizer.get_feature_names_out()

    columns = {}
    for component_no, weights in enumerate(model.components_):
        # Indices sorted by descending weight, truncated to the top terms.
        ranked = np.argsort(weights)[::-1][:n_top_words]
        columns[topics[component_no]] = [vocabulary[j] for j in ranked]

    return pd.DataFrame(columns)


In [34]:
# Pick one raw English document at random.
# random.randint includes BOTH endpoints, so randint(0, len(...)) could yield
# an index one past the end of the list and raise IndexError; randrange
# excludes the upper bound. The length is taken from en_list, the list that
# is actually indexed.
rand_int = random.randrange(len(en_list))
text = en_list[rand_int]
print(text)

# Show the top terms per topic, then the best-matching topic and the raw
# scores for the sampled document (threshold 0: any positive score counts).
print(get_model_topics(nmf, tfidf_vectorizer, nmf_topics))
topic, score, _ = get_inference(nmf, tfidf_vectorizer, nmf_topics, text, 0)
print(topic, score)

Once upon a time, there was a small and curious robot named Zephyr. Every day, Zephyr would go on adventures to explore the world around him. One day, while on his travels, he stumbled upon a banana peel lying on the ground. Intrigued by its bright yellow color and unique shape, Zephyr picked it up and began studying it.

As he examined the peel, he noticed it was strong and durable. Zephyr began to wonder what other things in the world around him were just as strong and durable as the banana peel. He began to search high and low, exploring every nook and cranny, until he came across a simple chair.

At first glance, the chair seemed plain and unremarkable. But when he looked closer, Zephyr saw the intricate details and the strength of the chair, just like the banana peel. He realized that there are often hidden strengths and qualities in things that we take for granted.

From that day on, Zephyr made it his mission to search for these hidden strengths in the world around him. He disco

#### Non-Negative Matrix Factorization (NMF)