# File for looking at text content

## Import Libraries:

In [23]:
import sklearn as sk
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import numpy as np
from text_content import data
from stop_words import sw_en
import pandas as pd

### Get English Content

In [2]:
# Collect the 'content' text of every record whose 'lang' field is 'en';
# records without a 'lang' key are skipped because .get() returns None.
en_list = [record['content'] for record in data if record.get('lang') == 'en']

### Remove Stop Words

In [3]:
def remove_sw(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on single spaces and apostrophes. Each piece is
    lower-cased for the membership test only, so surviving words keep their
    original casing. The kept pieces are re-joined with single spaces, which
    means apostrophes in the input come back as spaces.

    Args:
        text: The string to filter.
        stop_words: Optional container of lower-case words to drop. When
            omitted, the imported ``sw_en`` collection is used.

    Returns:
        The filtered string.

    NOTE(review): punctuation stays attached to words (e.g. "it," is not
    matched by a stop word "it"), and contractions are split into fragments
    ("don't" -> "don", "t") that survive unless the stop list contains them.
    Confirm whether that is intended before changing the tokenization.
    """
    if stop_words is None:
        stop_words = sw_en
    kept = [
        word
        for word in re.split(r" |'", text)
        if word.lower() not in stop_words
    ]
    return " ".join(kept)

In [4]:
# Strip stop words from every English document, preserving document order.
en_list_nosw = list(map(remove_sw, en_list))

In [10]:
# Sanity check: report the size of the cleaned corpus, then show the cleaned
# document at index 3000 (requires more than 3000 documents).
print(f"The list in english has {len(en_list_nosw)} entries \n")
print(en_list_nosw[3000])

The list in english has 3151 entries 

Hello welcome hilariously informative tutorial airplane, bird, corn. rules simple: going discuss things twist goofy humor, buckle ready soar jokes!

First up, airplane. hear pilot told co-pilot joke mid-flight? went head. like airplane itself! Hey, don t blame me, blame laws physics.

Now, bird. bird join Twitter? tweet, course! Ha, it? Alright, alright, ll fly away one. seriously, birds cool. fly, sing, wake morning.

Finally, corn. don t know you, think corn pretty a-maize-ing. there? Okay, okay, ll out. seriously, corn versatile delicious food. eat cob off, s favorite snacks like popcorn tortilla chips.

So it, folks: quick quirky tutorial airplanes, birds, corn. hope fun reading writing it. aviation, ornithology, agricultural jokes sure friends roll (corn) eyes!


### Apply TF-IDF

In [13]:
# Fit a TF-IDF model on the stop-word-filtered English documents.
tfIdfVectorizer = TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(en_list_nosw)

# Weights of the first document only: one row per vocabulary term,
# ordered from the highest TF-IDF score to the lowest.
df = pd.DataFrame(
    tfIdf[0].T.todense(),
    index=tfIdfVectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
).sort_values('TF-IDF', ascending=False)
print(df.head(100))


           TF-IDF
rugby    0.224572
racket   0.193265
tennis   0.191449
biking   0.182560
develop  0.176360
...           ...
today    0.055341
next     0.053261
skills   0.053197
high     0.052383
love     0.051729

[100 rows x 1 columns]
             TF-IDF
rugby      0.224572
racket     0.193265
tennis     0.191449
biking     0.182560
develop    0.176360
...             ...
fishball   0.000000
fishbowl   0.000000
fished     0.000000
fisherman  0.000000
يمكنني     0.000000

[13006 rows x 1 columns]


In [24]:
# Map every row of the TF-IDF matrix back to the vocabulary terms that occur
# in that document, then preview the first ten documents.
tags_arr = tfIdfVectorizer.inverse_transform(tfIdf)
print(tags_arr[:10])
# TODO: rank each document's terms by TF-IDF weight and keep the top 10;
# inverse_transform yields the terms only, not their scores.

[array(['start', 'hop', 'grab', 'field', 'skills', 'lifelong', 'help',
       'rewarding', 'fun', 'remember', 'cycling', 'so', 'nature',
       'appreciate', 'surroundings', 'explore', 'way', 'friendly', 'eco',
       'levels', 'stress', 'reducing', 'muscles', 'strengthening',
       'health', 'cardiovascular', 'improving', 'workout', 'full', 'away',
       'pedaling', 'point', 'getting', 'good', 'great', 'biking',
       'discuss', 'lastly', 'adventure', 'little', 'love', 'doesn',
       'adversity', 'face', 'perseverance', 'resilience', 'teamwork',
       'sense', 'stronger', 'gain', 'playing', 'difficulty', 'excitement',
       'increasing', 'ways', 'unpredictable', 'bounces', 'means', 'shape',
       'oval', 'agility', 'speed', 'strength', 'physical', 'requiring',
       'known', 'next', 'game', 'focused', 'stay', 'required',
       'fortitude', 'mental', 'footwork', 'coordination', 'eye', 'hand',
       'develop', 'll', 'body', 'mind', 'exercise', 'incredible',
       'making', 's

### Topic Modelling

In [None]:
# Settings for the topic-modelling experiment.
n_features = 1000   # vocabulary size cap passed to CountVectorizer(max_features=...)
n_components = 5    # presumably the number of topics to extract; not used in this cell
n_top_words = 20    # presumably the number of words to show per topic; not used in this cell

# `corpus` was not defined anywhere above this cell, so a fresh
# Restart & Run All raised NameError. Bind it to the cleaned English documents.
# NOTE(review): assumes the stop-word-filtered list is the intended input,
# matching the TF-IDF section; switch to `en_list` if raw text is wanted.
corpus = en_list_nosw

# Raw term counts over unigrams and bigrams; terms appearing in more than 95%
# of documents or in fewer than 2 documents are dropped.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words='english',
    ngram_range=(1, 2),
)
tf = tf_vectorizer.fit_transform(corpus)