In [None]:
# Find the most relevant terms for each topic using Latent Dirichlet Allocation (LDA)

In [None]:
#!pip install --upgrade threadpoolctl
#!pip install wordcloud

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Lift pandas' display limits so frames are rendered in full:
# no column cap, no row cap, and no truncation of long cell text.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Toy corpus of two raw passages: the first is about economics, the second
# about literature. Each entry is one document.
sample_documents = [
    """
    Economics focuses on the behaviour and interactions of economic agents and how economies work. 
    Microeconomics analyzes what's viewed as basic elements in the economy, including individual agents 
    and markets, their interactions, and the outcomes of interactions. Individual agents may include, 
    for example, households, firms, buyers, and sellers. Macroeconomics analyzes the economy as a system 
    where production, consumption, saving, and investment interact, and factors affecting it: employment 
    of the resources of labour, capital, and land, currency inflation, economic growth, 
    and public policies that have impact on these elements.
    """,
    """
    Literature is any collection of written work, but it is also used more narrowly for writings specifically 
    considered to be an art form, especially prose fiction, drama, and poetry. In recent centuries, the 
    definition has expanded to include oral literature, much of which has been transcribed. Literature is 
    a method of recording, preserving, and transmitting knowledge and entertainment, and can also have a social,
    psychological, spiritual, or political role. Literature, as an art form, can also include works in 
    various non-fiction genres, such as biography, diaries, memoir, letters, and the essay. Within its broad 
    definition, literature includes non-fictional books, articles or other printed information on a particular 
    subject.
    """
    
]

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

# English stop-word list from the NLTK corpus; cleanDocument filters tokens against it.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' data to be
# downloaded beforehand -- confirm on a fresh environment.
stop = stopwords.words('english')
# Shared WordNet lemmatizer instance used by cleanDocument.
lemmatizer = WordNetLemmatizer()

In [None]:
def cleanDocument(text, stop_words=None, lemmatize=None):
    """Normalise a raw document into a space-separated string of lemmas.

    Each whitespace-separated token is stripped of non-word characters,
    lowercased, filtered against the stop-word list and lemmatized.

    Stop words are filtered *before* lemmatization: lemmatizing first can
    mangle a stop word into a form that is no longer in the stop list
    (e.g. "has" -> "ha"), which would then leak into the vocabulary.
    Tokens that become empty after stripping (pure punctuation such as
    "--") are dropped as well.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stop`` list.
    lemmatize : callable, optional
        Function mapping a lowercase token to its lemma. Defaults to
        ``lemmatizer.lemmatize`` of the notebook-level lemmatizer.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # Resolve the notebook-level defaults lazily so the globals are only
    # touched when the caller did not supply replacements.
    # A set gives constant-time membership checks inside the loop.
    stop_set = set(stop if stop_words is None else stop_words)
    lemmatize_word = lemmatizer.lemmatize if lemmatize is None else lemmatize

    cleaned = []
    for token in text.split():
        token = re.sub(r'\W+', '', token).lower()

        # Drop punctuation-only tokens and stop words before lemmatizing.
        if not token or token in stop_set:
            continue

        lemma = lemmatize_word(token)

        # Keep the post-lemmatization check too, in case the lemma itself is a stop word.
        if lemma and lemma not in stop_set:
            cleaned.append(lemma)

    return ' '.join(cleaned)

In [None]:
# Clean every sample document with cleanDocument: strip non-alphanumeric
# characters, lemmatize, and remove stop words.
documents = list(map(cleanDocument, sample_documents))

In [None]:
# Hold the cleaned documents in a one-column frame named 'text'.
df_transcripts = pd.DataFrame(dict(text=documents))

In [None]:
# Display the cleaned documents.
df_transcripts

In [None]:
# TF-IDF vectorizer restricted to single words (ngram_range=(1, 1) means unigrams only).
tfv = TfidfVectorizer(ngram_range = (1,1))

In [None]:
# Fit the vectorizer on the cleaned text and vectorize that same text in one step.
vec_text = tfv.fit_transform(df_transcripts['text'])

In [None]:
# Display the vectorized corpus returned by fit_transform.
vec_text

In [None]:
# Feature names (vocabulary terms) learned by the fitted vectorizer; later cells
# pair them with the LDA component weights.
words = tfv.get_feature_names_out()

In [None]:
# Display the vocabulary terms.
words

In [None]:
# Number of vocabulary terms.
len(words)

In [None]:
# Two-topic LDA model. random_state is pinned because the fit is stochastic:
# without a seed, the topics (and which index each one lands on) can change on
# every Restart & Run All, while later cells inspect topic index 1 specifically.
# NOTE(review): LDA is a count-based model and is usually fitted on raw term
# counts (CountVectorizer) rather than TF-IDF weights -- consider switching.
lda_model = LatentDirichletAllocation(n_components=2, random_state=42)

#https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
# fit_transform fits the model and returns the document-topic distribution
# (one row per document, one column per topic).
lda_output = lda_model.fit_transform(vec_text)

In [None]:
# Tabulate the array returned by the LDA fit_transform call.
df_documents = pd.DataFrame(lda_output)

In [None]:
# Display the LDA output table.
df_documents

In [None]:
# Number of weights in the first topic's component vector (compare with len(words)).
len(lda_model.components_[0])

In [None]:
# Map every vocabulary term to its weight in the topic at index 1.
topic_1 = {term: weight for term, weight in zip(words, lda_model.components_[1])}

In [None]:
# Display the term -> weight mapping for the topic at index 1.
topic_1

In [None]:
# Single-row frame labelled 1, with one column per term holding that term's weight.
df_topic_1 = pd.DataFrame(topic_1, index=[1])

In [None]:
#df_topic_0.T

In [None]:
# Transpose so terms become rows, then list them from highest to lowest weight
# (after the transpose, the single column is labelled 1).
df_topic_1.T.sort_values(by=1, ascending=False) 

In [None]:
# For every topic, print its ten highest-weighted terms together with their weights.
for topic_idx, weights in enumerate(lda_model.components_):
    weight_by_term = dict(zip(words, weights))
    ranked_terms = sorted(
        weight_by_term.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Topic", topic_idx)
    for term, weight in ranked_terms[:10]:
        print(term, weight)
    print("\n")

In [None]:
# Unseen passage (about literature) used to query the fitted topic model.
pred_text = """
The definition of literature broadly encompasses written works used to transmit culture. However, 
literature is not always limited to the written word. The literature meaning can include stories 
told in the oral tradition and visual literature, such as drama that is intended to be performed 
before an audience.
"""

In [None]:
# Push the unseen text through the same pipeline as the training corpus:
# clean -> vectorize with the already-fitted TF-IDF vectorizer -> LDA transform.
pred_clean = cleanDocument(pred_text)
pred_vec = tfv.transform([pred_clean])
pred = lda_model.transform(pred_vec)

In [None]:
# Display the LDA transform output for the unseen passage.
pred