In [10]:
import enum
import random
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, porter
from nltk.corpus import stopwords
from collections import defaultdict

topic_indices = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0]

doc_ls = ["Cute kitty", 
"Eat rice or cake", 
"Kitty and hamster",
"Eat bread", 
"Rice, bread and cake", 
"Cute hamster eats bread and cake"]

In [11]:

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/dhkim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/dhkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:

class LDA:

    def __init__(self, docs, topic_num, alpha=0.1, beta=0.001):
        self.alpha = alpha
        self.beta = beta
        self.k = topic_num
        self.docs = docs

    def randomly_analysis_topic (self):
        dic = defaultdict()
        t2i = defaultdict(lambda: len(t2i))
        i2t = defaultdict()

        porter = PorterStemmer()
        stopword = stopwords.words('english')
        stopword.append(',')
        index = 0
        for d, tokens in enumerate([word_tokenize(doc) for doc in self.docs]):
            for w, token in enumerate([porter.stem(token.lower()) for token in tokens if token not in stopword]):
                i2t[t2i[token]]= token
                dic[(d, t2i[token], w)] = topic_indices[index]
                # dic[(d, t2i[token], w)] = random.randint(0, self.k -1)
                index += 1

        return dic, t2i, i2t

    def count_doc_topic(self, term_topic, t2i):
        docs = np.zeros((self.k, len(self.docs)))
        terms = np.zeros((self.k, len(t2i)))

        docs.fill(self.alpha)
        terms.fill(self.beta)

        for (d, n, w) in term_topic.keys():
            topic = term_topic[(d, n, w)]
            docs[topic, d] += 1
            terms[topic, n] += 1

        print(docs)
        print(terms)

        return docs, terms



    def iterate_assign_topic(self, term_topic, docs, terms, i2t):
        prev = {}

        while prev != term_topic:
            for (d, n, w) in term_topic:
                topic = [0, 0]

                docs[term_topic[(d, n, w)], d] -= 1
                terms[term_topic[(d, n, w)], n] -= 1

                prev = term_topic

                for t in range(self.k):
                    p_t_d = docs[t, d] /docs[:, d].sum()
                    p_w_t = terms[t, n] / terms[t, :].sum()
                    prob = p_t_d * p_w_t

                    if topic[1] < prob:
                        topic = [t, prob]


                term_topic[(d, n, w)] = topic[0]
                docs[topic[0], d] += 1
                terms[topic[0], n] += 1

        return terms


    def topic_modeling(self, count=3):
        term_topic, t2i, i2t = self.randomly_analysis_topic()
        docs, terms  = self.count_doc_topic(term_topic, t2i)
        terms = self.iterate_assign_topic(term_topic, docs, terms, i2t)

        score = terms / terms.sum(axis=1, keepdims=True)

        for i in range(self.k):
            print(f"topic {i} =>", end=" ")
            sorted_index = np.flip(np.argsort(score[i]), 0)[:count]
            for j in sorted_index:
                print("({}, {})".format(i2t[j], score[i, j].round(3)), end=" ")

            print("")


In [13]:
lda = LDA(doc_ls, 2)
lda.topic_modeling(3)

[[1.1 2.1 0.1 2.1 2.1 2.1]
 [1.1 1.1 2.1 0.1 1.1 3.1]]
[[1.001e+00 1.000e-03 2.001e+00 1.001e+00 3.001e+00 1.000e-03 2.001e+00]
 [1.001e+00 2.001e+00 1.001e+00 1.001e+00 1.000e-03 2.001e+00 1.001e+00]]
topic 0 => (bread, 0.273) (cake, 0.273) (eat, 0.273) 
topic 1 => (hamster, 0.333) (kitti, 0.333) (cute, 0.333) 


In [14]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=42, remove=('header', 'footer', 'quotes'))

In [15]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [16]:

print(dataset.data[0])
doc = nlp(dataset.data[0])
result = " ".join([token.text for token in doc if token.is_stop==False and token.tag_[0] == 'N'])
print("*********************************")
print(result)
# for token in doc:
#     print(token.text, token.lemma_, token.tag_, token.pos_, token.is_stop,)

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





*********************************
lerxst@wam.umd.edu thing Subject car Nntp Posting Host rac3.wam.umd.edu Organization University Maryland College Park Lines car day door sports car 60s/ 70s Bricklin doors addition bumper rest body model engine specs years production car histor

In [17]:
def cleansing(text):
    result = ""
    if text:
        doc = nlp(text)
        result = " ".join([token.text for token in doc if token.is_stop==False and token.tag_[0] == 'N'])
        return result
    return result

In [18]:
import pandas as pd

news_df = pd.DataFrame({'document': dataset.data})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(cleansing)
#tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

tfidf_vect = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf = tfidf_vect.fit_transform(news_df['clean_doc'])
lda = LatentDirichletAllocation(n_components=20, max_iter=20, learning_method='online', random_state=42)
lda_output = lda.fit_transform(tfidf)

  and should_run_async(code)


In [25]:
!pip install pyLDAvis

  and should_run_async(code)




In [26]:
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Try to infer an Index from the passed data, raise ValueError on failure.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Try to infer an Index from the passed data, raise ValueError on failure.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Try to infer an Index from the passed data, raise ValueError on failure.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Try to infer an Index from the passed data, raise ValueError on fa

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type