In [14]:
import newspaper
import requests
from newspaper import Article

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import nltk

In [3]:
def get_news(sub):
    """Fetch the current listing of a subreddit from Reddit's public JSON endpoint.

    Parameters
    ----------
    sub : str
        Subreddit name without the ``r/`` prefix; it is interpolated into
        ``https://www.reddit.com/r/{sub}.json``.

    Returns
    -------
    list of dict
        One dict per listed post with the keys ``'id'``, ``'score'``,
        ``'url'`` and ``'is_self'`` copied from the post's ``data`` object.

    Raises
    ------
    requests.HTTPError
        If Reddit answers with a 4xx/5xx status (e.g. rate limiting).
    requests.Timeout
        If the server does not respond within the timeout.
    KeyError
        If the JSON payload does not have the expected listing layout.
    """
    r = requests.get(
        'https://www.reddit.com/r/{}.json'.format(sub),
        headers={'User-Agent': 'machinelearners-FknsjZnUYesiAk'},
        # Without an explicit timeout a stalled connection would block the
        # notebook cell indefinitely.
        timeout=30,
    )
    # Surface HTTP errors directly instead of letting an error body fail later
    # as an unrelated JSON-decoding error or KeyError on 'data'.
    r.raise_for_status()

    all_children = [x['data'] for x in r.json()['data']['children']]
    return [
        {
            'id': child['id'],
            'score': child['score'],
            'url': child['url'],
            'is_self': child['is_self'],
        }
        for child in all_children
    ]

In [6]:
def extract_url(url):
    """Download the page at ``url`` and return its extracted article fields.

    The page is processed as an English article with ``keep_article_html``
    enabled. The download, parse and nlp steps run in that order before any
    field is read.

    Returns a dict with the keys ``'title'``, ``'text'``, ``'top_image'``,
    ``'summary'`` and ``'article_html'``, each taken from the attribute of the
    same name on the processed ``Article``.
    """
    page = Article(url, language='en', keep_article_html=True)

    # Run the processing stages in the same fixed order: fetch, parse, then nlp.
    for stage in (page.download, page.parse, page.nlp):
        stage()

    wanted_fields = ('title', 'text', 'top_image', 'summary', 'article_html')
    return {field: getattr(page, field) for field in wanted_fields}

In [7]:
def get_data(sub):
    """Return one merged record per post listed in subreddit ``sub``.

    Each record starts with the post metadata from ``get_news`` and is then
    extended with the fields produced by ``extract_url`` for the post's URL;
    on a key clash the extracted value wins.

    Posts are not filtered (``is_self`` is carried along but not inspected),
    and any exception raised while extracting a post propagates to the caller.
    """
    records = []
    for post in get_news(sub):
        merged = dict(post)
        merged.update(extract_url(post['url']))
        records.append(merged)
    return records

In [17]:
# Build the post + article records for the 'javascript' subreddit.
data = get_data('javascript')

In [31]:
# Number of records collected.
len(data)

26

In [33]:
# Corpus for topic modelling: the extracted body text of every record.
documents = [record['text'] for record in data]

In [35]:
# NOTE(review): the 20 newsgroups corpus is fetched here but not used below;
# the models are fitted on `documents` from the earlier cell. To try the
# reference corpus instead, set `documents = dataset.data`.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# Vocabulary size shared by both vectorizers.
no_features = 100

# NOTE(review): get_feature_names() and NMF's `alpha` argument belong to an
# older scikit-learn API (later releases use get_feature_names_out() and
# alpha_W / alpha_H) -- confirm the installed version before re-running.

# tf-idf weighted term matrix, used as the input to NMF.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.85,
    min_df=2,
    max_features=no_features,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Raw term-count matrix, used as the input to LDA (a probabilistic model over counts).
tf_vectorizer = CountVectorizer(
    max_df=0.85,
    min_df=2,
    max_features=no_features,
    stop_words='english',
)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Number of topics extracted by each model.
no_topics = 5

# Fit NMF on the tf-idf matrix.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
nmf = nmf.fit(tfidf)

# Fit LDA on the term-count matrix.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda = lda.fit(tf)

# Number of words shown per topic by the display cells.
no_top_words = 10

In [36]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its highest-weighted words.

    Parameters
    ----------
    model
        Fitted object exposing ``components_``; each row is one topic's
        per-feature weights.
    feature_names
        Sequence mapping a feature index to its word.
    no_top_words : int
        How many words to print per topic, in descending weight order.

    For every topic two lines are printed: the topic index, then the selected
    words joined by single spaces. Nothing is returned.
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        words = [feature_names[position] for position in strongest]
        print(number)
        print(" ".join(words))

In [37]:
# Top words per topic for the LDA model, using the count vectorizer's vocabulary.
display_topics(lda, tf_feature_names, no_top_words)

0
add function user functions application use default const article like
1
loading render component react return div view state error logic
2
function tree data foo return js response error use file
3
build js javascript type command using example add just time
4
code vue debug new file typescript use like data js


In [38]:
# Top words per topic for the NMF model, using the tf-idf vectorizer's vocabulary.
display_topics(nmf, tfidf_feature_names, no_top_words)

0
js file new api like vue test error app node
1
want time vue new work javascript extension easy effects error
2
build code start javascript debug ve time extension effects error
3
add function functions application default use example return const tree
4
just like javascript change right data using ve start time
