In [1]:
import json
import os
import time 
import pickle
import re 
from tqdm import tqdm
import html

In [2]:
def extract_topic(url):
    match = re.search(r'(?<=https://www.washingtonpost.com/)([^/]*)(?=/)',text['contenturl'])
    if match is None:
        return ""
    else:
        return url[match.start():match.end()]

In [3]:
s = time.time()
sections = {}
path = './data/1-10_October_2019/'
all_texts = set()
documents = []
for file in tqdm(os.listdir(path)):
    for text in json.load(open(path+file,'r')):
        if 'body' in text and 'contenturl' in text:
            body = html.unescape(text['body'])
            all_texts.add(body)
            topic = extract_topic(text['contenturl'])
            if topic in sections:
                sections[topic].add(body)
            else:
                sections[topic] = {body}
print(s-time.time())

100%|██████████| 26/26 [00:10<00:00,  2.50it/s]

-10.90625810623169





In [47]:
sections = {topic: items for topic, items in sections.items() if len(items) > 20}

In [48]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [49]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/sergey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
stemmer = SnowballStemmer('english')

In [51]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [52]:
s = time.time()

processed_docs = {}
for key in tqdm(sections):
    processed_docs[key] = []
    for document in sections[key]:
        processed_docs[key].append(preprocess(document))
   
print(s-time.time())
# query = 'news Basketball Basketball Anthony '
# processed_query = preprocess(query)
# print(processed_query)
# bow_query = dictionary.doc2bow(processed_query)
# bow_query

100%|██████████| 17/17 [00:12<00:00,  2.46it/s]

-12.57628345489502





In [53]:
s = time.time()
dictionary = {}
for key in sections:
    dictionary[key] = gensim.corpora.Dictionary(processed_docs[key])
print(s-time.time())

-0.7677497863769531


In [54]:
s = time.time()
bow_corpus = {}
for key in sections:
    bow_corpus[key] = [dictionary[key].doc2bow(doc) for doc in processed_docs[key]]
print(s-time.time())

-0.39707446098327637


In [55]:
from gensim import corpora, models
tfidf = {}
for key in sections:
    tfidf[key] = models.TfidfModel(bow_corpus[key])

In [56]:
corpus_tfidf = {} 
for key in sections:
    corpus_tfidf[key] = tfidf[key][bow_corpus[key]]


In [62]:
s = time.time()
lda_model_tfidf = {}
for key in tqdm(sections): 
    lda_model_tfidf[key] = gensim.models.LdaMulticore(corpus_tfidf[key], id2word=dictionary[key], workers=4, num_topics=3)
    with open(f"{key}.model", "wb") as f:
        pickle.dump(lda_model_tfidf[key], f)
print(s-time.time())

100%|██████████| 17/17 [00:13<00:00,  1.99it/s]

-13.479897260665894





In [63]:
print([(section, len(items)) for section, items in sections.items()])

[('nation', 42), ('local', 129), ('outlook', 30), ('lifestyle', 89), ('politics', 132), ('sports', 198), ('world', 78), ('news', 59), ('business', 51), ('opinions', 208), ('entertainment', 28), ('arts-entertainment', 23), ('weather', 39), ('technology', 22), ('', 32), ('video', 57), ('national', 21)]


In [64]:
lda_model_tfidf

{'nation': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bbda82cc0>,
 'local': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bbda82e10>,
 'outlook': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bbda82eb8>,
 'lifestyle': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bf4a68e80>,
 'politics': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bd53a39b0>,
 'sports': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bc697cc18>,
 'world': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bf48d5048>,
 'news': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bd5362c18>,
 'business': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bd53a1978>,
 'opinions': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bf48d5080>,
 'entertainment': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bd5362f60>,
 'arts-entertainment': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bd53a16a0>,
 'weather': <gensim.models.ldamulticore.LdaMulticore at 0x7f8bf4a68fd0>,
 'technology': <gensim.models.ldamulti

In [65]:
model = lda_model_tfidf['sports']

In [67]:
with open(f"subtopics3.txt", "w") as f:
    for topic in lda_model_tfidf:
        subtopics = lda_model_tfidf[topic].show_topics(num_topics=12, num_words=5,formatted=False)
        f.write(f"{topic}: {len(sections[topic])} news \n")
        for id, words in subtopics:
            f.write(str([word for word, score in words])+"\n")
        f.write("================\n")
        

In [97]:
for idx, topic in lda_model_tfidf['sports'].print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.001*"nation" + 0.001*"inning" + 0.001*"dodger" + 0.001*"strasburg" + 0.001*"rsquo" + 0.001*"nat" + 0.001*"pitch" + 0.001*"scherzer" + 0.001*"corbin" + 0.001*"patriot"
Topic: 1 
Words: 0.002*"rsquo" + 0.001*"ldquo" + 0.001*"rdquo" + 0.001*"capit" + 0.001*"donn" + 0.001*"dell" + 0.001*"mystic" + 0.001*"gruden" + 0.001*"redskin" + 0.001*"yard"
Topic: 2 
Words: 0.002*"rsquo" + 0.001*"ldquo" + 0.001*"rdquo" + 0.001*"redskin" + 0.001*"nation" + 0.001*"dodger" + 0.001*"gruden" + 0.001*"haskin" + 0.001*"quarterback" + 0.000*"yard"


In [None]:
# lda_model_tfidf.get_document_topics(bow_query, 0.5)