In [2]:
import spacy
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora, models
from tqdm import tqdm_notebook as tqdm
import re
!pwd

/home/ddehueck/.local/share/Trash/files/explorer.3


In [2]:
from collections import Counter

def preprocess(docs, nlp, min_length, min_counts, max_counts, tokenize_fn):
    """Tokenize, clean, and encode documents.

    Pipeline: tokenize every document with ``tokenize_fn``, drop documents
    shorter than ``min_length`` tokens, remove tokens whose corpus count is
    outside ``[min_counts, max_counts]``, drop documents that became too
    short because of that removal, and finally encode tokens as integers.

    Arguments:
        docs: A list of tuples (index, string), each string is a document.
        nlp: A spaCy object, like nlp = spacy.load('en'). It is not used by
            this function itself (tokenization is delegated to tokenize_fn);
            the parameter is kept so existing callers keep working.
        min_length: An integer, minimum document length (in tokens).
        min_counts: An integer, minimum count of a word.
        max_counts: An integer, maximum count of a word.
        tokenize_fn: A callable, string -> list of tokens.
    Returns:
        encoded_docs: A list of tuples (index, list), each list is a document
            with words encoded by integer values.
        decoder: A dict, integer -> word.
        word_counts: A list of integers, counts of words that are in decoder.
            word_counts[i] is the number of occurrences of word decoder[i]
            in all documents in docs. Empty when no token survives filtering.
    """

    tokenized_docs = [(i, tokenize_fn(doc)) for i, doc in tqdm(docs)]

    # remove short documents (the number removed is the size difference)
    n_docs_before = len(tokenized_docs)
    tokenized_docs = [(i, doc) for i, doc in tokenized_docs if len(doc) >= min_length]
    print('number of removed short documents:', n_docs_before - len(tokenized_docs))

    # remove some tokens
    counts = _count_unique_tokens(tokenized_docs)
    tokenized_docs = _remove_tokens(tokenized_docs, counts, min_counts, max_counts)

    # token removal can push documents below min_length, so filter again
    n_docs_before = len(tokenized_docs)
    tokenized_docs = [(i, doc) for i, doc in tokenized_docs if len(doc) >= min_length]
    print('number of additionally removed short documents:', n_docs_before - len(tokenized_docs))

    # recount: dropping documents changed the per-token totals
    counts = _count_unique_tokens(tokenized_docs)
    encoder, decoder, word_counts = _create_token_encoder(counts)

    if word_counts:
        # word_counts is in decreasing order, so the last entry is the minimum
        print('\nminimum word count number:', word_counts[-1])
        print('this number can be less than MIN_COUNTS because of document removal')
    else:
        # previously word_counts[-1] raised IndexError on an empty vocabulary
        print('\nno tokens left after filtering; vocabulary is empty')

    encoded_docs = _encode(tokenized_docs, encoder)
    return encoded_docs, decoder, word_counts


def _count_unique_tokens(tokenized_docs):
    tokens = []
    for i, doc in tokenized_docs:
        tokens += doc
    return Counter(tokens)


def _encode(tokenized_docs, encoder):
    return [(i, [encoder[t] for t in doc]) for i, doc in tokenized_docs]


def _remove_tokens(tokenized_docs, counts, min_counts, max_counts):
    """
    Words with count < min_counts or count > max_counts
    will be removed.
    """
    total_tokens_count = sum(
        count for token, count in counts.most_common()
    )
    print('total number of tokens:', total_tokens_count)

    unknown_tokens_count = sum(
        count for token, count in counts.most_common()
        if count < min_counts or count > max_counts
    )
    print('number of tokens to be removed:', unknown_tokens_count)

    keep = {}
    for token, count in counts.most_common():
        keep[token] = count >= min_counts and count <= max_counts

    return [(i, [t for t in doc if keep[t]]) for i, doc in tokenized_docs]


def _create_token_encoder(counts):

    total_tokens_count = sum(
        count for token, count in counts.most_common()
    )
    print('total number of tokens:', total_tokens_count)

    encoder = {}
    decoder = {}
    word_counts = []
    i = 0

    for token, count in counts.most_common():
        # counts.most_common() is in decreasing count order
        encoder[token] = i
        decoder[i] = token
        word_counts.append(count)
        i += 1

    return encoder, decoder, word_counts

In [24]:
def og_tokenize(doc):
    """Tokenize one document with the notebook-level spaCy pipeline ``nlp``.

    Whitespace runs are collapsed first. The result keeps the lower-cased
    lemma of every token that is alphabetic, longer than two characters,
    and not a stop word.

    Arguments:
        doc: A string, the raw document text.
    Returns:
        A list of strings (lower-cased lemmas).
    """
    text = ' '.join(doc.split())  # remove excessive spaces
    # Only lemmas and lexical flags are read below, so the dependency parser
    # and the entity recognizer are skipped. spaCy names these components
    # 'parser' and 'ner'; names that match no component are ignored, which
    # means the former ['parse', 'entity'] disabled nothing.
    text = nlp(text, disable=['parser', 'ner'])
    return [t.lemma_.lower() for t in text if t.is_alpha and len(t) > 2 and not t.is_stop]

import sys
# Add the parent directory to the import path so that the `datasets` package
# imported on the next line can be resolved.
# NOTE(review): presumably a project-local package; this relative path only
# works when the notebook is run from its own directory — confirm.
sys.path.append("..")
from datasets.preprocess import Tokenizer

class Args:
    """Bare argument holder exposing a single ``nlp`` attribute."""

    def __init__(self) -> None:
        # starts without a pipeline; a caller may assign one later
        self.nlp = None
        
# Notebook-level Tokenizer instance; my_tokenize delegates to it.
# NOTE(review): merge_noun_chunks=True presumably merges each noun chunk into
# a single token — confirm against datasets.preprocess.Tokenizer.
tokenizer = Tokenizer(merge_noun_chunks=True)

def my_tokenize(doc):
    """Tokenize one document with the notebook-level ``tokenizer``.

    Thin adapter so that ``tokenizer.tokenize_doc`` can be passed around as
    a plain function taking only the document.

    Arguments:
        doc: The document to tokenize.
    Returns:
        Whatever ``tokenizer.tokenize_doc(doc)`` returns.
    """
    tokens = tokenizer.tokenize_doc(doc)
    return tokens

In [21]:
# spaCy pipeline read by og_tokenize through the notebook namespace
nlp = spacy.load('en')

# full 20 Newsgroups corpus with headers, footers and quoted replies stripped
dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# pair every document with its position: [(0, text0), (1, text1), ...]
docs = list(enumerate(dataset['data']))

In [22]:
# OG TOKENIZE
encoded_docs, decoder, word_counts = preprocess(docs, nlp, 15, 20, 1800, og_tokenize)

# decode the integer ids back to words, then build the gensim dictionary
# and the bag-of-words corpus from those word lists
og_texts = [[decoder[token_id] for token_id in encoded] for _, encoded in encoded_docs]
og_dictionary = corpora.Dictionary(og_texts)
og_corpus = [og_dictionary.doc2bow(words) for words in og_texts]

HBox(children=(IntProgress(value=0, max=18846), HTML(value='')))

number of removed short documents: 3979
total number of tokens: 1441495
number of tokens to be removed: 395483
number of additionally removed short documents: 2052
total number of tokens: 1022282

minimum word count number: 13
this number can be less than MIN_COUNTS because of document removal


In [23]:
# number of distinct tokens in the decoder returned by the preceding preprocess call
print(len(decoder))

7435


In [25]:

# Same thresholds as the og_tokenize run (min_length=15, min_counts=20,
# max_counts=1800), but tokenizing with my_tokenize. This rebinds
# encoded_docs, decoder and word_counts.
encoded_docs, decoder, word_counts = preprocess(docs, nlp, 15, 20, 1800, my_tokenize)


HBox(children=(IntProgress(value=0, max=18846), HTML(value='')))

number of removed short documents: 4885
total number of tokens: 1225018
number of tokens to be removed: 490453
number of additionally removed short documents: 3060
total number of tokens: 700663

minimum word count number: 12
this number can be less than MIN_COUNTS because of document removal


In [26]:
# MY TOKENIZE
# decode the integer ids back to words, then build the gensim dictionary
# and the bag-of-words corpus from those word lists
my_texts = [[decoder[token_id] for token_id in encoded] for _, encoded in encoded_docs]
my_dictionary = corpora.Dictionary(my_texts)
my_corpus = [my_dictionary.doc2bow(words) for words in my_texts]

In [27]:
# Number of distinct tokens in the current decoder.
# NOTE(review): "without merge" presumably refers to running the Tokenizer
# with merge_noun_chunks disabled — confirm.
print(len(decoder)) # 9329 without merge

7187


In [None]:
%time
og_lda = models.LdaModel(og_corpus, alpha=0.9, id2word=og_dictionary, num_topics=24)
og_corpus_lda = og_lda[og_corpus]

In [14]:
%time
my_lda = models.LdaModel(my_corpus, alpha=0.9, id2word=my_dictionary, num_topics=24)
my_corpus_lda = my_lda[my_corpus]

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


In [15]:
# The topic listing for the OG model is commented out below; the string
# literal that follows holds the text of an earlier listing. It is a bare
# expression statement with no effect at run time.
#print("OG Method")
#for i, topics in og_lda.show_topics(24, formatted=False):
#   print('topic', i, ':', ' '.join([t for t, _ in topics]))
    
"""
OG Method
topic 0 : israel jews country israeli arab jewish war attack public peace
topic 1 : play hockey goal watch season guy shot pick fan end
topic 2 : bit ripem color chip message des pgp copy rsa encryption
topic 3 : book open subject job note library probably openwindows source old
topic 4 : bike myer ride food eat motorcycle dog feel road eye
topic 5 : window server application client widget font user display motif manager
topic 6 : player hit john clutch moncton springfield bad baltimore well providence
topic 7 : president job ground house talk white today technology general decision
topic 8 : battery signal little lot great bad old turn hear actually
topic 9 : jesus church bible christ word sin lord christian faith life
topic 10 : insurance pay private car company money care tax buy cost
topic 11 : team win play player season league nhl hockey period toronto
topic 12 : armenian armenians turkish kill turkey woman greek leave live village
topic 13 : car printer buy driver engine phone print sell laser dealer
topic 14 : gun drug kinsey homosexual health rate child disease report safety
topic 15 : belief argument evidence atheist exist religion claim true reason example
topic 16 : university space book planet water world earth moon technology nuclear
topic 17 : software version ftp datum user package computer tool graphic code
topic 18 : christian moral objective human atheist morality different claim value accept
topic 19 : board heat circuit wire cpu picture current quality sink small
topic 20 : fire weapon fbi gun child koresh firearm death person compound
topic 21 : pitcher pitch bad probably lot average hit morris guy team
topic 22 : space launch earth orbit mission satellite nasa center spacecraft solar
topic 23 : card disk dos price windows hard controller monitor board speed
"""

print()    

print("My Tokenizer")
# each item unpacks to (topic id, sequence of (word, weight) pairs); only the
# words are printed, joined by spaces
for i, topics in my_lda.show_topics(24, formatted=False):
    print('topic', i, ':', ' '.join([t for t, _ in topics]))


My Tokenizer
topic 0 : com posting network keyboard service internet newsgroup modem thanks users
topic 1 : jesus bible christ word love life church sin faith man
topic 2 : gun guns crime control firearms weapon weapons firearm fire fbi
topic 3 : thanks science lot actually looking argument post probably logic claim
topic 4 : image software color files images graphics version dos mac format
topic 5 : list send email ftp pub software faq address computer anonymous
topic 6 : period play detroit vancouver san division toronto pts chicago power
topic 7 : neutral wire outlets ground wiring tape dog run dos usually
topic 8 : israel israeli jews state arab rights jewish peace land case
topic 9 : armenian armenians turkish greek turkey war turks children russian genocide
topic 10 : window server application set widget motif running run user sun
topic 11 : card mhz cpu board video cards price apple monitor ram
topic 12 : game team goal ice play blues mark flames season shot
topic 13 : game tea

In [20]:
import numpy as np

# Dense (number of documents, number of topics) matrix of per-document topic
# weights. The topic count is read from the model instead of repeating the
# literal 24, so it cannot drift from the LdaModel configuration.
n_topics = my_lda.num_topics
doc_weights_init = np.zeros((len(my_corpus_lda), n_topics))
for i in tqdm(range(len(my_corpus_lda))):
    # sequence of (topic id, probability) pairs for document i
    topics = my_corpus_lda[i]
    for j, prob in topics:
        # topics absent from this sequence keep their initial weight of 0
        doc_weights_init[i, j] = prob

HBox(children=(IntProgress(value=0, max=13434), HTML(value='')))