In [13]:
import pandas as pd
from lda2vec.nlppipe import Preprocessor

# ---------------- Settings ----------------
# Folder that holds the raw input file
data_dir = "data"
# Folder that receives the preprocessed output
clean_data_dir = "data/clean_data"
# Raw input file name, looked up inside data_dir
input_file = "performance-sentence.txt"
# Toggle: read pretrained embeddings from a file?
load_embeds = True

# ---------------- Load ----------------
# The input is parsed as tab-separated text
df = pd.read_csv("/".join((data_dir, input_file)), sep="\t")

# ---------------- Preprocess ----------------
# Build the preprocessor over the dataframe
# ("texts" is presumably the name of the text column - confirm against the input file)
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=0)
P.preprocess()

# ---------------- Embeddings ----------------
# Only read the embedding file when requested; adjust the path to wherever
# the vectors were saved
embedding_matrix = P.load_para("../glove.6B.300d.txt") if load_embeds else None

# ---------------- Save ----------------
# Write the preprocessed data (and the embedding matrix, if any) to clean_data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)


---------- Tokenizing Texts ----------


116it [00:00, 2075.86it/s]


Removing 0 low frequency tokens out of 809 total tokens

---------- Getting Skipgrams ----------


116it [00:00, 3418.89it/s]
  all_embs = np.stack(embeddings_index.values())


In [14]:
from lda2vec import utils, model
import tensorflow as tf
# Reset TensorFlow's default graph before building the model
tf.reset_default_graph()

# ---------------- Settings ----------------
# Folder holding the preprocessed data
data_path = "data/clean_data"
# Toggle: also load the saved embedding matrix?
load_embeds = True

# ---------------- Load preprocessed data ----------------
idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids, embed_matrix = (
    utils.load_preprocessed_data(data_path, load_embed_matrix=load_embeds)
)

# ---------------- Sizes derived from the data ----------------
# Document count: largest document id plus one
num_docs = doc_ids.max() + 1
# Vocabulary size (int): one entry per item in freqs
vocab_size = len(freqs)

# Embedding width and pretrained values both follow load_embeds.
# Without pretrained embeddings the width falls back to 128 - pick any size you like.
if load_embeds:
    embed_size = embed_matrix.shape[1]
    pretrained_embeddings = embed_matrix
else:
    embed_size = 128
    pretrained_embeddings = None

# ---------------- Hyperparameters ----------------
# How many topics to cluster into
num_topics = 20
# Passes over the whole dataset
num_epochs = 200
# Batch size - raise or lower to fit available memory
batch_size = 4096
# Epoch at which the LDA loss is "switched on"
switch_loss_epoch = 0
# True -> save logdir, False -> don't
save_graph = True

# ---------------- Build ----------------
m = model(
    num_docs,
    vocab_size,
    num_topics,
    embedding_size=embed_size,
    pretrained_embeddings=pretrained_embeddings,
    freqs=freqs,
    batch_size=batch_size,
    save_graph_def=save_graph,
)

# ---------------- Train ----------------
# The fourth positional argument is the number of pivot ids
m.train(
    pivot_ids,
    target_ids,
    doc_ids,
    len(pivot_ids),
    num_epochs,
    idx_to_word=idx_to_word,
    switch_loss_epoch=switch_loss_epoch,
)



EPOCH: 1
LOSS 532690.7 w2v 110.06113 lda 532580.6

EPOCH: 2
LOSS 532127.8 w2v 111.84721 lda 532015.94

EPOCH: 3
LOSS 531557.3 w2v 105.52188 lda 531451.8

EPOCH: 4
LOSS 531001.3 w2v 113.32855 lda 530888.0

EPOCH: 5
LOSS 530437.06 w2v 112.120476 lda 530324.94


W0626 21:53:03.590957 15668 deprecation.py:506] From C:\Users\Admin\Anaconda3\lib\site-packages\lda2vec\Lda2vec.py:276: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


---------Closest 10 words to given indexes----------
Topic 0 : requestor, hartley, textstyle, arnold, kbit, dominate, decompressed, bureaucracies, gbit, speedracer
Topic 1 : decompressed, requestor, bureaucracies, gbit, kbit, hartley, dominate, deeper, worse, enlarged
Topic 2 : requestor, bureaucracies, hartley, textstyle, queued, decompressed, fft, mbit, arnold, obligationwhilst
Topic 3 : decompressed, requestor, textstyle, hartley, executes, executing, kbit, hertz, bureaucracies, tightest
Topic 4 : textstyle, requestor, decompressed, kbit, arnold, bureaucracies, fft, deliverables, cue, hartley
Topic 5 : requestor, textstyle, fft, decompressed, bureaucracies, kbit, cue, executes, hartley, arnold
Topic 6 : cue, hartley, requestor, decompressed, bureaucracies, queued, instrumenting, aka, textstyle, claude
Topic 7 : textstyle, requestor, decompressed, bureaucracies, hartley, fft, mbit, cue, executes, hll
Topic 8 : textstyle, decompressed, requestor, hartley, arnold, obligationwhilst, gbi

LOSS 520899.25 w2v 71.3859 lda 520827.88

EPOCH: 23
LOSS 520329.7 w2v 55.991825 lda 520273.7

EPOCH: 24
LOSS 519765.34 w2v 45.40099 lda 519719.94

EPOCH: 25
LOSS 519219.44 w2v 52.73049 lda 519166.72
---------Closest 10 words to given indexes----------
Topic 0 : same, example, as, this, addition, can, one, form, is, only
Topic 1 : same, example, as, this, addition, can, one, is, such, only
Topic 2 : same, example, as, this, can, addition, one, form, only, with
Topic 3 : same, as, example, this, addition, can, one, is, only, with
Topic 4 : same, example, as, this, addition, one, can, form, is, only
Topic 5 : same, as, example, this, can, addition, one, is, only, form
Topic 6 : same, example, as, this, addition, one, can, is, only, with
Topic 7 : same, example, as, this, addition, can, one, is, form, only
Topic 8 : same, as, example, this, addition, can, one, is, only, with
Topic 9 : same, example, as, this, addition, can, one, is, only, form
Topic 10 : same, as, example, this, addition, 

Topic 12 : example, same, addition, as, this, one, also, well, and, however
Topic 13 : example, same, addition, as, this, also, one, and, well, however
Topic 14 : example, same, addition, as, this, one, also, and, well, however
Topic 15 : example, same, addition, as, this, one, also, and, well, the
Topic 16 : example, same, addition, as, this, one, also, and, well, however
Topic 17 : example, same, addition, as, this, one, also, and, well, however
Topic 18 : example, same, addition, as, this, one, also, and, well, however
Topic 19 : example, same, addition, as, this, one, also, and, well, however

EPOCH: 46
LOSS 507667.47 w2v 9.097422 lda 507658.38

EPOCH: 47
LOSS 507127.47 w2v 11.785979 lda 507115.7

EPOCH: 48
LOSS 506581.16 w2v 7.9307003 lda 506573.22

EPOCH: 49
LOSS 506039.25 w2v 7.9087505 lda 506031.34

EPOCH: 50
LOSS 505500.75 w2v 10.763532 lda 505490.0
---------Closest 10 words to given indexes----------
Topic 0 : example, same, addition, as, this, one, also, and, well, however
T

---------Closest 10 words to given indexes----------
Topic 0 : example, same, addition, as, this, well, one, also, and, however
Topic 1 : example, same, addition, as, this, well, also, one, and, however
Topic 2 : example, same, addition, as, this, one, well, also, and, however
Topic 3 : example, same, addition, as, this, well, one, also, and, however
Topic 4 : example, same, addition, as, this, also, well, one, however, and
Topic 5 : example, same, addition, as, this, well, one, also, and, however
Topic 6 : example, same, addition, as, this, one, also, well, however, and
Topic 7 : example, same, addition, as, this, well, also, one, and, however
Topic 8 : example, same, addition, as, this, well, also, one, however, and
Topic 9 : example, same, addition, as, this, one, well, also, and, however
Topic 10 : example, same, addition, as, this, well, one, also, and, however
Topic 11 : example, same, addition, as, this, well, also, one, and, however
Topic 12 : example, same, addition, as, this,

Topic 9 : example, same, addition, as, this, well, actually, one, also, and
Topic 10 : example, addition, same, as, this, well, one, actually, also, and
Topic 11 : example, addition, same, as, this, well, actually, also, one, and
Topic 12 : example, addition, same, as, well, this, also, actually, one, however
Topic 13 : example, addition, same, as, this, well, actually, however, also, and
Topic 14 : example, addition, same, as, this, well, one, actually, also, however
Topic 15 : example, addition, same, as, well, this, actually, one, and, also
Topic 16 : example, addition, same, as, this, well, actually, one, also, and
Topic 17 : example, same, addition, as, well, this, actually, however, and, also
Topic 18 : example, same, addition, as, this, well, actually, also, and, however
Topic 19 : example, addition, same, as, well, this, actually, also, and, however

EPOCH: 91
LOSS 483701.4 w2v 4.5916133 lda 483696.8

EPOCH: 92
LOSS 483180.28 w2v 5.2301826 lda 483175.06

EPOCH: 93
LOSS 482658.3

Topic 16 : example, addition, same, actually, as, aka, well, this, one, also
Topic 17 : example, addition, same, actually, well, as, aka, however, this, and
Topic 18 : example, addition, same, actually, well, aka, as, this, however, and
Topic 19 : example, addition, same, actually, well, as, aka, this, however, also

EPOCH: 111
LOSS 473355.0 w2v 4.594372 lda 473350.4

EPOCH: 112
LOSS 472842.34 w2v 4.4502106 lda 472837.9

EPOCH: 113
LOSS 472330.34 w2v 4.4179606 lda 472325.94

EPOCH: 114
LOSS 471818.88 w2v 4.3670545 lda 471814.5

EPOCH: 115
LOSS 471307.9 w2v 4.444316 lda 471303.47
---------Closest 10 words to given indexes----------
Topic 0 : example, addition, same, actually, aka, well, as, this, however, one
Topic 1 : example, addition, same, aka, actually, well, as, this, one, however
Topic 2 : example, addition, same, actually, aka, well, as, this, one, however
Topic 3 : example, addition, same, actually, well, as, aka, one, and, however
Topic 4 : example, addition, same, actually, a

EPOCH: 132
LOSS 462692.2 w2v 4.3536158 lda 462687.84

EPOCH: 133
LOSS 462189.56 w2v 4.323167 lda 462185.25

EPOCH: 134
LOSS 461687.56 w2v 4.365814 lda 461683.2

EPOCH: 135
LOSS 461185.97 w2v 4.405271 lda 461181.56
---------Closest 10 words to given indexes----------
Topic 0 : example, addition, same, aka, actually, well, supposed, reason, although, result
Topic 1 : example, addition, same, aka, actually, reason, well, this, one, although
Topic 2 : example, addition, same, aka, actually, well, one, as, supposed, reason
Topic 3 : example, addition, same, actually, aka, well, as, one, reason, supposed
Topic 4 : example, addition, same, aka, actually, well, one, reason, as, this
Topic 5 : example, addition, same, aka, actually, well, as, particular, supposed, although
Topic 6 : example, addition, same, aka, actually, well, reason, one, as, result
Topic 7 : example, addition, same, aka, actually, well, reason, supposed, although, as
Topic 8 : example, addition, same, actually, aka, well, al

Topic 17 : addition, example, same, aka, actually, reason, well, although, result, supposed
Topic 18 : example, addition, same, aka, actually, reason, well, supposed, result, although
Topic 19 : example, addition, same, aka, actually, reason, supposed, well, although, particular

EPOCH: 151
LOSS 453224.72 w2v 4.3189216 lda 453220.4

EPOCH: 152
LOSS 452731.2 w2v 4.2763667 lda 452726.9

EPOCH: 153
LOSS 452238.16 w2v 4.2559996 lda 452233.9

EPOCH: 154
LOSS 451745.66 w2v 4.353652 lda 451741.3

EPOCH: 155
LOSS 451253.6 w2v 4.3261857 lda 451249.28
---------Closest 10 words to given indexes----------
Topic 0 : example, addition, same, aka, actually, supposed, reason, although, well, particular
Topic 1 : example, addition, same, aka, actually, reason, although, supposed, well, particular
Topic 2 : example, addition, same, aka, actually, reason, well, supposed, although, particular
Topic 3 : example, addition, same, actually, aka, well, reason, supposed, although, particular
Topic 4 : example, 

Topic 10 : example, addition, same, aka, actually, supposed, reason, well, particular, one
Topic 11 : example, addition, same, aka, actually, reason, supposed, well, either, although
Topic 12 : addition, example, same, aka, actually, reason, well, supposed, result, although
Topic 13 : example, addition, same, aka, actually, reason, although, well, either, result
Topic 14 : addition, example, same, aka, actually, reason, result, supposed, although, particular
Topic 15 : example, addition, same, aka, actually, reason, supposed, well, although, result
Topic 16 : example, addition, same, aka, actually, reason, supposed, particular, well, either
Topic 17 : addition, example, same, aka, actually, reason, although, well, supposed, result
Topic 18 : example, addition, same, aka, actually, reason, supposed, result, well, although
Topic 19 : example, addition, same, aka, actually, reason, supposed, although, well, particular

EPOCH: 171
LOSS 443446.06 w2v 4.148689 lda 443441.9

EPOCH: 172
LOSS 4

Topic 3 : example, addition, same, aka, actually, reason, either, particular, supposed, although
Topic 4 : example, addition, same, aka, actually, reason, supposed, particular, well, although
Topic 5 : example, addition, same, aka, actually, reason, particular, supposed, although, well
Topic 6 : example, addition, aka, same, reason, actually, supposed, although, result, part
Topic 7 : example, addition, same, aka, actually, reason, supposed, although, well, particular
Topic 8 : addition, example, same, actually, although, aka, reason, either, supposed, well
Topic 9 : example, addition, same, aka, actually, reason, supposed, particular, although, either
Topic 10 : example, addition, same, aka, actually, reason, supposed, particular, well, one
Topic 11 : example, addition, same, aka, actually, reason, supposed, either, well, particular
Topic 12 : addition, example, same, aka, actually, reason, supposed, result, although, well
Topic 13 : example, addition, same, aka, actually, reason, eit

In [4]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv(output_path='20_newsgroup.csv', subset='train'):
    """Export a split of the 20 newsgroups corpus to a CSV file.

    The posts are fetched with headers, footers and quotes removed. Each post
    is paired with its numeric target and the matching newsgroup name, given
    a ``date`` column holding the export timestamp, and written to
    ``output_path`` (the DataFrame index is written as the first column).

    Parameters
    ----------
    output_path : str, optional
        Destination CSV file. Defaults to ``'20_newsgroup.csv'`` in the
        current working directory.
    subset : str, optional
        Forwarded as the ``subset`` argument of ``fetch_20newsgroups``.
        Defaults to ``'train'``.

    Returns
    -------
    None
    """
    newsgroups = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))

    # Build the frame column-wise so 'target' keeps an integer dtype instead of
    # becoming an object column via a transposed list-of-lists.
    df = pd.DataFrame({'text': newsgroups.data, 'target': newsgroups.target})

    # Row i of this lookup table holds the name of target i
    targets = pd.DataFrame({'title': newsgroups.target_names})

    # Attach the newsgroup name by matching the target id to the lookup's index
    out = pd.merge(df, targets, left_on='target', right_index=True)
    out['date'] = pd.to_datetime('now')
    out.to_csv(output_path)
    
# Fetch the 20 newsgroups training split and write it to 20_newsgroup.csv
twenty_newsgroup_to_csv()

Downloading 20news dataset. This may take a few minutes.
I0626 21:42:12.768491 15668 twenty_newsgroups.py:247] Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
I0626 21:42:12.769488 15668 twenty_newsgroups.py:80] Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
