In [6]:
'''
 * 
 * author Dishant Mittal
 * created on July 10, 2018
 * project MSCI 641
 
 * LDA2Vec is a model that learns dense word vectors jointly with Dirichlet-distributed latent 
 * document-level mixtures of topic vectors. In contrast to continuous dense document 
 * representations, this formulation produces sparse, interpretable document mixtures 
 * through a non-negative simplex constraint.
 *
 
 * In this script, I load the data, clean it, train a word2vec model, and build the bag-of-words (bow),
 * doc_ids, pruned and flattened arrays together with the compact word vectors.
 * Finally, I save all of these objects to disk.
 '''

import logging
import multiprocessing
import pickle
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk
import numpy as np
import csv
import gensim
from keras.layers import Flatten, Dropout
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras import regularizers

# NOTE: the spaCy package must be installed first, otherwise lda2vec will not work properly.
from lda2vec import preprocess, Corpus
# Configure the root logger with the library defaults (no arguments are passed).
logging.basicConfig()

# Load and preprocess data

In [13]:
# Read the three "all-the-news" CSV shards.
df1 = pd.read_csv('./all-the-news/articles1.csv', index_col=False)
df2 = pd.read_csv('./all-the-news/articles2.csv', index_col=False)
df3 = pd.read_csv('./all-the-news/articles3.csv', index_col=False)

# Stack the shards into a single frame. `DataFrame.append` was deprecated in
# pandas 1.4 and removed in pandas 2.0, so `pd.concat` is used instead; like
# `append`, it keeps each shard's own index, which is discarded by the
# `reset_index` below.
df = pd.concat([df1, df2, df3])

# Drop rows whose article text is missing (NaN), an empty string, or a single
# space. The filter is intentionally kept exactly this narrow so the resulting
# document list does not change.
df = df.dropna(subset=['content'])
has_text = (df['content'] != ' ') & (df['content'] != '')
df = df.loc[has_text].reset_index(drop=True)

# Downstream cells work on the article text as a plain list of documents.
df = df.rename(columns={'content': 'body'})
texts = list(df['body'])

print("Data loaded")

# Marker substrings: any whitespace-delimited token containing one of these is discarded
bad = {"ax>", '`@("', '---', '===', '^^^'}


def clean(line):
    """Return `line` without the tokens that contain a `bad` substring.

    The line is split on whitespace, each token is checked against every
    entry of `bad`, and the surviving tokens are re-joined with single spaces.
    """
    kept = []
    for word in line.split():
        if any(marker in word for marker in bad):
            continue
        kept.append(word)
    return ' '.join(kept)

Data loaded


In [8]:
# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode).
# Each document is cleaned exactly once (the previous version called `clean`
# twice per document: once for the emptiness check and once for the value).
cleaned_texts = (str(clean(d)) for d in texts)
# Keep only the documents that are non-empty after cleaning.
texts = [doc for doc in cleaned_texts if len(doc) > 0]


In [12]:
# The tokenization call is kept commented out; the cached arrays are loaded
# from disk instead.
#tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,
#                                    n_threads=4)

tokens = np.load("tokens.npy")
# NOTE(review): `vocab` is unwrapped with `.tolist()` and later pickled, so
# vocab.npy presumably stores a pickled Python object inside an object array.
# NumPy >= 1.16.3 refuses to load such arrays unless allow_pickle=True.
# Unpickling can execute arbitrary code, so only load a vocab.npy you created.
vocab = np.load("vocab.npy", allow_pickle=True)

print("tokens and vocabulary loaded")


tokens and vocabulary loaded


In [5]:
# Unwrap the loaded NumPy array into a plain Python object.
# The isinstance guard keeps this cell re-runnable: after the first run `vocab`
# is no longer an ndarray, and calling `.tolist()` on it again would fail.
if isinstance(vocab, np.ndarray):
    vocab = vocab.tolist()

In [6]:
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words.
# The result used to be bound to the name `clean`, which shadowed the
# `clean()` text-cleaning function and broke any later re-run of the
# preprocessing cell; it now has its own name. Nothing in this cell reads it.
subsampled = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
doc_ids = np.arange(pruned.shape[0])
flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)


# Training word2vec model

In [None]:
# --- Word2Vec hyper-parameters ---
seed = 1                                     # RNG seed passed to the model
num_workers = multiprocessing.cpu_count()    # one training thread per CPU core
num_features = 300                           # dimensionality of each word vector
context_size = 2                             # context window length
min_word_count = 1                           # minimum word count threshold
neg_samples = 10                             # number of negative samples to draw
# NOTE(review): gensim documents that a fixed seed alone is not fully
# deterministic when more than one worker thread is used - confirm whether
# exact reproducibility is required here.

# Tokenize every document with NLTK; each document becomes one training "sentence".
sentences = [nltk.word_tokenize(doc) for doc in texts]

# Train the model (sg=1 and iter=20 are passed straight through to gensim).
word2vec_model = gensim.models.Word2Vec(
    sentences,
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    negative=neg_samples,
    sg=1,
    iter=20,
    seed=seed,
    workers=num_workers,
)

# Persist the trained model so the next cell can load the vectors from disk.
word2vec_model.save('./word_vecs.pkl')
print("saved")

In [7]:
# Sanity check: the flattened token array must not contain negative indices.
assert flattened.min() >= 0
# Fill in the pretrained word vectors
# NOTE(review): n_dim is not read anywhere in this cell - confirm whether it is still needed.
n_dim = 300
# Path written by the word2vec training cell.
trained_wordvc = './word_vecs.pkl'
# NOTE(review): the file was written with gensim's `model.save`; confirm that
# `compact_word_vectors` can read that format. `s` and `f` are presumably
# status counts returned alongside the vectors - verify against lda2vec.
vectors, s, f = corpus.compact_word_vectors(vocab, filename=trained_wordvc)

# Dump all preprocessed files

In [14]:
# Save all of the preprocessed files.
# Pickle streams are bytes, so the files are opened in binary mode ('wb');
# text mode ('w') raises TypeError on Python 3. The `with` blocks make sure
# each file is flushed and closed before the cell finishes.
with open('vocab.pkl', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
# np.save appends the ".npy" extension to each of these names.
np.save("flattened", flattened)
np.save("doc_ids", doc_ids)
np.save("pruned", pruned)
np.save("bow", bow)
np.save("vectors", vectors)
print("saved")

saved
