In [12]:
import pandas as pd
import numpy as np
import lda
import gensim
import logging
import warnings
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

logging.getLogger("lda").setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=DeprecationWarning)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent analyzer is built first; each token it yields is then passed
        through ``stemmer.stem``.
        """
        # Name this subclass in super() so the MRO lookup starts right after it;
        # passing CountVectorizer would skip any build_analyzer defined on
        # CountVectorizer itself.
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        # NOTE(review): `stemmer` is a module-level name that is not defined in
        # the visible cells; it must exist before the analyzer is called.
        def stem(doc):
            return [stemmer.stem(x) for x in analyzer(doc)]

        return stem

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent analyzer is built first; each token it yields is then passed
        through ``stemmer.stem``.
        """
        # Name this subclass in super() so the MRO lookup starts right after it;
        # passing TfidfVectorizer would skip any build_analyzer defined on
        # TfidfVectorizer itself.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()

        # NOTE(review): `stemmer` is a module-level name that is not defined in
        # the visible cells; it must exist before the analyzer is called.
        def stem(doc):
            return [stemmer.stem(x) for x in analyzer(doc)]

        return stem

In [16]:
# Sample tag interpolated into every data file name below
# (presumably the number of documents sampled -- TODO confirm).
muestra = 400000

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
# The `with` blocks guarantee the file handles are closed once loading is done.
with open("./data/cvzm-20-400-%s.pickle" % muestra, "rb") as cvz_file:
    cvz = pickle.load(cvz_file)
with open("./data/tvzm-20-400-%s.pickle" % muestra, "rb") as tvz_file:
    tvz = pickle.load(tvz_file)

In [17]:
# Fit a 20-topic LDA model on `cvz` and keep the transformed output in X_topics.
# random_state is fixed so the stochastic fit gives the same topics on every
# re-run (same seed value as the TruncatedSVD cell).
lda_model = lda.LDA(n_topics=20, n_iter=1000, random_state=0)
X_topics = lda_model.fit_transform(cvz)



In [18]:
# Persist the LDA output and the fitted model.
# The `with` blocks close (and therefore flush) each file as soon as it is written.
with open("./data/ldadata-20-400-%s-20.pickle" % muestra, "wb") as lda_data_file:
    pickle.dump(X_topics, lda_data_file)
with open("./data/ldamodel-20-400-%s-20.pickle" % muestra, "wb") as lda_model_file:
    pickle.dump(lda_model, lda_model_file)

In [52]:
# Reduce the document matrix to 10 components with a seeded truncated SVD.
# NOTE(review): the original fitted on `cvz`, but the result is named
# `svd_tfidf` and `tvz` was loaded without ever being used. `tvz` is presumably
# the TF-IDF matrix (file name "tvzm-..."), so it is used here -- confirm.
svd = TruncatedSVD(n_components=10, random_state=0)
svd_tfidf = svd.fit_transform(tvz)

In [53]:
# Persist the SVD-reduced data and the fitted SVD model.
# The `with` blocks close (and therefore flush) each file as soon as it is written.
with open("./data/lsadata-20-400-%s.pickle" % muestra, "wb") as lsa_data_file:
    pickle.dump(svd_tfidf, lsa_data_file)
with open("./data/lsamodel-20-400-%s.pickle" % muestra, "wb") as lsa_model_file:
    pickle.dump(svd, lsa_model_file)

In [5]:
# Display the shape of the array returned by lda_model.fit_transform(cvz).
# NOTE(review): the model is built with n_topics=20, yet the recorded output
# below shows 15 columns, so that output looks stale -- re-run to confirm.
X_topics.shape

(400000, 15)