In [13]:
import pickle
import logging
import gensim
import warnings
import numpy as np
import pandas as pd

In [17]:
# Load the pickled LDA training artefacts and the labelled commit comments.
# NOTE(review): pickle.load can execute arbitrary code on load; these files are
# assumed to be produced by this project's own preprocessing -- confirm before
# pointing the paths at anything from an untrusted source.
with open('output/train_corpus.pkl', 'rb') as f:
    train_corpus = pickle.load(f)
with open('output/train_id2word.pkl', 'rb') as f:
    train_id2word = pickle.load(f)
with open('output/train_bigram.pkl', 'rb') as f:
    train_bigram = pickle.load(f)

# Number of leading CSV rows used as the training split. Named once here
# instead of repeating the literal in every slice.
N_TRAIN = 60425

text_comments = pd.read_csv('data/labeled_commit_comments.csv')
# Force every comment to str so missing/numeric cells do not leak through as floats.
text_comments['comment'] = text_comments['comment'].astype(str)

# Take comment and label together in one positional slice so the two columns
# cannot drift apart; .copy() keeps later edits from touching text_comments.
documents_train = text_comments[['comment', 'label']].iloc[:N_TRAIN].copy()

# Later cells index train_corpus by document position, so fail here with a
# clear message rather than with an IndexError halfway through.
if len(train_corpus) < len(documents_train):
    raise ValueError(
        'train_corpus has %d entries but %d training documents were selected'
        % (len(train_corpus), len(documents_train))
    )

print(len(documents_train))
print(documents_train[:6])

60425
                                             comment label
0  Yeah, but I don't like to post that until *aft...   neg
1                                Cool. Thank you :-)   pos
2  Thanks -- I thought the slides were pretty goo...   pos
3    Edy, 4.2-milestone-1 hasn't been released yet..   pos
4  ^^ sorry but your index.php don't work with me...   pos
5                                                 ;(   pos


In [3]:
# Write INFO-and-above log records to a file so training progress can be
# followed while the model is being fitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
    filename='output/lda_model.log',
)

In [5]:
# Fit an LDA topic model on train_corpus, save it to disk, and show its topics.
NUM_TOPICS = 20
# Fixed seed so topic assignments (and every feature derived from them) can be
# regenerated. NOTE(review): with multiple workers gensim may still show small
# run-to-run differences -- confirm if exact reproducibility matters.
RANDOM_SEED = 42

with warnings.catch_warnings():
    # Silence library warnings raised while training/saving.
    warnings.simplefilter('ignore')

    lda_train = gensim.models.LdaMulticore(
        corpus=train_corpus,
        num_topics=NUM_TOPICS,
        id2word=train_id2word,
        workers=2,
        passes=2,
        random_state=RANDOM_SEED,
    )
    lda_train.save('output/lda_train.model')

# Kept as the final top-level expression so the notebook actually renders the
# returned topic list; inside the `with` block the value was discarded.
lda_train.print_topics(NUM_TOPICS, num_words=15)

In [6]:
# Build one dense topic-probability vector per training document.
#
# get_document_topics returns a sparse list of (topic_id, probability) pairs.
# Indexing that list by position assumes every topic is present and in id
# order, which does not hold when a topic's probability falls below the
# library's internal floor and is dropped. Scattering by topic_id keeps each
# probability in its own column regardless of which topics are returned.
n_topics = lda_train.num_topics
train_vecs = []
for doc_idx in range(len(documents_train)):
    top_topics = lda_train.get_document_topics(train_corpus[doc_idx], minimum_probability=0.0)
    # sparse2full yields a length-n_topics float32 array with zeros for any
    # missing topic; list(...) keeps the per-document value a plain list.
    topic_vec = list(gensim.matutils.sparse2full(top_topics, n_topics))
    train_vecs.append(topic_vec)

In [11]:
# Sanity check: number of vectors, and that one sample vector's entries sum
# to the printed total; the sample itself is shown as the cell output.
sample_vec = train_vecs[2]
print(len(train_vecs))
print(np.sum(sample_vec))
sample_vec

60425
1.0


[0.008335098,
 0.21885021,
 0.008335098,
 0.008335098,
 0.008335098,
 0.008335098,
 0.008335098,
 0.18260145,
 0.008335098,
 0.008335098,
 0.008335099,
 0.008335098,
 0.008335098,
 0.008335098,
 0.45685163,
 0.008335098,
 0.008335098,
 0.008335098,
 0.008335098,
 0.008335098]

In [18]:
# Turn the per-document topic vectors and the label column into arrays,
# then report both shapes (one per line).
x = np.array(train_vecs)
y = np.array(documents_train['label'])
print(x.shape, y.shape, sep='\n')

(60425, 20)
(60425,)


In [19]:
# Pickle the label array and the feature array to the output directory
# (labels first, then features).
for out_path, payload in (('output/y.pkl', y), ('output/x.pkl', x)):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)