In [1]:
import pickle
import logging
import gensim
import warnings
import numpy as np
import pandas as pd

In [2]:
# load data
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that this project's own preprocessing step produced.
with open('output/train_corpus.pkl', 'rb') as f:
    train_corpus = pickle.load(f)
with open('output/train_id2word.pkl', 'rb') as f:
    train_id2word = pickle.load(f)
with open('output/train_bigram.pkl', 'rb') as f:
    train_bigram = pickle.load(f)

# Number of leading CSV rows kept as the training set. Named once so the
# comment slice and the label slice below cannot drift apart.
# NOTE(review): presumably equals len(train_corpus), since a later cell indexes
# train_corpus by the row position of documents_train -- confirm.
N_TRAIN_DOCS = 60425

text_comments = pd.read_csv('data/labeled_commit_comments.csv')
# Force every comment to str so missing/non-text cells do not break later text handling.
text_comments.comment = text_comments.comment.astype(str)
documents_train = text_comments[['comment']][:N_TRAIN_DOCS].astype(str)
documents_train['label'] = text_comments['label'][:N_TRAIN_DOCS]
print(len(documents_train))
print(documents_train[:6])

60425
                                             comment label
0  Yeah, but I don't like to post that until *aft...   neg
1                                Cool. Thank you :-)   pos
2  Thanks -- I thought the slides were pretty goo...   pos
3    Edy, 4.2-milestone-1 hasn't been released yet..   pos
4  ^^ sorry but your index.php don't work with me...   pos
5                                                 ;(   pos


In [3]:
# Write log records of level INFO and above to a file, timestamped, so that
# progress can be inspected while the model trains.
logging.basicConfig(
    filename='output/lda_model.log',
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [4]:
# Fixed seed so the learned topics are repeatable across kernel restarts.
# NOTE(review): with workers > 1 gensim may still show small run-to-run
# differences -- confirm if bit-exact reproducibility matters.
LDA_RANDOM_STATE = 42

with warnings.catch_warnings():
    # Suppress all warnings raised inside this block only; the previous
    # warning filters are restored when the block exits.
    warnings.simplefilter('ignore')

    # Train a 20-topic LDA model on the bag-of-words training corpus.
    lda_train = gensim.models.LdaMulticore(
        corpus=train_corpus,
        num_topics=20,
        id2word=train_id2word,
        workers=2,
        passes=2,
        random_state=LDA_RANDOM_STATE,
    )
    # Persist the trained model so it can be reloaded without retraining.
    lda_train.save('output/lda_train.model')

    # The return value is discarded here (not the cell's last top-level
    # expression, so nothing is displayed).
    # NOTE(review): presumably kept so gensim logs the topics to the log file
    # configured in the earlier logging cell -- confirm.
    lda_train.print_topics(20, num_words=15)

In [5]:
# Display the first 10 of the 20 topics, each described by 15 words.
lda_train.print_topics(num_topics=20, num_words=15)[:10]

[(0,
  '0.037*"know" + 0.028*"data" + 0.019*"work" + 0.018*"like" + 0.017*"self" + 0.015*"think" + 0.015*"templat" + 0.014*"need" + 0.013*"thing" + 0.012*"error" + 0.011*"warn" + 0.011*"index" + 0.010*"want" + 0.009*"model" + 0.009*"variabl"'),
 (1,
  '0.063*"method" + 0.024*"mean" + 0.021*"call" + 0.020*"need" + 0.017*"return" + 0.014*"java" + 0.013*"thread" + 0.013*"queri" + 0.011*"connect" + 0.011*"tabl" + 0.011*"like" + 0.010*"match" + 0.010*"http" + 0.009*"string" + 0.009*"option"'),
 (2,
  '0.069*"file" + 0.066*"version" + 0.026*"error" + 0.023*"delet" + 0.017*"includ" + 0.016*"lgtm" + 0.014*"code" + 0.014*"prefer" + 0.011*"chang" + 0.011*"line" + 0.009*"save" + 0.009*"need" + 0.009*"debug" + 0.008*"print" + 0.008*"necessari"'),
 (3,
  '0.044*"class" + 0.043*"check" + 0.027*"null" + 0.023*"true" + 0.018*"pass" + 0.015*"think" + 0.015*"paramet" + 0.013*"valu" + 0.012*"document" + 0.012*"case" + 0.010*"type" + 0.010*"test" + 0.009*"work" + 0.008*"better" + 0.008*"defin"'),
 (4,
  '

In [6]:
# make feature vectors
# Build one dense topic-probability vector per training document, ordered by
# topic id, so every row has the same width and column meaning.
num_topics = lda_train.num_topics  # keeps the vector width tied to the trained model
# Fill value for topics gensim leaves out of its result.
# NOTE(review): float32 is assumed to match the dtype gensim returns, so that
# np.array(train_vecs) keeps a single dtype -- confirm.
missing_prob = np.float32(0.0)

train_vecs = []
for doc_idx in range(len(documents_train)):
    # get_document_topics yields (topic_id, probability) pairs. Look values up
    # by topic id instead of list position: gensim can omit topics whose
    # probability falls below its internal floor even when
    # minimum_probability=0.0, which would shift positions or raise IndexError.
    topic_probs = dict(
        lda_train.get_document_topics(train_corpus[doc_idx], minimum_probability=0.0)
    )
    train_vecs.append(
        [topic_probs.get(topic_id, missing_prob) for topic_id in range(num_topics)]
    )

In [7]:
# Sanity check: number of vectors built, and the total probability mass and
# contents of one sample document's topic vector.
sample_vec = train_vecs[2]
print(len(train_vecs))
print(np.sum(sample_vec))
sample_vec

60425
1.0


[0.312089,
 0.008334031,
 0.008334031,
 0.008334031,
 0.34650117,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.008334031,
 0.19973129,
 0.008334031]

In [12]:
# Feature matrix: one row of topic probabilities per document.
x = np.array(train_vecs)
# Binary target: 1 where the label is 'pos', 0 for anything else.
# Vectorized comparison replaces Series.iteritems(), which was removed in
# pandas 2.0 and would raise AttributeError there.
y = np.where(documents_train['label'] == 'pos', 1, 0)
print(x.shape)
print(y.shape)
print(y[:5])

(60425, 20)
(60425,)
[0 1 1 1 1]


In [13]:
# save datasets
# Pickle the label vector first, then the feature matrix, each to its own file.
for out_path, dataset in (('output/y.pkl', y), ('output/x.pkl', x)):
    with open(out_path, 'wb') as out_file:
        pickle.dump(dataset, out_file)