In [2]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
import nltk
import pickle

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)
# Download the NLTK 'wordnet' package (presumably the data WordNetLemmatizer
# relies on — confirm against the NLTK docs).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/esmeralda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Load the labeled commit comments, force the comment column to str,
# and keep the first 60425 rows (comment column only) as the documents.
text_comments = pd.read_csv('data/labeled_commit_comments.csv')
text_comments['comment'] = text_comments['comment'].astype(str)
documents = text_comments[['comment']].iloc[:60425].astype(str)
print(len(documents))
print(documents.head(6))

60425
                                             comment
0  Yeah, but I don't like to post that until *aft...
1                                Cool. Thank you :-)
2  Thanks -- I thought the slides were pretty goo...
3    Edy, 4.2-milestone-1 hasn't been released yet..
4  ^^ sorry but your index.php don't work with me...
5                                                 ;(


In [5]:
# auxiliary functions
def lemmatize_stemming(text):
    """Lemmatize `text` as a verb, then return the Porter stem of the lemma."""
    # Step 1: WordNet lemmatization, treating the token as a verb (pos='v').
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    # Step 2: Porter stemming of the lemmatized token.
    return PorterStemmer().stem(lemma)

def bigrams(words, bi_min=15, tri_min=10):
    """
    Train a gensim phrase-detection model on `words` and return it as a Phraser.

    words   -- the tokenized documents handed to gensim.models.Phrases
    bi_min  -- forwarded to Phrases as `min_count`
    tri_min -- accepted but not used by this function
    """
    # Fit the phrase model, then wrap it in a Phraser, the exported form
    # intended to use less RAM and process faster.
    phrase_model = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrase_model)

def remove_stopwords(text):
    """
    Tokenize `text` and return its lemmatized, stemmed tokens.

    The input is coerced to str and tokenized with simple_preprocess. Tokens
    that are in STOPWORDS or are 3 characters or shorter are dropped; each
    remaining token is passed through lemmatize_stemming.
    """
    return [
        lemmatize_stemming(tok)
        for tok in simple_preprocess(str(text))
        if len(tok) > 3 and tok not in STOPWORDS
    ]

def get_corpus(df):
    """
    Build the bag-of-words inputs for LDA from a frame of comments.

    Pipeline:
      1. Remove stopwords and apply lemmatization and stemming to every entry
         of df['comment'] (via remove_stopwords).
      2. Train a bigram model on the resulting token lists and apply it to
         each comment.
      3. Build a gensim Dictionary from the bigram-transformed comments and
         prune it with filter_extremes.
      4. Convert every comment to its bag-of-words representation.

    Parameters:
        df: DataFrame with a 'comment' column holding the raw comment text.

    Returns:
        corpus:  list with one doc2bow result per comment (sparse vector of
                 integer word ids and their occurrence counts in that
                 comment); needed for LDA.
        id2word: the pruned gensim Dictionary, i.e. the mapping between words
                 and their integer ids.
        bigram:  list of token lists, one per comment, after the bigram model
                 has been applied.
    """
    # Use the frame passed in by the caller rather than the notebook-level
    # `documents` variable, so the function works on any input frame.
    words = df['comment'].map(remove_stopwords)

    # Keep the trained model and its output under separate names.
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[comment] for comment in words]

    id2word = gensim.corpora.Dictionary(bigram)
    # Prune the vocabulary: no_below=10 is an absolute document count,
    # no_above=0.35 is a fraction of the corpus.
    id2word.filter_extremes(no_below=10, no_above=0.35)
    # Reassign word ids to remove gaps left by the pruning.
    id2word.compactify()

    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [12]:
# Build the bag-of-words corpus, the dictionary and the bigram-transformed
# token lists from the loaded comments.
corpus, id2word, bigram = get_corpus(documents)

In [13]:
# Pickle the corpus, dictionary and bigram-transformed comments under data/.
for _pkl_path, _artifact in (
    ('data/train_corpus.pkl', corpus),
    ('data/train_id2word.pkl', id2word),
    ('data/train_bigram.pkl', bigram),
):
    with open(_pkl_path, 'wb') as f:
        pickle.dump(_artifact, f)