In [3]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
import nltk
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import f1_score

np.random.seed(2018)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/esmeralda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# load data and obtained unseen splice
text_comments = pd.read_csv('data/labeled_commit_comments.csv')
text_comments.comment = text_comments.comment.astype(str)
documents_test = text_comments[['comment']][60425:181275].astype(str)
print(len(documents_test))
print(documents_test[:6])

120850
                                                 comment
60425  No, that is not the best way possible.\\n\\nTh...
60426  \\n>Undo/redo conceptually are concepts that c...
60427  I don't understand why Screenplay doesn't deci...
60428  On Tue, Jul 10, 2012 at 2:05 AM, Osku Salerma\...
60429  I have to digest these options a bit before I ...
60430  > I have to digest these options a bit before ...


In [6]:
# load trained id2word
with open('output/train_id2word.pkl', 'rb') as f:
    train_id2word = pickle.load(f)

In [7]:
# process data
def lemmatize_stemming(text):
    """ stem and lemmatize text """
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def bigrams(words, bi_min=15, tri_min=10):
    """ make a model that detects common phrases (unigram and bigram)"""
    # train toy bigram model
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    # export trained model to use less RAM and have faster processing
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def remove_stopwords(text):
    """ apply preprocessing: lemmatization, stemming to all text """
    result = []
    for token in simple_preprocess(str(text)):
        if token not in STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
    return result


def get_bigram(df):
    """
    need to generate bigram of unseen data
    """
    words = documents['comment'].map(remove_stopwords)
    bigram = bigrams(words)
    bigram = [bigram[comment] for comment in words]
    return bigram

bigram_test = get_bigram(documents_test)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

In [8]:
# load trained LDA model
lda_train = gensim.models.ldamulticore.LdaMulticore.load('output/lda_train.model')

In [9]:
# make feature vectors of test corpus
test_vecs = []
for i in range(len(documents_test)):
    top_topics = lda_train.get_document_topics(test_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(20)]
    test_vecs.append(topic_vec)

In [11]:
len(test_vecs)
x = np.array(test_vecs)

documents_test['label'] = text_comments['label'][60425:181275]
y = np.array([1 if value=='pos' else 0 for idx, value in documents_test.label.iteritems()])

In [13]:
# Run SVM on test feature vectors
scaler = StandardScaler()
x = scaler.fit_transform(x)

# SGD Hinge
sgd_hinge = linear_model.SGDClassifier(
    max_iter=1000,
    alpha=20,
    loss='hinge',
    class_weight='balanced',shuffle=True
).fit(x, y)
    
y_pred = sgd_hinge.predict(x)

print(f1_score(y, y_pred, average='binary'))

0.913722511606585
