In [85]:
from elasticsearch import Elasticsearch, helpers
import json
import numpy as np

In [3]:
# Create the Elasticsearch client; no hosts are passed, so the library's
# default connection settings apply.
es = Elasticsearch()

In [4]:
# Sanity check: count the 'do-comment' documents stored in the 'do' index.
es.count(index='do', doc_type='do-comment')

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, u'count': 1186071}

## HTML strip

In [5]:
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Feed markup with ``feed()`` and read the concatenated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the real base initialiser instead of a bare ``reset()``: on
        # Python 2 it does the same thing, and on Python 3 it also sets the
        # attributes (e.g. ``convert_charrefs``) that ``feed()`` relies on.
        # An explicit base call is used rather than ``super()`` because
        # Python 2's HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined without a separator."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return ``html`` with all markup removed, keeping only its text."""
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

from bs4 import BeautifulSoup

def strip_specific_tags(html, *tags):
    """Return the text of ``html`` after dropping the given elements entirely.

    Every element whose name is listed in ``tags`` is removed together with
    its contents; the text of whatever remains is returned.

    html -- HTML markup as a string
    tags -- names of the elements to remove, e.g. 'code'
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the result can
    # differ from one machine to the next.
    soup = BeautifulSoup(html, 'html.parser')
    for tag_name in tags:
        # Separate names for the tag name and the matched elements keep the
        # two loops from sharing one variable.
        for element in soup.find_all(tag_name):
            element.extract()

    return soup.get_text()

## Prepare data

In [6]:
import string
from nltk.stem.snowball import SnowballStemmer
from sklearn import cross_validation

In [181]:
def prepareText(body):
    """Turn an HTML text into a space-separated string of word stems.

    Steps: drop <code> elements with their contents, strip the remaining
    markup, delete ASCII punctuation, stem each whitespace-separated word
    with the English Snowball stemmer, and discard tokens that start with a
    digit.

    body -- HTML markup as a string
    Returns the surviving stems joined by single spaces.
    """
    text = strip_tags(strip_specific_tags(body, 'code'))

    # Punctuation is deleted rather than replaced by a space, so "foo,bar"
    # becomes the single token "foobar".
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)

    stemmer = SnowballStemmer('english')
    stems = (stemmer.stem(word) for word in text.split())

    # "Starts with a digit" already covers the all-digit case; the truthiness
    # check guards the index against an empty stem.
    words = [stem for stem in stems if stem and not stem[0].isdigit()]

    # str.join replaces the deprecated, Python 2-only string.join().
    return ' '.join(words)

In [182]:
# Count the 'do-issue' documents in the 'do' index. Keyword arguments match
# the 'do-comment' count call earlier in the notebook and do not depend on the
# client's positional parameter order.
count_result = es.count(index='do', doc_type='do-issue')
print(count_result['count'])

73787


In [None]:
# Pull every 'do-issue' document out of the 'do' index with the scan helper
# and materialise the hits as a list.
# Optional filter, currently disabled (would be passed as query=...):
#   {"query":{"bool":{"must":[{"prefix":{"field_issue_version":"8.1"}}],"must_not":[],"should":[]}},"from":0,"sort":[]}
entries = list(helpers.scan(es, index="do", doc_type="do-issue"))

In [None]:
def preprocess_entries(result):
    """Build parallel lists of cleaned texts and labels from search hits.

    A hit is used only if it is a dict whose ``_source`` has a
    ``field_issue_category`` key, a ``field_issue_version`` key and a dict
    ``body`` containing ``value``; every other hit is skipped.

    result -- iterable of hits
    Returns ``(word_data, category_data)``: the prepared body text of each
    usable hit and, at the same index, its ``field_issue_version`` value.
    """
    word_data = []
    category_data = []
    for entry in result:
        if not isinstance(entry, dict):
            continue
        source = entry.get('_source')
        if not isinstance(source, dict):
            continue
        # NOTE(review): hits are filtered on 'field_issue_category' while the
        # label is read from 'field_issue_version' -- confirm which field is
        # the intended label. Both keys are required here so that a hit
        # lacking the version is skipped instead of raising KeyError.
        if 'field_issue_category' not in source or 'field_issue_version' not in source:
            continue
        body = source.get('body')
        if not isinstance(body, dict) or 'value' not in body:
            continue
        word_data.append(prepareText(body['value']))
        category_data.append(source['field_issue_version'])
    return word_data, category_data
            
# Clean every fetched issue; category_data receives each kept issue's
# field_issue_version value.
word_data, category_data = preprocess_entries(entries)

## TF-IDF vectorization

In [None]:
# Hold out 10% of the issues for testing. The fixed random_state makes the
# split -- and the accuracy figures computed from it -- repeatable across runs.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, category_data, test_size=0.1, random_state=42
)

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the TF-IDF vocabulary and weights from the training texts only, then
# apply the fitted vectorizer to the test texts.
vec = TfidfVectorizer(stop_words='english', lowercase=True)
features_train_transformed = vec.fit_transform(features_train)
features_test_transformed = vec.transform(features_test)

In [None]:
# Keep only the top 1% of the TF-IDF features as scored by f_classif against
# the training labels, and convert the reduced matrices to dense arrays.
# NOTE: this rebinds the *_transformed names, so re-run the vectorizer cell
# before re-running this one.
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(features_train_transformed, labels_train)
features_train_transformed = selector.transform(features_train_transformed).toarray()
features_test_transformed  = selector.transform(features_test_transformed).toarray()    

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

In [None]:
# Train a Gaussian naive Bayes model on the selected features, then predict
# labels for the held-out issues.
clf = GaussianNB().fit(features_train_transformed, labels_train)
labels_pred = clf.predict(features_test_transformed)

In [None]:
# Fraction of held-out issues whose predicted label matches the true one.
print(accuracy_score(labels_test, labels_pred))

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Train an RBF-kernel support vector classifier (C=1000) on the selected
# features, then predict labels for the held-out issues.
clf = SVC(C=1000., kernel="rbf")
clf.fit(features_train_transformed, labels_train)
labels_pred = clf.predict(features_test_transformed)

In [None]:
# Accuracy of the SVM predictions on the held-out issues.
print(accuracy_score(labels_test, labels_pred))

## Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Train a decision tree on the selected features, then predict labels for the
# held-out issues. The fixed random_state keeps the fitted tree -- and the
# feature importances read from it below -- the same from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(features_train_transformed, labels_train)

labels_pred = clf.predict(features_test_transformed)

In [None]:
# Accuracy of the decision-tree predictions on the held-out issues.
print(accuracy_score(labels_test, labels_pred))

In [None]:
# Largest importance the fitted tree assigns to any single feature.
print(np.max(clf.feature_importances_))

In [None]:
# feature_importances_ is indexed over the columns kept by `selector`, not over
# the vectorizer's full vocabulary, so map back through the selection mask
# before looking up the word.
selected_feature_names = np.asarray(vec.get_feature_names())[selector.get_support()]
print(selected_feature_names[np.argmax(clf.feature_importances_)])