In [1]:
import pandas as pd
import csv
import sys
import re
import scipy
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time

# Lift the csv module's per-field size cap as far as the platform allows.
# sys.maxsize does not fit into a 32-bit C long (e.g. on Windows) and makes
# field_size_limit raise OverflowError there, so fall back to the largest
# 32-bit signed value in that case.
try:
    previous_field_limit = csv.field_size_limit(sys.maxsize)
except OverflowError:
    previous_field_limit = csv.field_size_limit(2**31 - 1)
# Keep the cell output: field_size_limit returns the limit that was in
# effect before the call.
previous_field_limit

131072

In [11]:
def process_content(file_path):
    """Return the text of ``file_path`` with marked comment lines removed.

    A line is dropped when it starts with optional whitespace, then ``//``,
    optional whitespace and the literal marker ``isComment``. Every other
    line is kept unchanged (including its line ending) and the kept lines
    are joined with a single space.

    Parameters
    ----------
    file_path : str or path-like
        Path of the source file to read (opened in text mode).

    Returns
    -------
    str
        The filtered file content; an empty string for an empty file.
    """
    # Raw string: in a normal literal "\s" and "\/" are invalid escape
    # sequences and trigger a warning. Compiled once, outside the loop.
    comment_marker = re.compile(r"\s*//\s*isComment")
    with open(file_path, "r") as file:
        # Iterate the file lazily instead of materialising it via readlines().
        source_code = [line for line in file if not comment_marker.match(line)]
    return ' '.join(source_code)

In [12]:
# Load the pickled instance table. Unpickling can execute arbitrary code,
# so this file must come from a trusted source.
df = pd.read_pickle('../data/instances.pkl')
# Distinct values of the 'target' column (order is whatever the set yields).
labels = list({target for target in df['target'].values})

LabelEncoder()

In [13]:
print("Preparing lists...")
# Read the two columns directly instead of going through df.iterrows():
# iterrows builds a Series for every row (slow) and does not preserve the
# column dtypes, which can silently change the type of the target values.
# X: filtered file content for each path in the 'source_code' column.
X = [process_content(source_path) for source_path in df["source_code"]]
# Y: the matching target value for each instance, in the same row order.
Y = df["target"].tolist()

Preparing lists...


In [14]:
# Encode the targets as integers for use in logistic regression.
# fit() returns the encoder itself, so construction and fitting are chained.
le = preprocessing.LabelEncoder().fit(labels)
# Note: Y is rebound from raw targets to their encoded form here.
Y = le.transform(Y)

In [15]:
print("Extracting features...")
# binary=True: each feature records presence/absence of a token, not a count.
cv = CountVectorizer(binary=True)
# fit_transform learns the vocabulary and builds the document-term matrix in
# one pass; calling fit(X) and then transform(X) tokenises the corpus twice.
# NOTE(review): the vocabulary is learned from all of X, i.e. also from the
# rows that later end up in the test split - confirm this is intended.
instances = cv.fit_transform(X)

Extracting features...


In [16]:
# Hold out 25% of the instances for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    instances,
    Y,
    train_size=0.75,
    random_state=42,
)



# Default parameters

In [17]:
# Baseline model: library-default hyperparameters, fixed seed, solver
# progress output enabled. fit() returns the estimator, so it is chained.
lr_classifier = LogisticRegression(random_state=42, verbose=1).fit(X_train, y_train)
# Last expression of the cell: display the fitted estimator's repr.
lr_classifier



[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=1, warm_start=False)

In [18]:
print("============ EVALUATION on test set:")
# Accuracy of the baseline classifier on the held-out split.
baseline_predictions = lr_classifier.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print(baseline_accuracy)

0.7723552591694453


# Hyperparameter tuning

In [None]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates in ``results``.

    ``results`` is indexed with the keys 'rank_test_score',
    'mean_test_score', 'std_test_score' and 'params'. For every rank from 1
    to ``n_top`` (inclusive), each candidate holding that rank is printed
    with its mean score, standard deviation and parameter setting, followed
    by a blank line. Ties on a rank are all printed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Positions of every candidate that shares this rank.
        for position in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][position]
            std_score = results['std_test_score'][position]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][position]))
            print("")

# Search space: C is drawn from an exponential distribution with scale=100.
param_dist = {'C': scipy.stats.expon(scale=100)}

lr_classifier2 = LogisticRegression(random_state=42)

# Number of parameter settings that are sampled and cross-validated.
n_iter_search = 20
random_search = RandomizedSearchCV(lr_classifier2,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=5,
                                   n_jobs=-1,
                                   # Seed the sampler so the same candidate
                                   # values of C are drawn on every run.
                                   random_state=42)
start = time()
print("Hyperparameter tuning...")
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
  " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)
print("============ EVALUATION on test set:")
# Score the best estimator on the held-out split produced by
# train_test_split, whose names are X_test / y_test; the names Y_test and
# test_instances are not defined anywhere and raised NameError.
print(accuracy_score(y_test, random_search.best_estimator_.predict(X_test)))