In [2]:
import ast
import csv
import pickle
import re
import sys
from time import time

import numpy as np
import pandas as pd
import scipy
from scipy.stats import randint as sp_randint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Raise the csv module's per-field size cap as high as the platform allows.
# The limit is converted to a C long, which is narrower than sys.maxsize on
# some platforms (e.g. 64-bit Windows); there the direct call raises
# OverflowError, so halve the candidate until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

131072

In [3]:
def process_content(content_list):
    """Flatten a serialized list of source lines into a single string.

    Parameters
    ----------
    content_list : str
        Text of a Python list literal whose items are strings, one item per
        source line (e.g. ``"['int a;\\n', 'return a;']"``).

    Returns
    -------
    str
        The kept lines joined by single spaces. Lines that start with
        optional whitespace followed by ``// isComment`` are dropped, and
        every newline character is replaced by the token ``" newLine "``.

    Raises
    ------
    ValueError, SyntaxError
        If ``content_list`` is not a valid Python literal.
    """
    # SECURITY: the text comes from CSV cells, so it must not be executed.
    # ast.literal_eval only accepts literals (lists, strings, numbers, ...),
    # unlike eval(), which would run any expression embedded in the data.
    lines = ast.literal_eval(content_list)
    kept = [
        line.replace("\n", " newLine ")
        for line in lines
        # Raw string: "\s" and "\/" are not valid Python string escapes.
        # re.match anchors at the start, so only leading comment markers count.
        if not re.match(r"\s*\/\/\s*isComment", line)
    ]
    return ' '.join(kept)

def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexable per candidate. 'rank_test_score' is compared
        element-wise against each rank, so it is expected to be array-like.
    n_top : int, default 3
        Ranks 1..n_top are printed. Candidates tied on a rank are all
        printed, so more than ``n_top`` entries can appear.
    """
    for rank in range(1, n_top + 1):
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            score_std = results['std_test_score'][idx]
            summary = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    mean_score, score_std),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # trailing blank line separates candidates
            ]
            print("\n".join(summary))

# Search space: C (LogisticRegression's inverse regularization strength) is
# drawn from an exponential distribution with scale 100.
param_dist = {'C': scipy.stats.expon(scale=100)}

lr_classifier = LogisticRegression(random_state=42)

# Number of parameter settings sampled per search.
n_iter_search = 20
# random_state pins the sampled C values so a re-run reproduces the same
# candidates; without it every run draws a different set.
random_search = RandomizedSearchCV(lr_classifier,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=5,
                                   n_jobs=-1,
                                   random_state=42)

In [5]:
# For each label: load its train/test CSVs, turn the serialized source lines
# into binary bag-of-words features, tune C with the shared random_search
# object, and print the accuracy of the best estimator on the test split.
labels = ["tmm", "lc", "dc", "lpl", "lm"]
for label in labels:
    print("===== {} ==============".format(label))
    print("Reading data...")
    df = pd.read_csv('data/df/train_{}.csv'.format(label), engine="python")
    df_test = pd.read_csv('data/df/test_{}.csv'.format(label), engine="python")

    print("Preparing lists...")
    # Column-wise access replaces the row-by-row iterrows() accumulation.
    X_train = [process_content(content) for content in df["content"]]
    Y_train = df["smells"].tolist()
    X_test = [process_content(content) for content in df_test["content"]]
    Y_test = df_test["smells"].tolist()

    print("Extracting features...")
    # binary=True records token presence/absence rather than counts. The
    # vocabulary is learned from the training texts only and reused for test.
    cv = CountVectorizer(binary=True)
    train_instances = cv.fit_transform(X_train)
    test_instances = cv.transform(X_test)

    # No manual hold-out split is made here: RandomizedSearchCV performs its
    # own cross-validation on the full training matrix.
    start = time()
    print("Hyperparameter tuning...")
    random_search.fit(train_instances, Y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)
    print("============ EVALUATION on test set:")
    print(accuracy_score(Y_test, random_search.best_estimator_.predict(test_instances)))

Reading data...
Preparing lists...
Extracting features...
Hyperparameter tuning...




RandomizedSearchCV took 151.05 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.632 (std: 0.025)
Parameters: {'C': 226.21584447040712}

Model with rank: 1
Mean validation score: 0.632 (std: 0.025)
Parameters: {'C': 279.83659756747403}

Model with rank: 3
Mean validation score: 0.632 (std: 0.025)
Parameters: {'C': 302.523804331414}

0.6135338345864662
Reading data...
Preparing lists...
Extracting features...




Hyperparameter tuning...




RandomizedSearchCV took 1.95 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 28.108601949536823}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 20.586448400482407}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 40.63622226326734}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 18.449166701479623}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 49.67448386029197}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 6.488245122086524}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 4.768536532889151}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 52.2620638897252}

Model with rank: 1
Mean validation score: 0.800 (std: 0.150)
Parameters: {'C': 20.78853711584202}

Model with rank: 1
Mean validati



Hyperparameter tuning...




RandomizedSearchCV took 7.00 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.889 (std: 0.028)
Parameters: {'C': 6.711407183845483}

Model with rank: 2
Mean validation score: 0.889 (std: 0.029)
Parameters: {'C': 11.513372919994314}

Model with rank: 3
Mean validation score: 0.888 (std: 0.027)
Parameters: {'C': 5.391865337022511}

0.8405063291139241
Reading data...
Preparing lists...
Extracting features...




Hyperparameter tuning...




RandomizedSearchCV took 0.48 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.944 (std: 0.051)
Parameters: {'C': 21.316923787052225}

Model with rank: 1
Mean validation score: 0.944 (std: 0.051)
Parameters: {'C': 4.983156233561143}

Model with rank: 1
Mean validation score: 0.944 (std: 0.051)
Parameters: {'C': 42.900458853222126}

Model with rank: 1
Mean validation score: 0.944 (std: 0.051)
Parameters: {'C': 28.328174907451185}

Model with rank: 1
Mean validation score: 0.944 (std: 0.051)
Parameters: {'C': 32.214205486055704}

Model with rank: 1
Mean validation score: 0.944 (std: 0.051)
Parameters: {'C': 3.671103087822675}

0.680327868852459
Reading data...
Preparing lists...
Extracting features...
Hyperparameter tuning...




RandomizedSearchCV took 57.40 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.650 (std: 0.055)
Parameters: {'C': 8.427064659721047}

Model with rank: 2
Mean validation score: 0.647 (std: 0.057)
Parameters: {'C': 16.80893997157763}

Model with rank: 2
Mean validation score: 0.647 (std: 0.057)
Parameters: {'C': 17.63232694380791}

0.6809895833333334
