In [None]:
import pandas as pd
import csv
import sys
import re
import scipy
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time

# Raise the csv module's per-field size limit as high as the platform allows.
# sys.maxsize does not fit in a C long on platforms where long is 32-bit
# (e.g. Windows), where csv.field_size_limit raises OverflowError, so back
# off by powers of ten until a value is accepted.
max_field_size = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_field_size)
        break
    except OverflowError:
        max_field_size //= 10

In [None]:
def process_content(content_list):
    """Flatten a stringified list of source lines into one document string.

    Args:
        content_list: String containing a Python list literal of source
            lines; it is evaluated to obtain the actual list.

    Returns:
        A single string made of every line that is not marked as a comment
        (optional whitespace, ``//``, optional whitespace, ``isComment`` at
        the start of the line), with each newline character replaced by the
        token ``" newLine "`` and the lines joined by single spaces.
    """
    # NOTE(review): eval() executes arbitrary code embedded in the data, so
    # this is only acceptable for trusted input files. If the column always
    # holds plain list literals, ast.literal_eval would be a safer drop-in
    # replacement -- confirm against the data before switching.
    lines = eval(content_list)
    # Raw string: "\s" and "\/" are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    is_comment = re.compile(r"\s*//\s*isComment").match
    return ' '.join(
        line.replace("\n", " newLine ")
        for line in lines
        if not is_comment(line)
    )

def report(results, n_top=3):
    """Print the best-ranked candidates from a hyperparameter search.

    Args:
        results: Mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params', each indexable by candidate
            position (the loop below calls it with ``cv_results_`` of a
            fitted RandomizedSearchCV).
        n_top: Highest rank to print; ranks 1..n_top are reported, and every
            candidate sharing a rank is listed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Several candidates can tie on the same rank; report each of them.
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

# One seed shared by the estimator and the search, so that a fresh
# "Restart & Run All" samples the same candidate settings. Previously only
# the SVC was seeded and the search drew different candidates on every run.
RANDOM_STATE = 42

# Search space for RandomizedSearchCV: C and gamma are drawn from exponential
# distributions, kernel and class_weight are picked from the listed options.
param_dist = {'C': scipy.stats.expon(scale=100),
              'gamma': scipy.stats.expon(scale=.1),
              'kernel': ['rbf', 'linear', 'poly'],
              'class_weight': ['balanced', None]}

svm_classifier = svm.SVC(random_state=RANDOM_STATE)

# Number of parameter settings sampled per search.
n_iter_search = 20
random_search = RandomizedSearchCV(svm_classifier,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=5,
                                   n_jobs=-1,
                                   random_state=RANDOM_STATE)

In [None]:
# For each label: load its train/test CSVs, turn the source content into
# binary bag-of-words features, tune the SVM with the shared randomized
# search, and print accuracy on the held-out test file.
labels = ["tmm", "lc", "dc", "lpl", "lm"]
for label in labels:
    print("===== {} ==============".format(label))
    print("Reading data...")
    df = pd.read_csv('data/df/train_{}.csv'.format(label), engine="python")
    df_test = pd.read_csv('data/df/test_{}.csv'.format(label), engine="python")

    print("Preparing lists...")
    # Column-wise extraction instead of a row-by-row iterrows() loop.
    X_train = [process_content(content) for content in df["content"]]
    Y_train = df["smells"].tolist()
    X_test = [process_content(content) for content in df_test["content"]]
    Y_test = df_test["smells"].tolist()

    print("Extracting features...")
    # binary=True records token presence/absence rather than counts. The
    # vocabulary is learned from the training documents only.
    cv = CountVectorizer(binary=True)
    train_instances = cv.fit_transform(X_train)
    test_instances = cv.transform(X_test)

    # The search runs its own 5-fold cross-validation on the full training
    # set, so the former unused (and unseeded) train_test_split hold-out
    # was dead code and has been removed.
    start = time()
    print("Hyperparameter tuning...")
    random_search.fit(train_instances, Y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)
    print("============ EVALUATION on test set:")
    print(accuracy_score(Y_test, random_search.best_estimator_.predict(test_instances)))