In [9]:
import pandas as pd
import csv
import sys
import re
import scipy
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time

csv.field_size_limit(sys.maxsize)

9223372036854775807

In [10]:
metrics = ['cbo','wmc','rfc','lcom','nom','nopm','nosm','nof','nopf','nosf','nosi','loc', "commits","linesAdded","linesDeleted","authors","minorAuthors","majorAuthors","authorOwnership"]

def get_metrics(row):
    features = []
    for metric in metrics:
        features.append(float(row[metric]))
    return features

In [11]:
df = pd.read_pickle('../data/instances.pkl')
labels = list(set(df['target'].values))

In [12]:
X = []
Y = []

print("Preparing lists...")
for index, row in df.iterrows():
    X.append(get_metrics(row))
    Y.append(row["target"])

Preparing lists...


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.75, random_state=42)



# Default parameters

In [14]:
rf_classifier = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)
rf_classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [15]:
print("============ EVALUATION on test set:")
print(accuracy_score(y_test, rf_classifier.predict(X_test)))

0.7972112761442861


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [16]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

rf_classifier = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)

n_iter_search = 20
random_search = RandomizedSearchCV(rf_classifier,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=5,
                                   n_jobs=-1)
start = time()
print("Hyperparameter tuning...")
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
  " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)
print("============ EVALUATION on test set:")
print(accuracy_score(y_test, random_search.best_estimator_.predict(X_test)))

Hyperparameter tuning...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished


RandomizedSearchCV took 41.99 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.788 (std: 0.009)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 9}

Model with rank: 2
Mean validation score: 0.787 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 6, 'min_samples_split': 4}

Model with rank: 3
Mean validation score: 0.786 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_split': 3}

0.802667474992422
