In [1]:
import json
import numpy as np
import sys

def printf(format, *args):
    sys.stdout.write(format % args)
    
def load_json():
    
    authors = []
    features = []
    scores = []
    
    with open('dataset.json') as data_file:
        data = json.load(data_file)
        labels = data["column_descriptors"]
        author_data = data["author_data"]
        
        for author in author_data.keys():
            authors.append(author)
            features.append(author_data[author]['feature_vecs'])
            scores_str = author_data[author]['scores']
            scores.append(np.asarray([int(score_) for score_ in scores_str]))
    
    features = [np.asarray(feature) for feature in features]
    
    return labels, authors, features, scores

In [2]:
labels, authors, features, scores = load_json()
print (authors[0], features[0].shape, scores[0].shape, "\n")
print (labels)

yamauchi (7, 34) (7,) 

['number_of_modules', 'lines_of_code', 'lines_of_code_per_module', 'McCabes_cyclomatic_complexity', 'McCabes_cyclomatic_complexity_per_module', 'lines_of_comment', 'lines_of_comment_per_module', 'lines_of_code_per_line_of_comment', 'McCabes_cyclomatic_complexity_per_line_of_comment', 'IF4', 'IF4_per_module', 'IF4_visible', 'IF4_visible_per_module', 'IF4_concrete', 'IF4_concrete', 'rejected_lines_of_code\n', 'Files', 'Lines', 'AVG Len', 'Code', 'Comments', 'White SP', 'Cd/Cm+WS', 'Cd/Cm', 'Cd/WS', '% Code', 'cnt_classes', 'max_member_funs', 'max_nested_loops', 'max_nesting_depth', 'max_params_in_decl', 'member_funs', 'member_vars', 'min_member_funs\n']


In [3]:
# y is vector of labels
def create_labels(y):
    y_l = np.copy(y)
    for i in range(y.shape[0]):
        if y[i] >= 80: 
            y_l[i] = 5
        if y[i] >= 60 and y[i] < 80: 
            y_l[i] = 4
        if y[i] >= 40 and y[i] < 60:
            y_l[i] = 3
        if y[i] >= 20 and y[i] < 40:
            y_l[i] = 2
        if y[i] < 20: 
            y_l[i] = 1
    return y_l

In [4]:
# training and test split
import random
def createTrainTestSplit():
    cnt_train = int(0.8 * len(authors)) + 1

    train_authors_indices = set()

    while len(train_authors_indices) < cnt_train:
        train_authors_indices.add(random.randint(0, len(authors) - 1))

    test_authors_indices = set()

    for i in range(len(authors)):
        if i not in train_authors_indices:
            test_authors_indices.add(i)

    train_authors = [authors[i] for i in train_authors_indices]

    test_authors = [authors[i] for i in test_authors_indices]
    
    X_train_list = []
    y_train_list = []
    X_test_list = []
    y_test_list = []

    for index in train_authors_indices:
        X_train_list.extend(features[index])
        y_train_list.extend(scores[index])

    for index in test_authors_indices:
        X_test_list.extend(features[index])
        y_test_list.extend(scores[index])

    X_train = np.asarray(X_train_list)
    y_train = np.asarray(y_train_list)
    X_test = np.asarray(X_test_list)
    y_test = np.asarray(y_test_list)
    
    y_train = create_labels(y_train) 
    y_test = create_labels(y_test)
    
    return X_train, y_train, X_test, y_test

In [5]:
X_train, y_train, X_test, y_test = createTrainTestSplit()
print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(14817, 34) (14817,) (3850, 34) (3850,)


In [6]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

print (clf.score(X_test, y_test))

0.251948051948


In [7]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
print (abc.score(X_test, y_test))


0.291168831169


In [8]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print (gbc.score(X_test, y_test))

0.376623376623


In [9]:
from sklearn.neighbors import KNeighborsClassifier

for n in range(1, 11):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    print ('neighbors', n, "score", neigh.score(X_test, y_test))

neighbors 1 score 0.701038961039
neighbors 2 score 0.648571428571
neighbors 3 score 0.596363636364
neighbors 4 score 0.447272727273
neighbors 5 score 0.39974025974
neighbors 6 score 0.370649350649
neighbors 7 score 0.367532467532
neighbors 8 score 0.354285714286
neighbors 9 score 0.351948051948
neighbors 10 score 0.345194805195


In [10]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=30)
etc.fit(X_train, y_train)
print (etc.score(X_test, y_test))

0.701558441558


In [11]:
from sklearn import tree
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train)
print (dtc.score(X_test, y_test))

0.697922077922


In [12]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print ("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

In [13]:
tree_to_code(dtc, labels)

def tree(number_of_modules, lines_of_code, lines_of_code_per_module, McCabes_cyclomatic_complexity, McCabes_cyclomatic_complexity_per_module, lines_of_comment, lines_of_comment_per_module, lines_of_code_per_line_of_comment, McCabes_cyclomatic_complexity_per_line_of_comment, IF4, IF4_per_module, IF4_visible, IF4_visible_per_module, IF4_concrete, IF4_concrete, rejected_lines_of_code
, Files, Lines, AVG Len, Code, Comments, White SP, Cd/Cm+WS, Cd/Cm, Cd/WS, % Code, cnt_classes, max_member_funs, max_nested_loops, max_nesting_depth, max_params_in_decl, member_funs, member_vars, min_member_funs
):
  if rejected_lines_of_code
 <= 1.5:
    if McCabes_cyclomatic_complexity_per_module <= 25.5:
      if Comments <= 2.5:
        if % Code <= 0.9449999928474426:
          if lines_of_code_per_module <= 44.5:
            if lines_of_code_per_module <= 28.75:
              if lines_of_code_per_module <= 22.5:
                if max_params_in_decl <= 1.5:
                  if Cd/Cm <= 42.5:
          

In [14]:
from sklearn.neural_network import MLPClassifier

alphas = [0.0001, 0.001, 0.01, 0.1]
solvers = ["adam", "lbfgs", "sgd"]
learning_rates = ["constant", "adaptive", "invscaling"]

for alpha in alphas:
    print("Alpha:\t", alpha)
    for solver in solvers:
        printf("Solver:\t%s\n", solver)
        for learning_rate in learning_rates:
            printf("Learning rate:\t%s\n", learning_rate)
            values = []
            for i in range(0, 5):
                X_train, y_train, X_test, y_test = createTrainTestSplit()
                mlpC = MLPClassifier(alpha=alpha, batch_size='auto', learning_rate=learning_rate, learning_rate_init=0.01, power_t=0.5, shuffle=True, max_iter=500)
                mlpC.fit(X_train, y_train)
                values.append(mlpC.score(X_test, y_test))
                
            values.sort()
            print("Median:", values[2])
            print("Average:", sum(values) / 5)
            print("Min:", min(values))
            print("Max:", max(values), "\n")

Alpha:	 0.0001
Solver:	adam
Learning rate:	constant
Median: 0.246599948678
Average: 0.251534727437
Min: 0.223664503245
Max: 0.298391281785 

Learning rate:	adaptive
Median: 0.257264957265
Average: 0.248464341857
Min: 0.207892700542
Max: 0.26766004415 

Learning rate:	invscaling
Median: 0.261464403344
Average: 0.251063405793
Min: 0.20748116254
Max: 0.289089994972 

Solver:	lbfgs
Learning rate:	constant
Median: 0.244052616849
Average: 0.243575092749
Min: 0.233045299817
Max: 0.255561047949 

Learning rate:	adaptive
Median: 0.215318285569
Average: 0.225496741696
Min: 0.200164293538
Max: 0.265101556318 

Learning rate:	invscaling
Median: 0.25332508506
Average: 0.215784144789
Min: 0.0722784057109
Max: 0.266611524676 

Solver:	sgd
Learning rate:	constant
Median: 0.262543137775
Average: 0.24082354489
Min: 0.189726381002
Max: 0.264843965755 

Learning rate:	adaptive
Median: 0.268542199488
Average: 0.270128515227
Min: 0.26125193999
Max: 0.282470017862 

Learning rate:	invscaling
Median: 0.228968

In [15]:
from sklearn.ensemble import RandomForestClassifier

criterions = [ "gini", "entropy"]
estimators_size = [50, 100, 150, 200]

for criterion in criterions:
    for n_estimators in estimators_size:
        values = []
        printf("RFC\t%d estimators\t%s criterion\n", n_estimators, criterion)
        for i in range(0, 5):
            X_train, y_train, X_test, y_test = createTrainTestSplit()
            rfc = RandomForestClassifier(n_estimators=n_estimators, max_features='log2', criterion=criterion)
            rfc.fit(X_train, y_train)
            values.append(rfc.score(X_test, y_test))
        
        values.sort()
        print("Median:", values[2])
        print("Average:", sum(values) / 5)
        print("Min:", min(values))
        print("Max:", max(values), "\n")

RFC	50 estimators	gini criterion
Median: 0.68532526475
Average: 0.695277865318
Min: 0.675807278319
Max: 0.736037788275 

RFC	100 estimators	gini criterion
Median: 0.702797202797
Average: 0.693885194043
Min: 0.646305991856
Max: 0.723763225622 

RFC	150 estimators	gini criterion
Median: 0.678713858425
Average: 0.677775439165
Min: 0.647689375893
Max: 0.711288711289 

RFC	200 estimators	gini criterion
Median: 0.679077521534
Average: 0.691873134892
Min: 0.659303534304
Max: 0.732016925247 

RFC	50 estimators	entropy criterion
Median: 0.689740761169
Average: 0.684652396595
Min: 0.642140468227
Max: 0.722823286965 

RFC	100 estimators	entropy criterion
Median: 0.678914176035
Average: 0.683431585801
Min: 0.646781009197
Max: 0.710540254237 

RFC	150 estimators	entropy criterion
Median: 0.704579025111
Average: 0.693454764829
Min: 0.631990794016
Max: 0.735854189336 

RFC	200 estimators	entropy criterion
Median: 0.666390041494
Average: 0.678228564773
Min: 0.642522269105
Max: 0.73652856756 



In [16]:
from sklearn import tree
export_dtc = tree.DecisionTreeClassifier(max_depth=25)
export_dtc.fit(X_train, y_train)
print (export_dtc.score(X_test, y_test))

0.616320885201


In [17]:
import sklearn.tree

tree.export_graphviz(dtc, 'tree.dot')

In [21]:
from collections import Counter
class Ensemble:
    
    def __init__(self, clfs):
        self.clfs = clfs
        
    def fit(self, X, y):
        for clf in self.clfs:
            clf.fit(X, y)
            
    def predict(self, X):
        predictions = []
        for clf in clfs:
            predictions.append(clf.predict(X))
            
        final_predictions = []
        
        temp = []
        for i in range(len(X)):
            values = []
            for j in range(len(self.clfs)):
                values.append(predictions[j][i])
            temp.append(values)

        for tmp in temp:
            final_predictions.append(Counter(tmp).most_common(1)[0][0])
            
        return final_predictions

In [22]:
from sklearn.metrics import accuracy_score
clfs = []

clfs.append(tree.DecisionTreeClassifier())
clfs.append(RandomForestClassifier(n_estimators=150, max_features='log2', criterion="entropy"))
clfs.append(ExtraTreesClassifier(n_estimators=30))
clfs.append(KNeighborsClassifier(n_neighbors=1))

ens = Ensemble(clfs)

ens.fit(X_train, y_train)

In [23]:
import warnings
warnings.filterwarnings('ignore')

y_pred = ens.predict(X_test)
print ("Ensemble score: ", accuracy_score(y_pred, y_test))

Ensemble score:  0.660857538036
