In [1]:
import json
import numpy as np
import sys

def printf(format, *args):
    """C-style printf: write ``format % args`` to stdout without appending a newline."""
    rendered = format % args
    sys.stdout.write(rendered)
    
def load_json(path='dataset.json'):
    """Load the per-author dataset from a JSON file.

    The file must contain two top-level keys:
    "column_descriptors" (returned unchanged as ``labels``) and
    "author_data", a mapping from author name to a record holding
    'feature_vecs' and 'scores'.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to 'dataset.json' in the current
        working directory, which is what the notebook originally hard-coded.

    Returns
    -------
    labels
        The "column_descriptors" value exactly as stored in the file.
    authors : list
        Keys of "author_data", in the order json.load yields them.
    features : list of numpy.ndarray
        One array per author, built from that author's 'feature_vecs'.
    scores : list of numpy.ndarray
        One integer array per author; every entry of 'scores' is passed
        through int(), so numeric strings are accepted.
    """
    # Only the parsing needs the file handle; release it before post-processing.
    with open(path) as data_file:
        data = json.load(data_file)

    labels = data["column_descriptors"]

    authors = []
    features = []
    scores = []
    for author, record in data["author_data"].items():
        authors.append(author)
        features.append(np.asarray(record['feature_vecs']))
        scores.append(np.asarray([int(score_) for score_ in record['scores']]))

    return labels, authors, features, scores

In [2]:
# Load the dataset once, then show the first author's name and array shapes
# followed by the feature column names as a quick sanity check.
labels, authors, features, scores = load_json()
print(
    authors[0],
    features[0].shape,
    scores[0].shape,
    "\n",
)
print(labels)

tommy (5, 34) (5,) 

['number_of_modules', 'lines_of_code', 'lines_of_code_per_module', 'McCabes_cyclomatic_complexity', 'McCabes_cyclomatic_complexity_per_module', 'lines_of_comment', 'lines_of_comment_per_module', 'lines_of_code_per_line_of_comment', 'McCabes_cyclomatic_complexity_per_line_of_comment', 'IF4', 'IF4_per_module', 'IF4_visible', 'IF4_visible_per_module', 'IF4_concrete', 'IF4_concrete', 'rejected_lines_of_code\n', 'Files', 'Lines', 'AVG Len', 'Code', 'Comments', 'White SP', 'Cd/Cm+WS', 'Cd/Cm', 'Cd/WS', '% Code', 'cnt_classes', 'max_member_funs', 'max_nested_loops', 'max_nesting_depth', 'max_params_in_decl', 'member_funs', 'member_vars', 'min_member_funs\n']


In [3]:
def create_labels(y):
    """Bucket raw scores into five ordinal class labels.

    Mapping (applied element-wise):
        score >= 80        -> 5
        60 <= score < 80   -> 4
        40 <= score < 60   -> 3
        20 <= score < 40   -> 2
        score < 20         -> 1

    Parameters
    ----------
    y : array-like
        Vector of raw scores (not labels). Lists are accepted and converted
        with numpy.asarray.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``np.asarray(y)``
        holding the class labels. The input is never modified. Elements that
        satisfy none of the comparisons (e.g. NaN) are left unchanged.
    """
    y = np.asarray(y)
    y_l = np.copy(y)
    # Every mask is computed from the untouched input `y`, never from `y_l`,
    # so a freshly written label (1-5) can not be re-bucketed by a later line.
    y_l[y >= 80] = 5
    y_l[(y >= 60) & (y < 80)] = 4
    y_l[(y >= 40) & (y < 60)] = 3
    y_l[(y >= 20) & (y < 40)] = 2
    y_l[y < 20] = 1
    return y_l

In [None]:
# training and test split
import random
def createTrainTestSplit(seed=None):
    """Randomly split the loaded authors into train and test sets.

    The split is made per author, so all samples of one author land on the
    same side. ``int(0.8 * len(authors)) + 1`` authors go to the training
    set and the rest to the test set. Reads the notebook-level ``authors``,
    ``features`` and ``scores`` lists, and converts the concatenated scores
    to class labels with ``create_labels``.

    Parameters
    ----------
    seed : optional
        When given, a private ``random.Random(seed)`` draws the split, so
        the same seed reproduces the same split. When None (default) the
        module-level ``random`` generator is used, as before.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Stacked feature rows and their class labels for each side. The test
        arrays are empty when every author ends up in the training set.

    Raises
    ------
    ValueError
        If no authors are loaded.
    """
    n_authors = len(authors)
    if n_authors == 0:
        raise ValueError("cannot create a train/test split: no authors loaded")

    cnt_train = int(0.8 * n_authors) + 1

    # random.sample draws cnt_train distinct indices in one call, instead of
    # retrying randint until a set happens to reach the required size.
    rng = random if seed is None else random.Random(seed)
    train_authors_indices = set(rng.sample(range(n_authors), cnt_train))

    X_train_list = []
    y_train_list = []
    X_test_list = []
    y_test_list = []

    # Walk authors in index order so row order is deterministic for a given
    # choice of training indices.
    for index in range(n_authors):
        if index in train_authors_indices:
            X_train_list.extend(features[index])
            y_train_list.extend(scores[index])
        else:
            X_test_list.extend(features[index])
            y_test_list.extend(scores[index])

    X_train = np.asarray(X_train_list)
    X_test = np.asarray(X_test_list)
    y_train = create_labels(np.asarray(y_train_list))
    y_test = create_labels(np.asarray(y_test_list))

    return X_train, y_train, X_test, y_test

In [None]:
# Draw one random split and print the shape of each resulting array.
X_train, y_train, X_test, y_test = createTrainTestSplit()
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with library defaults; prints test accuracy.
clf = LogisticRegression().fit(X_train, y_train)
print(clf.score(X_test, y_test))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with library defaults; prints test accuracy.
abc = AdaBoostClassifier().fit(X_train, y_train)
print(abc.score(X_test, y_test))


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with library defaults; prints test accuracy.
gbc = GradientBoostingClassifier().fit(X_train, y_train)
print(gbc.score(X_test, y_test))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble of 30 estimators; prints test accuracy.
etc = ExtraTreesClassifier(n_estimators=30).fit(X_train, y_train)
print(etc.score(X_test, y_test))

In [None]:
from sklearn import tree
# Single decision tree with library defaults; `dtc` is dumped as
# pseudo-code by a later cell. Prints test accuracy.
dtc = tree.DecisionTreeClassifier().fit(X_train, y_train)
print(dtc.score(X_test, y_test))

In [None]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    """Print a decision tree as nested if/else pseudo-code.

    Parameters
    ----------
    tree
        Object exposing a ``tree_`` attribute with ``feature``,
        ``threshold``, ``children_left``, ``children_right`` and ``value``
        arrays indexed by node id (the notebook passes a fitted
        DecisionTreeClassifier).
    feature_names
        Sequence indexed by feature number; used both for the printed
        signature line and for the split conditions.

    Output goes to stdout; nothing is returned.
    """
    fitted = tree.tree_

    # One display name per node. Nodes whose feature is TREE_UNDEFINED do not
    # split on anything and get a placeholder.
    node_names = []
    for feature_index in fitted.feature:
        if feature_index != _tree.TREE_UNDEFINED:
            node_names.append(feature_names[feature_index])
        else:
            node_names.append("undefined!")

    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        pad = "  " * depth

        # Non-splitting node: emit its stored value and stop descending.
        if fitted.feature[node] == _tree.TREE_UNDEFINED:
            print("{}return {}".format(pad, fitted.value[node]))
            return

        split_name = node_names[node]
        cutoff = fitted.threshold[node]
        print("{}if {} <= {}:".format(pad, split_name, cutoff))
        recurse(fitted.children_left[node], depth + 1)
        print("{}else:  # if {} > {}".format(pad, split_name, cutoff))
        recurse(fitted.children_right[node], depth + 1)

    # Start at node 0 with one level of indentation, under the printed "def" line.
    recurse(0, 1)

In [None]:
# Dump the fitted decision tree as pseudo-code, using the dataset's column names.
tree_to_code(tree=dtc, feature_names=labels)

In [None]:
from sklearn.neural_network import MLPClassifier

# Hyper-parameter sweep for the MLP classifier: every combination of alpha,
# solver and learning-rate schedule is scored on 5 fresh random splits.
alphas = [0.0001, 0.001, 0.01, 0.1]
solvers = ["adam", "lbfgs", "sgd"]
learning_rates = ["constant", "adaptive", "invscaling"]

for alpha in alphas:
    print("Alpha:\t", alpha)
    for solver in solvers:
        printf("Solver:\t%s\n", solver)
        for learning_rate in learning_rates:
            for i in range(0, 5):
                # New random author split for every run.
                X_train, y_train, X_test, y_test = createTrainTestSplit()
                printf("Train/test split number %d\n", i)
                printf("\tLearning rate:\t%s\n", learning_rate)
                mlpC = MLPClassifier(
                    # The solver was previously looped over and printed but
                    # never handed to the classifier, so every run silently
                    # used the library default.
                    solver=solver,
                    alpha=alpha,
                    batch_size='auto',
                    # NOTE: per the scikit-learn docs, learning_rate and
                    # power_t only take effect with solver='sgd'.
                    learning_rate=learning_rate,
                    learning_rate_init=0.01,
                    power_t=0.5,
                    shuffle=True,
                    max_iter=500,
                )
                mlpC.fit(X_train, y_train)
                printf("\tMLPC Score:\t%f\n\n", mlpC.score(X_test, y_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest sweep: for each split criterion and forest size, score the
# model on 5 fresh random splits and summarise the 5 accuracies.
criterions = ["gini", "entropy"]
estimators_size = [50, 100, 150, 200]

for criterion in criterions:
    for n_estimators in estimators_size:
        printf("RFC\t%d estimators\t%s criterion\n", n_estimators, criterion)
        values = []
        for i in range(5):
            X_train, y_train, X_test, y_test = createTrainTestSplit()
            rfc = RandomForestClassifier(
                criterion=criterion,
                max_features='log2',
                n_estimators=n_estimators,
            ).fit(X_train, y_train)
            values.append(rfc.score(X_test, y_test))

        # After sorting, index 2 is the middle of the 5 scores and the two
        # ends are the extremes.
        values.sort()
        print("Median:", values[2])
        print("Average:", sum(values) / 5)
        print("Min:", values[0])
        print("Max:", values[-1])