In [1]:
import json
import numpy as np
def load_json():
    
    authors = []
    features = []
    scores = []
    
    with open('dataset.json') as data_file:
        data = json.load(data_file)
        labels = data["column_descriptors"]
        author_data = data["author_data"]
        
        for author in author_data.keys():
            authors.append(author)
            features.append(author_data[author]['feature_vecs'])
            scores_str = author_data[author]['scores']
            scores.append(np.asarray([int(score_) for score_ in scores_str]))
    
    # features to numpy array
    features = [np.asarray(feature) for feature in features]
    
    return labels, authors, features, scores

In [2]:
labels, authors, features, scores = load_json()
print (authors[0], features[0].shape, scores[0].shape)
print (labels)

(u'bodmas', (3, 34), (3,))
[u'number_of_modules', u'lines_of_code', u'lines_of_code_per_module', u'McCabes_cyclomatic_complexity', u'McCabes_cyclomatic_complexity_per_module', u'lines_of_comment', u'lines_of_comment_per_module', u'lines_of_code_per_line_of_comment', u'McCabes_cyclomatic_complexity_per_line_of_comment', u'IF4', u'IF4_per_module', u'IF4_visible', u'IF4_visible_per_module', u'IF4_concrete', u'IF4_concrete', u'rejected_lines_of_code\n', u'Files', u'Lines', u'AVG Len', u'Code', u'Comments', u'White SP', u'Cd/Cm+WS', u'Cd/Cm', u'Cd/WS', u'% Code', u'cnt_classes', u'max_member_funs', u'max_nested_loops', u'max_nesting_depth', u'max_params_in_decl', u'member_funs', u'member_vars', u'min_member_funs\n']


In [3]:
# training and test split
import random
cnt_train = int(0.8 * len(authors)) + 1

print (cnt_train)

train_authors_indices = set()

while len(train_authors_indices) < cnt_train:
    train_authors_indices.add(random.randint(0, len(authors) - 1))
    
test_authors_indices = set()

for i in range(len(authors)):
    if i not in train_authors_indices:
        test_authors_indices.add(i)

train_authors = [authors[i] for i in train_authors_indices]
        
test_authors = [authors[i] for i in test_authors_indices]

print(len(train_authors))
print(len(test_authors))

1458
1458
364


In [4]:
X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []

for index in train_authors_indices:
    X_train_list.extend(features[index])
    y_train_list.extend(scores[index])
    
for index in test_authors_indices:
    X_test_list.extend(features[index])
    y_test_list.extend(scores[index])

X_train = np.asarray(X_train_list)
y_train = np.asarray(y_train_list)
X_test = np.asarray(X_test_list)
y_test = np.asarray(y_test_list)

print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((15206, 34), (15206,), (3461, 34), (3461,))


In [5]:
# y is vector of labels
def create_labels(y):
    y_l = np.copy(y)
    for i in range(y.shape[0]):
        if y[i] >= 80: 
            y_l[i] = 5
        if y[i] >= 60 and y[i] < 80: 
            y_l[i] = 4
        if y[i] >= 40 and y[i] < 60:
            y_l[i] = 3
        if y[i] >= 20 and y[i] < 40:
            y_l[i] = 2
        if y[i] < 20: 
            y_l[i] = 1
    return y_l

In [6]:
 # creating score labels for classifying
    
y_train = create_labels(y_train) 
y_test = create_labels(y_test)

In [None]:
from sklearn.neural_network import MLPClassifier

alphas = [0.0001, 0.001, 0.01, 0.1]
solvers = ["adam", "lbfgs", "sgd"]
learning_rates = ["constant", "adaptive", "invscaling"]

for alpha in alphas:
    print ("Alpha:\t", alpha)
    for solver in solvers:
        print ("\tSolver:\t", solver)
        for learning_rate in learning_rates:
            print ("\t\tLearning rate:\t", learning_rate, end="\t")
            mlpC = MLPClassifier(alpha=alpha, batch_size='auto', learning_rate=learning_rate, learning_rate_init=0.01, power_t=0.5, shuffle=True, max_iter=500)
            mlpC.fit(X_train, y_train)
            print ("MLPC Score:\t", mlpC.score(X_test, y_test))



In [19]:
from sklearn.ensemble import RandomForestClassifier

criterions = [ "gini", "entropy"]
estimators_size = [50, 100, 150, 200]

import sys
def printf(format, *args):
    sys.stdout.write(format % args)

for criterion in criterions:
    for n_estimators in estimators_size:
        printf("RFC\t%d estimators\t%s criterion\n", n_estimators, criterion)
        rfc = RandomForestClassifier(n_estimators=n_estimators, max_features='log2', criterion=criterion)
        rfc.fit(X_train, y_train)
        printf("Score:\t%f\n\n",rfc.score(X_test, y_test))

RFC	50 estimators	gini criterion
Score:	0.692863

RFC	100 estimators	gini criterion
Score:	0.697775

RFC	150 estimators	gini criterion
Score:	0.694019

RFC	200 estimators	gini criterion
Score:	0.694886

RFC	50 estimators	entropy criterion
Score:	0.693730

RFC	100 estimators	entropy criterion
Score:	0.700376

RFC	150 estimators	entropy criterion
Score:	0.698931

RFC	200 estimators	entropy criterion
Score:	0.699798



In [20]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

print (clf.score(X_test, y_test))

0.256284310893


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print (gbc.score(X_test, y_test))

0.365501300202


In [22]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
print (abc.score(X_test, y_test))

0.292112106328


In [23]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=30)
etc.fit(X_train, y_train)
print (etc.score(X_test, y_test))

0.700375613984


In [24]:
from sklearn import tree
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train)
print (dtc.score(X_test, y_test))

0.684195319272


In [25]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print ("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

In [26]:
tree_to_code(dtc, labels)

def tree(number_of_modules, lines_of_code, lines_of_code_per_module, McCabes_cyclomatic_complexity, McCabes_cyclomatic_complexity_per_module, lines_of_comment, lines_of_comment_per_module, lines_of_code_per_line_of_comment, McCabes_cyclomatic_complexity_per_line_of_comment, IF4, IF4_per_module, IF4_visible, IF4_visible_per_module, IF4_concrete, IF4_concrete, rejected_lines_of_code
, Files, Lines, AVG Len, Code, Comments, White SP, Cd/Cm+WS, Cd/Cm, Cd/WS, % Code, cnt_classes, max_member_funs, max_nested_loops, max_nesting_depth, max_params_in_decl, member_funs, member_vars, min_member_funs
):
  if rejected_lines_of_code
 <= 1.5:
    if Comments <= 2.5:
      if % Code <= 0.944999992847:
        if lines_of_code_per_module <= 27.75:
          if lines_of_code_per_module <= 22.75:
            if max_params_in_decl <= 1.5:
              if lines_of_code <= 40.5:
                if McCabes_cyclomatic_complexity <= 6.5:
                  if Lines <= 19.5:
                    if lines_of_code

                                      return [[ 1.  0.  0.  0.  0.]]
                                else:  # if Cd/Cm+WS > 9.41499996185
                                  if Cd/WS <= 18.75:
                                    return [[  0.  13.   0.   0.   0.]]
                                  else:  # if Cd/WS > 18.75
                                    return [[ 0.  0.  0.  0.  1.]]
                            else:  # if lines_of_code_per_module > 18.5
                              if AVG Len <= 15.5:
                                if McCabes_cyclomatic_complexity_per_module <= 3.5:
                                  return [[ 0.  0.  2.  0.  0.]]
                                else:  # if McCabes_cyclomatic_complexity_per_module > 3.5
                                  if Cd/WS <= 11.25:
                                    return [[ 0.  0.  0.  0.  6.]]
                                  else:  # if Cd/WS > 11.25
                                    if McCabes_cyclomatic_complexity

                                      if % Code <= 0.90499997139:
                                        if AVG Len <= 14.0:
                                          return [[ 0.  0.  1.  0.  0.]]
                                        else:  # if AVG Len > 14.0
                                          return [[ 0.  7.  0.  0.  0.]]
                                      else:  # if % Code > 0.90499997139
                                        return [[ 0.  0.  0.  6.  0.]]
                                  else:  # if lines_of_code_per_line_of_comment > 34.0
                                    return [[ 0.  0.  6.  0.  0.]]
                                else:  # if AVG Len > 20.5
                                  return [[ 0.  6.  0.  0.  0.]]
                            else:  # if McCabes_cyclomatic_complexity_per_module > 10.5
                              if max_nested_loops <= 2.5:
                                if Cd/Cm+WS <= 6.75:
                                  return

                                                    else:  # if Cd/Cm+WS > 10.1000003815
                                                      return [[ 1.  0.  0.  0.  0.]]
                                                  else:  # if max_nested_loops > 1.5
                                                    if max_params_in_decl <= 2.5:
                                                      return [[  0.   0.   0.  24.   0.]]
                                                    else:  # if max_params_in_decl > 2.5
                                                      return [[ 0.  0.  1.  0.  0.]]
                                            else:  # if Code > 53.5
                                              if lines_of_code_per_module <= 64.5:
                                                if Cd/WS <= 6.94499969482:
                                                  if McCabes_cyclomatic_complexity <= 14.5:
                                                    return [[ 0.  0.  0.  8. 

                          return [[ 2.  0.  0.  0.  0.]]
                        else:  # if rejected_lines_of_code
 > 0.5
                          return [[ 0.  1.  0.  0.  0.]]
                      else:  # if AVG Len > 18.0
                        return [[  0.   0.  11.   0.   0.]]
                    else:  # if Code > 27.5
                      if Cd/WS <= 45.5:
                        if Lines <= 44.5:
                          if lines_of_comment_per_module <= 0.5:
                            if AVG Len <= 20.5:
                              if AVG Len <= 16.5:
                                if McCabes_cyclomatic_complexity_per_module <= 9.5:
                                  if Cd/WS <= 8.75:
                                    if AVG Len <= 12.5:
                                      return [[ 0.  0.  4.  0.  0.]]
                                    else:  # if AVG Len > 12.5
                                      if McCabes_cyclomatic_complexity <= 7.5:
                   

                            else:  # if max_nesting_depth > 2.5
                              return [[  0.   0.  12.   0.   0.]]
                          else:  # if Cd/WS > 33.75
                            return [[ 0.  0.  0.  0.  2.]]
                        else:  # if max_nested_loops > 2.5
                          return [[ 0.  0.  0.  2.  0.]]
                    else:  # if Code > 82.5
                      return [[ 4.  0.  0.  0.  0.]]
                  else:  # if % Code > 0.985000014305
                    if McCabes_cyclomatic_complexity <= 8.0:
                      return [[ 0.  0.  0.  2.  0.]]
                    else:  # if McCabes_cyclomatic_complexity > 8.0
                      if AVG Len <= 30.0:
                        if McCabes_cyclomatic_complexity_per_module <= 13.5:
                          return [[  0.   0.  24.   0.   0.]]
                        else:  # if McCabes_cyclomatic_complexity_per_module > 13.5
                          if Cd/WS <= 76.0:
 

                          return [[ 3.  0.  0.  0.  0.]]
                    else:  # if Code > 46.0
                      if McCabes_cyclomatic_complexity_per_module <= 9.25:
                        if lines_of_code <= 42.5:
                          return [[ 0.  2.  0.  0.  0.]]
                        else:  # if lines_of_code > 42.5
                          if McCabes_cyclomatic_complexity_per_line_of_comment <= 2.20000004768:
                            if lines_of_code_per_module <= 46.0:
                              if Cd/Cm <= 12.875:
                                return [[ 4.  0.  0.  0.  0.]]
                              else:  # if Cd/Cm > 12.875
                                return [[ 0.  0.  1.  0.  0.]]
                            else:  # if lines_of_code_per_module > 46.0
                              return [[ 0.  0.  1.  0.  0.]]
                          else:  # if McCabes_cyclomatic_complexity_per_line_of_comment > 2.20000004768
                            

                              else:  # if McCabes_cyclomatic_complexity > 10.0
                                return [[ 4.  0.  0.  0.  0.]]
                            else:  # if Lines > 96.5
                              if Cd/WS <= 6.06999969482:
                                if lines_of_code_per_module <= 86.5:
                                  if McCabes_cyclomatic_complexity <= 9.0:
                                    if lines_of_comment <= 19.5:
                                      return [[ 0.  0.  2.  0.  0.]]
                                    else:  # if lines_of_comment > 19.5
                                      return [[ 1.  0.  0.  0.  0.]]
                                  else:  # if McCabes_cyclomatic_complexity > 9.0
                                    return [[  0.   0.  15.   0.   0.]]
                                else:  # if lines_of_code_per_module > 86.5
                                  if Lines <= 482.5:
                                    if McCabes

                                return [[ 0.  1.  0.  0.  0.]]
                          else:  # if AVG Len > 18.5
                            if lines_of_code_per_module <= 55.5:
                              return [[ 0.  8.  0.  0.  0.]]
                            else:  # if lines_of_code_per_module > 55.5
                              if max_nested_loops <= 5.5:
                                if Code <= 59.5:
                                  return [[ 0.  0.  0.  2.  0.]]
                                else:  # if Code > 59.5
                                  return [[ 0.  0.  3.  0.  0.]]
                              else:  # if max_nested_loops > 5.5
                                return [[ 0.  4.  0.  0.  0.]]
                        else:  # if Code > 67.5
                          if McCabes_cyclomatic_complexity <= 17.5:
                            if lines_of_code <= 53.5:
                              return [[ 0.  0.  0.  2.  0.]]
                            else: 

                                            return [[  0.   0.   0.   0.  16.]]
                                          else:  # if White SP > 14.5
                                            if White SP <= 16.5:
                                              if max_params_in_decl <= 2.0:
                                                return [[ 0.  0.  2.  0.  0.]]
                                              else:  # if max_params_in_decl > 2.0
                                                return [[ 1.  0.  0.  0.  0.]]
                                            else:  # if White SP > 16.5
                                              return [[ 0.  0.  0.  2.  0.]]
                                  else:  # if AVG Len > 16.5
                                    if lines_of_code <= 56.5:
                                      if McCabes_cyclomatic_complexity <= 11.5:
                                        if Code <= 72.5:
                                          if Cd/WS <= 8.715

                                                return [[ 1.  0.  0.  0.  0.]]
                                              else:  # if Cd/WS > 5.46500015259
                                                return [[ 0.  0.  0.  2.  0.]]
                                            else:  # if McCabes_cyclomatic_complexity > 6.5
                                              return [[  0.   0.   0.  25.   0.]]
                                        else:  # if lines_of_code_per_module > 42.5
                                          if Cd/Cm+WS <= 2.64499998093:
                                            return [[ 0.  0.  2.  0.  0.]]
                                          else:  # if Cd/Cm+WS > 2.64499998093
                                            if McCabes_cyclomatic_complexity_per_module <= 8.5:
                                              return [[ 0.  0.  2.  0.  0.]]
                                            else:  # if McCabes_cyclomatic_complexity_per_module > 8.5
  

                                return [[ 0.  0.  0.  0.  4.]]
                          else:  # if White SP > 5.5
                            if lines_of_code <= 36.0:
                              if McCabes_cyclomatic_complexity <= 12.5:
                                return [[ 0.  1.  0.  0.  0.]]
                              else:  # if McCabes_cyclomatic_complexity > 12.5
                                return [[ 0.  0.  0.  0.  2.]]
                            else:  # if lines_of_code > 36.0
                              return [[  0.   0.   0.  13.   0.]]
                      else:  # if Lines > 51.5
                        if AVG Len <= 12.5:
                          if max_params_in_decl <= 1.5:
                            if McCabes_cyclomatic_complexity_per_module <= 11.5:
                              return [[ 1.  0.  0.  0.  0.]]
                            else:  # if McCabes_cyclomatic_complexity_per_module > 11.5
                              return [[ 0.  0.  0

                        else:  # if AVG Len > 13.5
                          if Cd/WS <= 5.875:
                            if max_nesting_depth <= 3.5:
                              if Lines <= 76.0:
                                return [[ 1.  0.  0.  0.  0.]]
                              else:  # if Lines > 76.0
                                return [[ 0.  0.  0.  3.  0.]]
                            else:  # if max_nesting_depth > 3.5
                              return [[ 5.  0.  0.  0.  0.]]
                          else:  # if Cd/WS > 5.875
                            if Code <= 61.5:
                              if Cd/WS <= 8.5:
                                if % Code <= 0.810000002384:
                                  return [[ 0.  0.  2.  0.  0.]]
                                else:  # if % Code > 0.810000002384
                                  return [[ 0.  0.  0.  2.  0.]]
                              else:  # if Cd/WS > 8.5
                                retu

                        if Cd/Cm+WS <= 16.6450004578:
                          if White SP <= 7.5:
                            if rejected_lines_of_code
 <= 6.5:
                              if lines_of_code_per_module <= 101.5:
                                return [[ 1.  0.  0.  0.  0.]]
                              else:  # if lines_of_code_per_module > 101.5
                                return [[  0.  19.   0.   0.   0.]]
                            else:  # if rejected_lines_of_code
 > 6.5
                              return [[ 0.  0.  0.  1.  0.]]
                          else:  # if White SP > 7.5
                            if Lines <= 135.5:
                              if Cd/WS <= 10.9099998474:
                                if lines_of_code_per_module <= 111.5:
                                  return [[ 0.  0.  0.  4.  0.]]
                                else:  # if lines_of_code_per_module > 111.5
                                  return [[ 0.  0.  2.  0.  0.]

                            else:  # if McCabes_cyclomatic_complexity_per_line_of_comment > 1.0
                              return [[ 0.  0.  3.  0.  0.]]
                          else:  # if lines_of_code_per_line_of_comment > 14.8334999084
                            return [[ 0.  5.  0.  0.  0.]]
                        else:  # if % Code > 0.824999988079
                          if max_nesting_depth <= 4.5:
                            if AVG Len <= 16.0:
                              return [[ 0.  0.  1.  0.  0.]]
                            else:  # if AVG Len > 16.0
                              return [[  0.   0.   0.  30.   0.]]
                          else:  # if max_nesting_depth > 4.5
                            return [[ 0.  2.  0.  0.  0.]]
                    else:  # if Cd/Cm+WS > 5.23999977112
                      if lines_of_code_per_module <= 44.5:
                        if Code <= 35.5:
                          return [[ 0.  0.  0.  8.  0.]]
                

                                            if Lines <= 45.5:
                                              if Lines <= 42.5:
                                                if rejected_lines_of_code
 <= 2.5:
                                                  return [[ 0.  2.  0.  0.  0.]]
                                                else:  # if rejected_lines_of_code
 > 2.5
                                                  return [[ 0.  0.  0.  2.  0.]]
                                              else:  # if Lines > 42.5
                                                return [[  0.   0.   0.   0.  11.]]
                                            else:  # if Lines > 45.5
                                              if Lines <= 47.0:
                                                return [[ 4.  0.  0.  0.  0.]]
                                              else:  # if Lines > 47.0
                                                if Lines <= 59.5:
                                   

                                                                              if lines_of_code_per_module <= 92.5:
                                                                                return [[ 0.  0.  7.  0.  0.]]
                                                                              else:  # if lines_of_code_per_module > 92.5
                                                                                return [[ 0.  1.  0.  0.  0.]]
                                                                      else:  # if Cd/WS > 21.4150009155
                                                                        return [[ 0.  0.  0.  2.  0.]]
                                                                  else:  # if Code > 105.0
                                                                    if Code <= 107.0:
                                                                      return [[ 0.  0.  0.  0.  2.]]
                                                        

                                      return [[ 0.  0.  0.  6.  0.]]
                              else:  # if Lines > 162.0
                                if Cd/Cm+WS <= 3.86000013351:
                                  return [[ 0.  0.  8.  0.  0.]]
                                else:  # if Cd/Cm+WS > 3.86000013351
                                  return [[  0.   0.   0.  13.   0.]]
                            else:  # if Lines > 171.5
                              if White SP <= 63.5:
                                if lines_of_code <= 114.5:
                                  return [[ 0.  0.  2.  0.  0.]]
                                else:  # if lines_of_code > 114.5
                                  return [[  0.  20.   0.   0.   0.]]
                              else:  # if White SP > 63.5
                                if McCabes_cyclomatic_complexity_per_module <= 20.2145004272:
                                  return [[ 1.  0.  0.  0.  0.]]
                           

In [27]:
from sklearn import preprocessing
from sklearn.svm import SVC

X_train = preprocessing.scale(X_train) 
X_test = preprocessing.scale(X_test)

print ("Starting grid search...")

kernels = ['linear', 'rbf']
Cs = [1, 10, 100, 1000]

"""
for kernel in kernels:
    for C in Cs:
        print ("SVC:\t", kernel, " kernel\t| ", C, " C\t", end="\t")
        svc = SVC(C=C, kernel=kernel)
        svc.fit(X_train, y_train)
        print ("Score: ", svc.score(X_test, y_test))
        print ("\n")
"""

Starting grid search...


'\nfor kernel in kernels:\n    for C in Cs:\n        print ("SVC:\t", kernel, " kernel\t| ", C, " C\t", end="\t")\n        svc = SVC(C=C, kernel=kernel)\n        svc.fit(X_train, y_train)\n        print ("Score: ", svc.score(X_test, y_test))\n        print ("\n")\n'