In [1]:
import json
import numpy as np
def load_json():
    
    authors = []
    features = []
    scores = []
    
    with open('dataset.json') as data_file:
        data = json.load(data_file)
        labels = data["column_descriptors"]
        author_data = data["author_data"]
        
        for author in author_data.keys():
            authors.append(author)
            features.append(author_data[author]['feature_vecs'])
            scores_str = author_data[author]['scores']
            scores.append(np.asarray([int(score_) for score_ in scores_str]))
    
    # features to numpy array
    features = [np.asarray(feature) for feature in features]
    
    return labels, authors, features, scores

In [2]:
labels, authors, features, scores = load_json()
print (authors[0], features[0].shape, scores[0].shape)
print (labels)

webNeat (6, 26) (6,)
['number_of_modules', 'lines_of_code', 'lines_of_code_per_module', 'McCabes_cyclomatic_complexity', 'McCabes_cyclomatic_complexity_per_module', 'lines_of_comment', 'lines_of_comment_per_module', 'lines_of_code_per_line_of_comment', 'McCabes_cyclomatic_complexity_per_line_of_comment', 'IF4', 'IF4_per_module', 'IF4_visible', 'IF4_visible_per_module', 'IF4_concrete', 'IF4_concrete', 'rejected_lines_of_code\n', 'Files', 'Lines', 'AVG Len', 'Code', 'Comments', 'White SP', 'Cd/Cm+WS', 'Cd/Cm', 'Cd/WS', '% Code']


In [3]:
# training and test split
import random
cnt_train = int(0.7 * len(authors)) + 1

print (cnt_train)

train_authors_indices = set()

while len(train_authors_indices) < cnt_train:
    train_authors_indices.add(random.randint(0, len(authors) - 1))
    
test_authors_indices = set()

for i in range(len(authors)):
    if i not in train_authors_indices:
        test_authors_indices.add(i)

train_authors = [authors[i] for i in train_authors_indices]
        
test_authors = [authors[i] for i in test_authors_indices]

print(len(train_authors))
print(len(test_authors))

1095
1095
469


In [4]:
X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []

for index in train_authors_indices:
    X_train_list.extend(features[index])
    y_train_list.extend(scores[index])
    
for index in test_authors_indices:
    X_test_list.extend(features[index])
    y_test_list.extend(scores[index])

X_train = np.asarray(X_train_list)
y_train = np.asarray(y_train_list)
X_test = np.asarray(X_test_list)
y_test = np.asarray(y_test_list)

print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(12139, 26) (12139,) (5205, 26) (5205,)


In [5]:
# y is vector of labels
def create_labels(y):
    y_l = np.copy(y)
    for i in range(y.shape[0]):
        if y[i] >= 80: 
            y_l[i] = 5
        if y[i] >= 60 and y[i] < 80: 
            y_l[i] = 4
        if y[i] >= 40 and y[i] < 60:
            y_l[i] = 3
        if y[i] >= 20 and y[i] < 40:
            y_l[i] = 2
        if y[i] < 20: 
            y_l[i] = 1
    return y_l

In [6]:
 # creating score labels for classifying
    
y_train = create_labels(y_train) 
y_test = create_labels(y_test)

In [7]:
from sklearn.neural_network import MLPClassifier

alphas = [0.0001, 0.001, 0.01, 0.1]
solvers = ["adam", "lbfgs", "sgd"]
learning_rates = ["constant", "adaptive", "invscaling"]

for alpha in alphas:
    print ("Alpha:\t", alpha)
    for solver in solvers:
        print ("\tSolver:\t", solver)
        for learning_rate in learning_rates:
            print ("\t\tLearning rate:\t", learning_rate, end="\t")
            mlpC = MLPClassifier(alpha=alpha, batch_size='auto', learning_rate=learning_rate, learning_rate_init=0.01, power_t=0.5, shuffle=True, max_iter=500)
            mlpC.fit(X_train, y_train)
            print ("MLPC Score:\t", mlpC.score(X_test, y_test))



Alpha:	 0.0001
	Solver:	 adam
		Learning rate:	 constant	MLPC Score:	 0.288376560999
		Learning rate:	 adaptive	MLPC Score:	 0.279538904899
		Learning rate:	 invscaling	MLPC Score:	 0.296829971182
	Solver:	 lbfgs
		Learning rate:	 constant	MLPC Score:	 0.276080691643
		Learning rate:	 adaptive	MLPC Score:	 0.294524495677
		Learning rate:	 invscaling	MLPC Score:	 0.261671469741
	Solver:	 sgd
		Learning rate:	 constant	MLPC Score:	 0.274543707973
		Learning rate:	 adaptive	MLPC Score:	 0.270701248799
		Learning rate:	 invscaling	MLPC Score:	 0.286647454371
Alpha:	 0.001
	Solver:	 adam
		Learning rate:	 constant	MLPC Score:	 0.284726224784
		Learning rate:	 adaptive	MLPC Score:	 0.283189241114
		Learning rate:	 invscaling	MLPC Score:	 0.268203650336
	Solver:	 lbfgs
		Learning rate:	 constant	MLPC Score:	 0.282804995197
		Learning rate:	 adaptive	MLPC Score:	 0.272430355427
		Learning rate:	 invscaling	MLPC Score:	 0.281652257445
	Solver:	 sgd
		Learning rate:	 constant	MLPC Score:	 0.2568

In [8]:
from sklearn.ensemble import RandomForestClassifier

criterions = [ "gini", "entropy"]
estimators_size = [50, 100, 150, 200]

for criterion in criterions:
    for n_estimators in estimators_size:
        print ("RFC:\t", n_estimators, "estimators\t| ", criterion, " criterion", end='\t')
        rfc = RandomForestClassifier(n_estimators=n_estimators, max_features='log2', criterion=criterion)
        rfc.fit(X_train, y_train)
        print ("Score:\t",rfc.score(X_test, y_test))
        print ("\n")



RFC:	 50 estimators	|  gini  criterion	Score:	 0.609990393852


RFC:	 100 estimators	|  gini  criterion	Score:	 0.62055715658


RFC:	 150 estimators	|  gini  criterion	Score:	 0.608261287224


RFC:	 200 estimators	|  gini  criterion	Score:	 0.611143131604


RFC:	 50 estimators	|  entropy  criterion	Score:	 0.612680115274


RFC:	 100 estimators	|  entropy  criterion	Score:	 0.616330451489


RFC:	 150 estimators	|  entropy  criterion	Score:	 0.614409221902


RFC:	 200 estimators	|  entropy  criterion	Score:	 0.614793467819




In [None]:
from sklearn import preprocessing
from sklearn.svm import SVC

X_train = preprocessing.scale(X_train) 
X_test = preprocessing.scale(X_test)

print ("Starting grid search...")

kernels = ['linear', 'rbf']
Cs = [1, 10, 100, 1000]

for kernel in kernels:
    for C in Cs:
        print ("SVC:\t", kernel, " kernel\t| ", C, " C\t", end="\t")
        svc = SVC(C=C, kernel=kernel)
        svc.fit(X_train, y_train)
        print ("Score: ", svc.score(X_test, y_test))
        print ("\n")

Starting grid search...
SVC:	 linear  kernel	|  1  C		Score:  0.265129682997


SVC:	 linear  kernel	|  10  C		

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

print (clf.score(X_test, y_test))

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print (gbc.score(X_test, y_test))

0.355916558981


In [12]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
print (abc.score(X_test, y_test))

0.288351486062


In [13]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=30)
etc.fit(X_train, y_train)
print (etc.score(X_test, y_test))

0.604762783829


In [15]:
print X.shape, y.reshape((y.shape[0], 1)).shape

SyntaxError: invalid syntax (<ipython-input-15-ed0f812466c7>, line 1)

In [None]:
#dataset = X#np.concatenate((X, y.T), axis=1)

dataset = np.concatenate((X,y[:,None]),axis=1)
#print X.shape[0] == y.shape[0]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame(dataset)
#sns.pairplot(df)
#plt.show()