In [17]:
import pandas as pd

# Load the SiamX sample file; the first CSV column becomes the index.
df_siamx = pd.read_csv(
    'data/up/SiamX_sampled.csv',
    index_col=0,
)

In [22]:
# Draw exactly as many rows from the original training set as there are rows
# in df_siamx. A fixed random_state makes the draw (and therefore the trained
# model and all downstream metrics) reproducible on Restart & Run All; without
# it every run trains on a different subset.
df_train = (
    pd.read_csv('data/train.csv', index_col=0)
    .sample(n=len(df_siamx.index), random_state=42)
)

In [23]:
# Stack the SiamX sample rows on top of the sampled training rows.
# Original index labels are kept (ignore_index is not set).
df = pd.concat(objs=[df_siamx, df_train], axis=0)

In [32]:
from helpers import similarity_generator

# Look up the SiamX algorithm object.
# NOTE(review): the meaning of the second positional flag is defined in
# helpers.similarity_generator and is not visible here — confirm.
siamx = similarity_generator.get_algorithm_by_name('SiamX', True)

# features() returns four values, unpacked in the order they are later passed
# to model.fit().
(comments1,
 comments2,
 word_counts,
 name_similarities) = siamx.features(df)

# Targets for training, as a NumPy array.
labels = df.loc[:, 'label'].to_numpy()

In [33]:
from keras.callbacks import EarlyStopping

# Stop training once the monitored quantity (Keras default) has failed to
# improve for 3 consecutive epochs.
# NOTE(review): restore_best_weights is left at its default, so the model
# presumably keeps the weights of the last epoch run rather than the best
# one — confirm this is intended.
es = EarlyStopping(patience=3)

# 10% of the samples are held out for validation; the History object returned
# by fit() is the cell's displayed value.
siamx.model.fit(
    x=[comments1, comments2, word_counts, name_similarities],
    y=labels,
    epochs=siamx.epochs,
    validation_split=0.1,
    callbacks=[es],
)

Train on 783 samples, validate on 87 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.callbacks.callbacks.History at 0x7f477a6aac40>

In [35]:
# Persist the network that was just fitted.
siamx.model.save('cache/models/siamx2')

# Call train() as resolved on the parent of siamx's class, bypassing any
# override defined on the concrete class itself.
# NOTE(review): presumably this fits and stores the decision threshold under
# the name 'SiamX2' (the cell prints a threshold and train accuracy) — confirm
# against the helpers implementation.
parent_view = super(type(siamx), siamx)
parent_view.train(df, labels, True, 'SiamX2')

Threshold = 0.5851, train accuracy = 0.9897


In [36]:
from metrics import metrics

# Score the test set with the current `siamx` object and compare against the
# ground-truth labels stored in the same file.
df_test = pd.read_csv(
    'data/test.csv',
    index_col=0,
)
y_pred = siamx.run_similarity(df_test)
y_test = df_test.loc[:, 'label'].to_numpy()

# Last expression: the metrics dict is shown as the cell output.
metrics.get_metrics(siamx, y_test, y_pred)

{'accuracy': 0.9429347826086957,
 'f1': 0.9445910290237467,
 'precision': 0.9179487179487179,
 'roc': 0.9803579867674859,
 'tp': 358,
 'tn': 336,
 'fp': 32,
 'fn': 10}

In [37]:
# Re-create the SiamX algorithm object and score the same test file again for
# comparison. This rebinds `siamx`, so the object fitted earlier in the
# notebook is no longer reachable under that name.
siamx = similarity_generator.get_algorithm_by_name('SiamX', True)

df_test = pd.read_csv(
    'data/test.csv',
    index_col=0,
)
y_pred = siamx.run_similarity(df_test)
y_test = df_test.loc[:, 'label'].to_numpy()

# Last expression: the metrics dict is shown as the cell output.
metrics.get_metrics(siamx, y_test, y_pred)

{'accuracy': 0.9239130434782609,
 'f1': 0.9270833333333334,
 'precision': 0.89,
 'roc': 0.9700717745746692,
 'tp': 356,
 'tn': 324,
 'fp': 44,
 'fn': 12}

In [1]:
from tabulate import tabulate

# Metric values entered by hand (they match the evaluation outputs recorded
# elsewhere in this notebook); nothing is recomputed in this cell.
siamx_d = dict(
    accuracy=0.9239130434782609,
    f1=0.9270833333333334,
    precision=0.89,
    roc=0.9700717745746692,
    tp=356,
    tn=324,
    fp=44,
    fn=12,
)
siamx2_d = dict(
    accuracy=0.9429347826086957,
    f1=0.9445910290237467,
    precision=0.9179487179487179,
    roc=0.9803579867674859,
    tp=358,
    tn=336,
    fp=32,
    fn=10,
)

name = 'SiamX'
headers = ['Algorithm', 'Accuracy', 'F1', 'Precision', 'TP', 'TN', 'FP', 'FN']

# Keys in the same order as the metric columns in `headers`; 'roc' is present
# in the dicts but not shown in the table.
shown_keys = ('accuracy', 'f1', 'precision', 'tp', 'tn', 'fp', 'fn')

# One row per algorithm: display name followed by the selected metrics.
table = [
    (row_label,) + tuple(results[key] for key in shown_keys)
    for results, row_label in ((siamx_d, name), (siamx2_d, name + ' +'))
]
print(tabulate(table, headers, tablefmt='grid', floatfmt='.4f'))

+-------------+------------+--------+-------------+------+------+------+------+
| Algorithm   |   Accuracy |     F1 |   Precision |   TP |   TN |   FP |   FN |
| SiamX       |     0.9239 | 0.9271 |      0.8900 |  356 |  324 |   44 |   12 |
+-------------+------------+--------+-------------+------+------+------+------+
| SiamX +     |     0.9429 | 0.9446 |      0.9179 |  358 |  336 |   32 |   10 |
+-------------+------------+--------+-------------+------+------+------+------+


In [6]:
import pandas as pd
import numpy as np
from helpers import similarity_generator
from keras.models import load_model

# Unlabeled pairs to score. na_filter=False turns off missing-value detection,
# so empty fields are read as empty strings instead of NaN.
df = pd.read_csv('data/unlabeled/junit4.csv', index_col=0, na_filter=False)

name = 'SiamX2'
alg = similarity_generator.get_algorithm_by_name('SiamX', True)
# Replace the algorithm's network with the model saved at cache/models/siamx2.
alg.model = load_model('cache/models/siamx2')
# Call load() as resolved on the parent class. `name` is passed instead of
# repeating the literal so the loaded state and the output file below cannot
# drift apart.
super(type(alg), alg).load(name)

scores = alg.run_similarity(df)
predictions = alg.predict(scores)

# Keep only the pairs predicted as positive.
is_positive = predictions == 1
scores = scores[is_positive]

# drop() and sort_values() are used in their non-inplace form so df_up is an
# independent frame. Mutating a boolean-mask slice of `df` in place is the
# pattern that triggers pandas' SettingWithCopyWarning.
df_up = df[is_positive].drop(columns=['label', 'name1', 'name2'])
df_up.insert(4, 'score', scores)
df_up = df_up.sort_values(by='score', ascending=False)
df_up.to_csv('data/up/scores/' + name + '.csv')

print(name, len(df_up.index))

SiamX2 268


In [8]:
import pickle
import pandas as pd

# The unpickled collection is bound to `accepted` rather than `all`, which
# would shadow the builtin all() for every cell run afterwards in this kernel.
# NOTE: pickle.load can execute arbitrary code — only use it on trusted files.
with open('data/up/accepted/all.pkl', 'rb') as f:
    accepted = pickle.load(f)

df_siam = pd.read_csv('data/up/scores/SiamX2.csv', index_col=0)

# 1 if the row's index label is contained in the accepted collection, else 0.
labels = [int(row_id in accepted) for row_id in df_siam.index]
df_siam.insert(5, 'label', labels)
df_siam.to_csv('data/up/SiamX2_labeled.csv')

In [13]:
# Non-null value count per column, split by label value.
df_siam.groupby(by='label').count()

Unnamed: 0_level_0,comment1,comment2,meta1,meta2,score
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,107,107,107,107,107
1,161,161,161,161,161


In [1]:
import pandas as pd

# Not referenced again in the cells visible here.
df_dict = {}

names = ['LCS', 'COS', 'LEV', 'LSH', 'WMD', 'SiamX']

# Read every algorithm's score file, stack them, discard the per-algorithm
# score column and collapse rows that are identical in the remaining columns.
score_frames = [
    pd.read_csv('data/up/scores/' + name + '.csv', index_col=0)
    for name in names
]
df = pd.concat(score_frames)
df = df.drop(columns=['score'])
df = df.drop_duplicates()
ix = df.index.values

# Index labels of the rows listed in the SiamX2 score file.
df_siamx2 = pd.read_csv('data/up/scores/SiamX2.csv', index_col=0)
ix_siamx2 = df_siamx2.index.values

In [2]:
import numpy as np

# Index labels present in the SiamX2 results but in none of the other
# algorithms' results (returned sorted and de-duplicated).
unique_siamx2 = np.setdiff1d(ar1=ix_siamx2, ar2=ix)

In [4]:
# Reload the labeled SiamX2 results and display only the rows whose index
# labels are listed in unique_siamx2.
df_siam = pd.read_csv(
    'data/up/SiamX2_labeled.csv',
    index_col=0,
)
df_siam.loc[unique_siamx2, :]

Unnamed: 0,comment1,comment2,meta1,meta2,score,label
33657,The Theories runner allows to test a certain f...,Can not be instantiated outside org.junit.Comp...,"<!-- META {""entityType"": ""Class"", ""entitySigna...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.606557,0
47829,"Returns, efficiently, all the non-overridden m...",Returns the methods that run tests. Default im...,"<!-- META {""entityType"": ""Method"", ""entitySign...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.75329,0
78977,# Annotation for a method which provides param...,# Annotation for fields of the test class whic...,"<!-- META {""entityType"": ""Annotation"", ""entity...","<!-- META {""entityType"": ""Annotation"", ""entity...",0.734076,0
79428,# Method that returns the index of the paramet...,@return a ParametersRunnerFactory class (must ...,"<!-- META {""entityType"": ""Annotation"", ""entity...","<!-- META {""entityType"": ""Annotation"", ""entity...",0.60311,0
123585,Creates a Filter.\n @param filterFactoryFqcn T...,Creates a org.junit.experimental.categories.Ca...,"<!-- META {""entityType"": ""Method"", ""entitySign...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.632071,0
125319,The Test annotation tells JUnit that the publi...,A Test can be run and collect its results.\n @...,"<!-- META {""entityType"": ""Annotation"", ""entity...","<!-- META {""entityType"": ""Interface"", ""entityS...",0.606732,0
125475,The Test annotation tells JUnit that the publi...,...as the moon sets over the early morning Mer...,"<!-- META {""entityType"": ""Annotation"", ""entity...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.805489,0
125532,The Test annotation tells JUnit that the publi...,Can not be instantiated outside org.junit.Comp...,"<!-- META {""entityType"": ""Annotation"", ""entity...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.678461,0
132396,"Asserts that two arrays are equal, according t...",Asserts that two objects are not equals. If th...,"<!-- META {""entityType"": ""Method"", ""entitySign...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.986025,0
132398,"Asserts that two arrays are equal, according t...",Asserts that two longs are not equals. If they...,"<!-- META {""entityType"": ""Method"", ""entitySign...","<!-- META {""entityType"": ""Method"", ""entitySign...",0.781803,0
