In [1]:
import pandas as pd
import numpy as np
from classifier import *
from dataloader import *

In [2]:
import multiprocessing as mp

In [3]:
# Load the dataset and keep only the first row for each distinct
# (TC, TM, D) combination.
df = (
    get_dataset(data_files, True)
    .drop_duplicates(subset=['TC', 'TM', 'D'])
)
df.head()

Unnamed: 0,TC,TM,CUT,D,L,ABS,INT,JDK,ICB,DEP,...,TUAPI,UINT,SYNC,CALLSITES,AFPR,RBFA,EXPCAT,CONDCALL,PROJ,IS_MOCK
0,org.apache.hadoop.resourceestimator.translator...,testGetContainerSpec,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,0.5,...,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Hadoop,False
1,org.apache.hadoop.resourceestimator.translator...,testGetJobSize,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,0.5,...,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Hadoop,False
2,org.apache.hadoop.resourceestimator.translator...,testGetRecurrenceeId,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,0.5,...,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Hadoop,False
3,org.apache.hadoop.resourceestimator.translator...,testStringToUnixTimestamp,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,0.5,...,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Hadoop,False
4,org.apache.hadoop.resourceestimator.translator...,testResourceSkyline,org.apache.hadoop.resourceestimator.translator...,org.apache.hadoop.yarn.api.records.Resource,real,True,False,False,False,0.5,...,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Hadoop,False


In [4]:
# Number of rows with a non-null 'TC' value in the de-duplicated frame.
df['TC'].count()

543171

In [5]:
# Per-project sample counts: every row of the project (TOTAL) and the rows
# flagged by IS_MOCK (MOCK). The project column is labelled 'PROJ' (it was
# misspelled 'PRIOJ') so it matches the column name used in `df`.
pd.DataFrame([
    {
        'PROJ': p,
        # Summing a boolean mask counts the rows it selects.
        'TOTAL': (df['PROJ'] == p).sum(),
        'MOCK': ((df['PROJ'] == p) & df['IS_MOCK']).sum(),
    }
    for p in projects
])

Unnamed: 0,PRIOJ,TOTAL,MOCK
0,Hadoop,323078,13484
1,Flink,86141,6719
2,Hive,23368,1490
3,Camel,12936,1855
4,CXF,22550,1489
5,Druid,45332,1869
6,HBase,11966,1072
7,Dubbo,8623,758
8,Oozie,5539,278
9,Storm,3638,369


In [6]:
# Overall number of IS_MOCK rows and their share of the dataset, in percent.
"%d mocks(%.2f%%)" % (
    df['IS_MOCK'].sum(),
    df['IS_MOCK'].sum() / df['IS_MOCK'].count() * 100,
)

'29383 mocks(5.41%)'

# MockSniffer (Intra Project Prediction)

In [7]:
def _model_core(p):
    """Fit and score the classifier on a single (train, test) pair.

    `p` is indexed as `p[0]` (training frame) and `p[1]` (test frame).
    The columns listed in `non_feature_cols` are dropped to build the feature
    matrices, 'IS_MOCK' is used as the target, and the value returned by
    `run_classifier` is handed back unchanged.
    """
    non_feature_cols = ['CUT', 'IS_MOCK', 'TC', 'TM', 'D', 'L', 'PROJ']
    train_df, test_df = p[0], p[1]
    return run_classifier(
        train_df.drop(non_feature_cols, axis=1),
        test_df.drop(non_feature_cols, axis=1),
        train_df['IS_MOCK'],
        test_df['IS_MOCK'],
    )

def run_intra_project(project_data, n_folds = 10, iter_count = 100, n_workers = 100, random_state = None):
    """Estimate intra-project prediction scores with balanced k-fold evaluation.

    The rows with `IS_MOCK` set are shuffled and cut into `n_folds` folds of
    equal size. For every fold, `iter_count` balanced datasets are built by
    pairing the mock rows with freshly sampled non-mock rows, and each dataset
    is scored by `_model_core` in a worker pool.

    Parameters
    ----------
    project_data : DataFrame holding the rows of one project; must contain a
        boolean 'IS_MOCK' column plus the columns `_model_core` expects.
    n_folds : number of folds the mock rows are split into.
    iter_count : number of resampled datasets evaluated per fold.
    n_workers : size of the multiprocessing pool (defaults to the previously
        hard-coded 100).
    random_state : optional int seed for the shuffling and sampling; `None`
        (the default) keeps the runs non-deterministic, as before.

    Returns
    -------
    dict mapping every name in `metrics` to the mean score over all
    `n_folds * iter_count` evaluations, multiplied by 100.

    Raises
    ------
    ValueError
        If there are fewer mock rows than folds, or too few non-mock rows to
        draw disjoint train and test samples.
    """
    rng = np.random.RandomState(random_state)
    project_mocks = (
        project_data[project_data['IS_MOCK']]
        .sample(frac=1, random_state=rng)
        .reset_index(drop=True)
    )
    project_not_mocks = project_data[~project_data['IS_MOCK']]

    fold_size = len(project_mocks)//n_folds
    if fold_size == 0:
        # Without this guard every test set would be empty.
        raise ValueError(
            "need at least n_folds=%d mock rows, got %d" % (n_folds, len(project_mocks))
        )
    n_train_neg = fold_size*(n_folds-1)
    n_neg = n_train_neg + fold_size
    if len(project_not_mocks) < n_neg:
        raise ValueError(
            "need at least %d non-mock rows, got %d" % (n_neg, len(project_not_mocks))
        )

    def _gen_data(mock_train, mock_test):
        # Draw all negatives in ONE sample without replacement and split it,
        # so that no non-mock row ends up in both the training and the test
        # set (two independent samples could overlap and leak test data).
        negatives = project_not_mocks.sample(n=n_neg, random_state=rng)
        train = pd.concat([mock_train, negatives.iloc[:n_train_neg]])
        test = pd.concat([mock_test, negatives.iloc[n_train_neg:]])
        return train, test

    scores = []
    # The context manager releases the worker processes even when a fold fails.
    with mp.Pool(n_workers) as pl:
        for i in range(n_folds):
            tfidx = fold_size*i
            # Mock rows beyond the last full fold are never tested; they stay
            # on the training side of every fold.
            mock_train = pd.concat([project_mocks.iloc[:tfidx], project_mocks.iloc[tfidx+fold_size:]])
            mock_test = project_mocks.iloc[tfidx:tfidx+fold_size]
            datasets = [_gen_data(mock_train, mock_test) for _ in range(iter_count)]
            scores.extend(pl.map(_model_core, datasets))

    return {
        k: np.mean([s[k] for s in scores])*100
        for k in metrics
    }

In [8]:
def pool_core_bal(par):
    """Evaluate one project.

    Selects the rows of the global `df` whose 'PROJ' equals `par` and returns
    the pair `(par, run_intra_project(<those rows>))`.
    """
    project_rows = df.loc[df['PROJ'] == par]
    return par, run_intra_project(project_rows)

# Evaluate the projects one after another; each call spawns its own pool.
runing_result = list(map(pool_core_bal, list(projects)))

In [9]:
# One row per project: the project name followed by its metric scores,
# indexed by project.
perf_df = pd.DataFrame(
    [{'project': name, **scores} for name, scores in runing_result]
).set_index(['project'])

In [10]:
# Show the per-project scores of the intra-project evaluation.
perf_df

Unnamed: 0_level_0,accuracy,precision,recall,f1-score
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hadoop,82.89911,80.36106,87.091395,83.587826
Flink,82.919449,81.316721,85.510432,83.345406
Hive,84.364765,84.456354,84.319463,84.331364
Camel,79.515946,76.18584,86.123784,80.802284
CXF,83.054392,84.668299,80.828378,82.645191
Druid,83.411022,83.531625,83.382796,83.393547
HBase,86.576168,84.393521,89.886916,86.997095
Dubbo,88.554667,85.753923,92.705333,89.029563
Oozie,91.142593,91.122036,91.574074,91.169781
Storm,91.769444,87.747912,97.480556,92.265536


In [11]:
# Summary statistics of each metric across the projects.
perf_df.describe()

Unnamed: 0,accuracy,precision,recall,f1-score
count,10.0,10.0,10.0,10.0
mean,85.420755,83.953729,87.890313,85.756759
std,3.978239,4.08734,5.000845,3.891283
min,79.515946,76.18584,80.828378,80.802284
25%,82.953184,81.870447,84.617205,83.357442
50%,83.887893,84.424937,86.607589,83.959595
75%,88.060042,85.482517,91.152285,88.521446
max,91.769444,91.122036,97.480556,92.265536


In [12]:
from baselineutil import run_baseline

# Baseline 1 (Existing Heuristics)

In [14]:
from baselines.b1 import baseline1
# Score baseline 1 on `df` through the shared `run_baseline` helper and show the result.
perf_bs1 = run_baseline(df, baseline1)
perf_bs1

Unnamed: 0_level_0,accuracy,precision,recall,f1-score
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hadoop,64.457876,60.005132,86.710175,70.927235
Flink,68.581634,62.538917,92.677482,74.682178
Hive,58.52349,58.52349,58.52349,58.52349
Camel,64.90566,62.28343,75.579515,68.290307
CXF,59.372733,59.475353,58.83143,59.15163
Druid,64.499732,66.749073,57.784912,61.944365
HBase,75.279851,68.871866,92.257463,78.867624
Dubbo,65.44591,69.073851,55.936675,61.814873
Oozie,64.235612,61.34559,76.978417,68.278294
Storm,52.735772,53.11819,46.612466,49.652989


# Baseline 2 (EvoSuite Mock List)

In [15]:
from baselines.b2 import baseline2
# Score baseline 2 on `df` through the shared `run_baseline` helper and show the result.
perf_bs2 = run_baseline(df, baseline2)
perf_bs2

Unnamed: 0_level_0,accuracy,precision,recall,f1-score
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hadoop,48.71329,13.929314,0.496885,0.959542
Flink,48.898199,8.886022,0.238131,0.463831
Hive,49.899329,47.058824,1.610738,3.11486
Camel,48.51752,6.349206,0.215633,0.417101
CXF,48.824715,15.686275,0.537273,1.038961
Druid,49.06367,2.702703,0.053505,0.104932
HBase,49.300373,20.0,0.466418,0.911577
Dubbo,49.139182,23.96,0.791557,1.53247
Oozie,46.582734,0.0,0.0,0.0
Storm,49.322493,0.0,0.0,0.0


# Baseline 3 (Empirical Rules)

In [16]:
from baselines.b3 import baseline3
# Score baseline 3 on `df`; unlike the other two baselines it is run with
# transform_whole_dataset=True.
# NOTE(review): the effect of that flag is defined in baselineutil, which is not visible here.
perf_bs3 = run_baseline(df, baseline3, transform_whole_dataset=True)
perf_bs3

Unnamed: 0_level_0,accuracy,precision,recall,f1-score
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hadoop,63.297241,57.856454,97.923465,72.737289
Flink,54.643548,52.463677,98.883762,68.554919
Hive,63.936577,58.818885,92.95302,72.047468
Camel,53.396226,51.8,97.735849,67.712418
CXF,54.734721,52.598599,95.836132,67.920038
Druid,52.70198,51.499851,92.776886,66.233766
HBase,54.610075,52.467445,98.027052,68.351049
Dubbo,54.547493,52.492309,95.778364,67.816881
Oozie,52.338129,51.403888,85.611511,64.237517
Storm,52.834688,51.57756,92.682927,66.273971


# Compare

In [32]:
# Side-by-side comparison: suffix every metric column with the approach it
# belongs to ("ms" for the intra-project scores, "bs1".."bs3" for the
# baselines), join on the project index, round to 2 decimals and order the
# columns metric by metric.
comp_df = (
    perf_df
    .rename(columns={m: f"{m}_ms" for m in metrics})
    .join([
        frame.rename(columns={m: f"{m}_{tag}" for m in metrics})
        for tag, frame in [("bs1", perf_bs1), ("bs2", perf_bs2), ("bs3", perf_bs3)]
    ])
    .round(2)
    [[f"{m}_{c}" for m in metrics for c in ["ms", "bs1", "bs2", "bs3"]]]
)
comp_df

Unnamed: 0_level_0,accuracy_ms,accuracy_bs1,accuracy_bs2,accuracy_bs3,precision_ms,precision_bs1,precision_bs2,precision_bs3,recall_ms,recall_bs1,recall_bs2,recall_bs3,f1-score_ms,f1-score_bs1,f1-score_bs2,f1-score_bs3
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Hadoop,82.9,64.46,48.71,63.3,80.36,60.01,13.93,57.86,87.09,86.71,0.5,97.92,83.59,70.93,0.96,72.74
Flink,82.92,68.58,48.9,54.64,81.32,62.54,8.89,52.46,85.51,92.68,0.24,98.88,83.35,74.68,0.46,68.55
Hive,84.36,58.52,49.9,63.94,84.46,58.52,47.06,58.82,84.32,58.52,1.61,92.95,84.33,58.52,3.11,72.05
Camel,79.52,64.91,48.52,53.4,76.19,62.28,6.35,51.8,86.12,75.58,0.22,97.74,80.8,68.29,0.42,67.71
CXF,83.05,59.37,48.82,54.73,84.67,59.48,15.69,52.6,80.83,58.83,0.54,95.84,82.65,59.15,1.04,67.92
Druid,83.41,64.5,49.06,52.7,83.53,66.75,2.7,51.5,83.38,57.78,0.05,92.78,83.39,61.94,0.1,66.23
HBase,86.58,75.28,49.3,54.61,84.39,68.87,20.0,52.47,89.89,92.26,0.47,98.03,87.0,78.87,0.91,68.35
Dubbo,88.55,65.45,49.14,54.55,85.75,69.07,23.96,52.49,92.71,55.94,0.79,95.78,89.03,61.81,1.53,67.82
Oozie,91.14,64.24,46.58,52.34,91.12,61.35,0.0,51.4,91.57,76.98,0.0,85.61,91.17,68.28,0.0,64.24
Storm,91.77,52.74,49.32,52.83,87.75,53.12,0.0,51.58,97.48,46.61,0.0,92.68,92.27,49.65,0.0,66.27


In [33]:
# Print the comparison table as LaTeX source.
print(comp_df.to_latex())

\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  accuracy\_ms &  accuracy\_bs1 &  accuracy\_bs2 &  accuracy\_bs3 &  precision\_ms &  precision\_bs1 &  precision\_bs2 &  precision\_bs3 &  recall\_ms &  recall\_bs1 &  recall\_bs2 &  recall\_bs3 &  f1-score\_ms &  f1-score\_bs1 &  f1-score\_bs2 &  f1-score\_bs3 \\
project &              &               &               &               &               &                &                &                &            &             &             &             &              &               &               &               \\
\midrule
Hadoop  &        82.90 &         64.46 &         48.71 &         63.30 &         80.36 &          60.01 &          13.93 &          57.86 &      87.09 &       86.71 &        0.50 &       97.92 &        83.59 &         70.93 &          0.96 &         72.74 \\
Flink   &        82.92 &         68.58 &         48.90 &         54.64 &         81.32 &          62.54 &           8.89 &          52.46 &      85.51 &       

In [35]:
# Print the summary statistics of the comparison table (rounded to 2 decimals) as LaTeX source.
print(comp_df.describe().round(2).to_latex())

\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
{} &  accuracy\_ms &  accuracy\_bs1 &  accuracy\_bs2 &  accuracy\_bs3 &  precision\_ms &  precision\_bs1 &  precision\_bs2 &  precision\_bs3 &  recall\_ms &  recall\_bs1 &  recall\_bs2 &  recall\_bs3 &  f1-score\_ms &  f1-score\_bs1 &  f1-score\_bs2 &  f1-score\_bs3 \\
\midrule
count &        10.00 &         10.00 &         10.00 &         10.00 &         10.00 &          10.00 &          10.00 &          10.00 &      10.00 &       10.00 &       10.00 &       10.00 &        10.00 &         10.00 &         10.00 &         10.00 \\
mean  &        85.42 &         63.81 &         48.82 &         55.70 &         83.95 &          62.20 &          13.86 &          53.30 &      87.89 &       70.19 &        0.44 &       94.82 &        85.76 &         65.21 &          0.85 &         68.19 \\
std   &         3.98 &          6.05 &          0.88 &          4.27 &          4.09 &           4.96 &          14.28 &           2.71 &       5.00 &       16.73 