In [1]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message it prints), so later cells may be relying on
# names that are never imported explicitly -- confirm before replacing it.
% pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from __future__ import print_function
import os.path
import pandas
import src
import sklearn
import os
import scipy
import scipy.stats

In [3]:
def fake(*args, **kwargs):
    """Stub that reports how it was called and then terminates.

    Accepts any positional and keyword arguments, prints them, and calls
    ``sys.exit(1)``, which raises ``SystemExit`` with exit code 1.
    """
    # Imported here because the notebook's import cell does not import `sys`;
    # without this the call below depends on some other import leaking it.
    import sys

    print('Fake called with', str(args), str(kwargs))
    sys.exit(1)

# Fake out create_model so we don't accidentally attempt to create data:
# after this assignment, anything that looks up src.common.create_model
# gets `fake`, which prints its arguments and is written to exit.
src.common.create_model = fake

In [4]:
# import seaborn
# seaborn.set_palette("colorblind")

In [5]:
# Show the starting directory; if it ends with 'notebooks', move up one
# level, then show the directory the rest of the notebook runs from.
start_dir = os.getcwd()
print(start_dir)
if start_dir.endswith('notebooks'):
    os.chdir('..')
print(os.getcwd())

/home/cscorley/git/triage/notebooks
/home/cscorley/git/triage


In [6]:
# Settings handed to the project loader.
args = {
    'level': 'file',
    'force': False,
    'model': 'lda',
    'source': ['release', 'changeset', 'temporal'],
    'random_seed_value': 1,
}

# Store both values returned by the default model configuration helper.
model_config, model_config_string = src.main.get_default_model_config(args)
args['model_config'] = model_config
args['model_config_string'] = model_config_string

# Store both values returned by the default changeset configuration helper.
changeset_config, changeset_config_string = src.main.get_default_changeset_config()
args['changeset_config'] = changeset_config
args['changeset_config_string'] = changeset_config_string

projects = src.common.load_projects(args)
projects

[Project(name='tika', printable_name='Tika v1.8', version='v1.8', ref='refs/tags/1.8', data_path='data/tika/', full_path='data/tika/v1.8/', src_path='data/tika/v1.8/src/', changeset_config_string=u'True-True-False-True', source=['release', 'changeset', 'temporal'], model_config_string=u'seed1-batch-0.002-0.5-0.002-1000-1000-500-1.0-1', force=False, level='file', model_config={'passes': 1, 'eta': 0.002, 'num_topics': 500, 'iterations': 1000, 'decay': 0.5, 'algorithm': 'batch', 'alpha': 0.002, 'offset': 1.0, 'max_bound_iterations': 1000}, model='lda', random_seed_value=1, changeset_config={'include_removals': True, 'include_context': True, 'include_message': False, 'include_additions': True}),
 Project(name='pig', printable_name='Pig v0.14.0', version='v0.14.0', ref='refs/tags/release-0.14.0', data_path='data/pig/', full_path='data/pig/v0.14.0/', src_path='data/pig/v0.14.0/src/', changeset_config_string=u'True-True-False-True', source=['release', 'changeset', 'temporal'], model_config_st

# Data read

In [171]:
# Per-project counts (keyed by printable name) and the raw tables they were
# computed from (keyed by project name).
data = {}
csvs = {}
for project in projects:
    ownership = src.ownership.read_ownership(project)

    # Union of the keys of every value mapping in `ownership`.
    devs = set()
    for v in ownership.values():
        devs |= set(v.keys())

    # The four info tables stored under the project's full_path.
    full_path = project.full_path
    goldsets = pandas.read_csv(os.path.join(full_path, 'goldset-info.csv'))
    changes = pandas.read_csv(os.path.join(full_path, 'changeset-info.csv'))
    release = pandas.read_csv(os.path.join(full_path, 'releasefile-info.csv'))
    queries = pandas.read_csv(os.path.join(full_path, 'queries-info.csv'))

    info = {
        "Developers": len(devs),
        "Changesets": len(changes),
        "Files": len(release),
        "Issues": len(queries),
    }
    data[project.printable_name] = info

    csvs[project.name] = {
        'g': goldsets,
        'c': changes,
        'r': release,
        'q': queries,
        'd': devs,
        'o': ownership,
    }


# One column per project, plus a 'Total' column summing each count across
# projects; transposed so that each project (and the total) is a row.
sizes = pandas.DataFrame(data)
sizes['Total'] = sizes.sum(axis=1)
sizes = sizes.T
sizes

Unnamed: 0,Changesets,Developers,Files,Issues
BookKeeper v4.3.0,574,5,843,164
Mahout v0.10.0,3283,38,1556,133
OpenJPA v2.3.0,4616,26,4968,137
Pig v0.14.0,2584,28,2098,222
Tika v1.8,2469,26,954,40
ZooKeeper v3.5.0,1245,16,927,359
Total,14771,139,11346,1055


In [7]:
# Label orderings. The two RQ lists are the leading and trailing pairs of
# the full ordering (each is a separate list object).
ALL_ORDER = ["Snapshot", "Changesets", "Historical"]
RQ1_ORDER = ALL_ORDER[:2]
RQ2_ORDER = ALL_ORDER[1:]

In [8]:
def get_panel(projects, fn):
    """Run ``fn`` on every project and collect the merged results in a Panel.

    ``fn(project)`` must return a mapping with the keys ``'changeset'``,
    ``'release'`` and ``'temporal'``. The changeset results are merged with
    the release results and, separately, with the temporal results through
    ``src.common.merge_first_rels(..., ignore=True)``. The lengths of the
    three merged sequences are printed for each project.

    Returns a ``pandas.Panel`` keyed by ``project.printable_name`` whose
    frames have the columns 'Changesets', 'Snapshot' and 'Historical'.
    """
    per_project = {}
    for project in projects:
        results = fn(project)

        changeset_side, release_side = src.common.merge_first_rels(
            results['changeset'], results['release'], ignore=True)
        # Only the temporal half of the second merge is kept.
        _, temporal_side = src.common.merge_first_rels(
            results['changeset'], results['temporal'], ignore=True)

        print(len(changeset_side), len(release_side), len(temporal_side))

        columns = {}
        columns['Changesets'] = pandas.Series(changeset_side)
        columns['Snapshot'] = pandas.Series(release_side)
        columns['Historical'] = pandas.Series(temporal_side)
        per_project[project.printable_name] = columns

    return pandas.Panel(per_project)

In [9]:
# Panel built from the triage experiment, one item per project.
tpanel = get_panel(projects=projects, fn=src.triage.run_experiment)

40 40 40
222 222 222
164 164 164
137 137 137
133 133 133
359 359 359


In [10]:
# Panel built from the feature-location experiment, one item per project.
fpanel = get_panel(projects=projects, fn=src.feature_location.run_experiment)

36 36 36
174 174 174
143 143 143
131 131 131
50 50 50
241 241 241


In [243]:
def print_stats(df, first, second, max_bound, bounds, fancy_bounds=False):
    """Print a tab-separated summary of how far two columns of ``df`` differ.

    The absolute difference ``|df[first] - df[second]|`` is computed per row;
    rows with a missing difference are dropped and rows with a difference of
    zero are counted separately as "same". Every count is printed together
    with its share of ``total`` (non-zero differences plus equal rows).

    Parameters
    ----------
    df : DataFrame-like indexable by the column labels ``first``/``second``.
    first, second : column labels to compare.
    max_bound : number the percentage bounds are scaled by.
    bounds : iterable of numbers, used twice: as absolute thresholds
        ("within <=N") and as percentages of ``max_bound``
        ("within <= N (P%)").
    fancy_bounds : value echoed on the "fancy:" line; it is not used for any
        computation. This used to be read from an undefined global, which
        fails on a fresh kernel; pass it explicitly if a value other than
        ``False`` is wanted.

    The closing "other" lines always use ``max(bounds)``, so the result no
    longer depends on the order in which ``bounds`` is given. When ``bounds``
    is empty those lines are skipped, and when there are no comparable rows
    every share is reported as 0.0 instead of raising ZeroDivisionError.
    """
    diff = (df[first] - df[second]).abs().dropna()
    diff = diff[diff != 0]  # equal rows are reported through `same` instead

    ones = df[(df[first] == 1) & (df[second] == 1)]
    same = df[df[first] == df[second]]

    total = len(diff) + len(same)

    def report(label, rows):
        # One output row: label, count, and the count's share of `total`.
        share = float(len(rows)) / total if total else 0.0
        print(label, len(rows), share, sep="\t")

    def scaled(percent):
        # Percentage of max_bound, rounded half up to an int.
        return int(((float(percent) / 100.0) * max_bound) + 0.5)

    print("max bound:", max_bound, sep="\t")
    print("fancy:", fancy_bounds, sep="\t")

    report("same:", same)
    report("same (ones):", ones)

    for each in range(1, 4):
        report("diff of %d:" % each, diff[diff == each])

    # Absolute thresholds.
    for each in bounds:
        report("within <=%d:" % (each), diff[diff <= each])

    largest = max(bounds) if bounds else None
    if largest is not None:
        report("other (>%f):" % (largest), diff[diff > largest])

    # The same bounds again, interpreted as percentages of max_bound.
    for each in bounds:
        bound = scaled(each)
        report("within <= %d (%f%%):" % (bound, each), diff[diff <= bound])

    if largest is not None:
        # Use the largest percentage rather than whichever bound happened to
        # be iterated last.
        bound = scaled(largest)
        report("other > %d (%f%%):" % (bound, largest), diff[diff > bound])

    print("total:", total, sep="\t")

In [251]:
def plot_panel(panel, a, b, bound_by, bounds=None):
    """Print ``print_stats`` reports for each item of ``panel`` and for all items combined.

    Despite the name, nothing is plotted: the output is text, with each
    report wrapped in ``<!--`` / ``-->`` markers.

    Parameters
    ----------
    panel : mapping-like; iterating yields item keys and ``panel[key]``
        yields the frame for that item.
    a, b : column labels handed to ``print_stats`` as ``first``/``second``.
    bound_by : column of the module-level ``sizes`` table; the value at
        ``sizes[bound_by][key]`` is the item's ``max_bound``.
    bounds : thresholds forwarded to ``print_stats``; defaults to
        ``[1, 5, 10, 50]``.

    The combined report stacks every item's frame and uses the largest
    per-item ``max_bound``. Relies on ``sizes`` and ``print_stats`` being
    defined in the notebook namespace.
    """
    if bounds is None:
        bounds = [1, 5, 10, 50]

    print("<!--", a, b, bound_by)
    frames = []
    max_max_bound = 0
    for each in panel:
        frames.append(panel[each])
        print("-->\n\n<!--", each)
        max_bound = sizes[bound_by][each]
        max_max_bound = max(max_max_bound, max_bound)
        print_stats(panel[each], a, b, max_bound, bounds)

    # Stack the frames once instead of appending inside the loop, which
    # re-copied the accumulated frame on every iteration.
    if frames:
        allt = pandas.concat(frames, ignore_index=True)
    else:
        allt = pandas.DataFrame()

    print("-->\n\n<!--All")
    print_stats(allt, a, b, max_max_bound, bounds)

    print("-->")

In [252]:
# Feature-location panel: Changesets vs. Snapshot, bounded by sizes["Files"].
plot_panel(panel=fpanel, a="Changesets", b="Snapshot", bound_by="Files")

<!-- Changesets Snapshot Files
-->

<!-- BookKeeper v4.3.0
max bound:	843
fancy:	False
same:	38	0.265734265734
same (ones):	34	0.237762237762
diff of 1:	14	0.0979020979021
diff of 2:	13	0.0909090909091
diff of 3:	4	0.027972027972
within <=1:	14	0.0979020979021
within <=5:	40	0.27972027972
within <=10:	46	0.321678321678
within <=50:	74	0.517482517483
other (>50.000000):	31	0.216783216783
within <= 8 (1.000000%):	42	0.293706293706
within <= 42 (5.000000%):	73	0.51048951049
within <= 84 (10.000000%):	84	0.587412587413
within <= 422 (50.000000%):	95	0.664335664336
other > 422 (50.000000%):	10	0.0699300699301
total:	143
-->

<!-- Mahout v0.10.0
max bound:	1556
fancy:	False
same:	6	0.12
same (ones):	5	0.1
diff of 1:	5	0.1
diff of 2:	4	0.08
diff of 3:	0	0.0
within <=1:	5	0.1
within <=5:	12	0.24
within <=10:	15	0.3
within <=50:	23	0.46
other (>50.000000):	21	0.42
within <= 16 (1.000000%):	17	0.34
within <= 78 (5.000000%):	24	0.48
within <= 156 (10.000000%):	28	0.56
within <= 778 (50.000000%):	

In [253]:
# Feature-location panel: Changesets vs. Historical, bounded by sizes["Files"].
plot_panel(panel=fpanel, a="Changesets", b="Historical", bound_by="Files")

<!-- Changesets Historical Files
-->

<!-- BookKeeper v4.3.0
max bound:	843
fancy:	False
same:	29	0.202797202797
same (ones):	21	0.146853146853
diff of 1:	16	0.111888111888
diff of 2:	8	0.0559440559441
diff of 3:	8	0.0559440559441
within <=1:	16	0.111888111888
within <=5:	43	0.300699300699
within <=10:	59	0.412587412587
within <=50:	83	0.58041958042
other (>50.000000):	31	0.216783216783
within <= 8 (1.000000%):	52	0.363636363636
within <= 42 (5.000000%):	82	0.573426573427
within <= 84 (10.000000%):	95	0.664335664336
within <= 422 (50.000000%):	112	0.783216783217
other > 422 (50.000000%):	2	0.013986013986
total:	143
-->

<!-- Mahout v0.10.0
max bound:	1556
fancy:	False
same:	6	0.12
same (ones):	4	0.08
diff of 1:	7	0.14
diff of 2:	4	0.08
diff of 3:	1	0.02
within <=1:	7	0.14
within <=5:	14	0.28
within <=10:	16	0.32
within <=50:	27	0.54
other (>50.000000):	17	0.34
within <= 16 (1.000000%):	21	0.42
within <= 78 (5.000000%):	32	0.64
within <= 156 (10.000000%):	37	0.74
within <= 778 (50.00000

In [254]:
# Triage panel: Changesets vs. Snapshot, bounded by sizes["Developers"].
plot_panel(panel=tpanel, a="Changesets", b="Snapshot", bound_by="Developers")

<!-- Changesets Snapshot Developers
-->

<!-- BookKeeper v4.3.0
max bound:	5
fancy:	False
same:	30	0.182926829268
same (ones):	17	0.103658536585
diff of 1:	61	0.371951219512
diff of 2:	20	0.121951219512
diff of 3:	41	0.25
within <=1:	61	0.371951219512
within <=5:	134	0.817073170732
within <=10:	134	0.817073170732
within <=50:	134	0.817073170732
other (>50.000000):	0	0.0
within <= 0 (1.000000%):	0	0.0
within <= 0 (5.000000%):	0	0.0
within <= 1 (10.000000%):	61	0.371951219512
within <= 3 (50.000000%):	122	0.743902439024
other > 3 (50.000000%):	12	0.0731707317073
total:	164
-->

<!-- Mahout v0.10.0
max bound:	38
fancy:	False
same:	16	0.12030075188
same (ones):	4	0.0300751879699
diff of 1:	17	0.127819548872
diff of 2:	19	0.142857142857
diff of 3:	16	0.12030075188
within <=1:	17	0.127819548872
within <=5:	67	0.503759398496
within <=10:	94	0.706766917293
within <=50:	117	0.87969924812
other (>50.000000):	0	0.0
within <= 0 (1.000000%):	0	0.0
within <= 2 (5.000000%):	36	0.270676691729
within <

In [255]:
# Triage panel: Changesets vs. Historical, bounded by sizes["Developers"].
plot_panel(panel=tpanel, a="Changesets", b="Historical", bound_by="Developers")

<!-- Changesets Historical Developers
-->

<!-- BookKeeper v4.3.0
max bound:	5
fancy:	False
same:	43	0.262195121951
same (ones):	30	0.182926829268
diff of 1:	58	0.353658536585
diff of 2:	40	0.243902439024
diff of 3:	17	0.103658536585
within <=1:	58	0.353658536585
within <=5:	121	0.737804878049
within <=10:	121	0.737804878049
within <=50:	121	0.737804878049
other (>50.000000):	0	0.0
within <= 0 (1.000000%):	0	0.0
within <= 0 (5.000000%):	0	0.0
within <= 1 (10.000000%):	58	0.353658536585
within <= 3 (50.000000%):	115	0.701219512195
other > 3 (50.000000%):	6	0.0365853658537
total:	164
-->

<!-- Mahout v0.10.0
max bound:	38
fancy:	False
same:	12	0.0902255639098
same (ones):	4	0.0300751879699
diff of 1:	14	0.105263157895
diff of 2:	15	0.112781954887
diff of 3:	11	0.0827067669173
within <=1:	14	0.105263157895
within <=5:	51	0.383458646617
within <=10:	89	0.669172932331
within <=50:	121	0.90977443609
other (>50.000000):	0	0.0
within <= 0 (1.000000%):	0	0.0
within <= 2 (5.000000%):	29	0.218045

In [162]:
# Pull the BookKeeper item out of the feature-location panel so the cells
# below can inspect it directly.
df = fpanel["BookKeeper v4.3.0"]

In [78]:
# Number of rows where both the Changesets and Snapshot columns equal 1.
both_ones = (df["Changesets"] == 1) & (df["Snapshot"] == 1)
len(df[both_ones])

34

In [98]:
# Rows where the Changesets and Snapshot columns hold the same value.
same_rank = df["Changesets"] == df["Snapshot"]
df[same_rank]

Unnamed: 0,Changesets,Historical,Snapshot
3,1,2,1
4,1,1,1
6,1,1,1
11,1,7,1
17,1,6,1
20,1,1,1
23,1,1,1
25,1,2,1
27,1,67,1
31,1,1,1
