In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from __future__ import print_function, division
import pandas
import src
import os, os.path
import matplotlib.pyplot as plt
import scipy.stats
from operator import itemgetter

In [3]:
def fake(*args, **kwargs):
    """Report the call and abort the process instead of doing any work.

    Intended as a replacement for functions that must never run from this
    notebook: it prints the positional and keyword arguments it received and
    then raises ``SystemExit`` with status 1.
    """
    # Imported here because the notebook's import cell never imports ``sys``;
    # without this the name is only in scope if ``%pylab`` happens to leak it.
    import sys

    print('Fake called with', str(args), str(kwargs))
    sys.exit(1)

# fake out the create_model so we don't accidentally attempt to create data
# (any later call to src.common.create_model prints its arguments and exits)
src.common.create_model = fake

# Paths used by the following cells are relative (e.g. 'data/...'), so step up
# one directory when the current directory name ends with 'notebooks'.  The
# working directory is printed before and after so the move is visible.
print(os.getcwd())
if os.getcwd().endswith('notebooks'):
    os.chdir('..')
print(os.getcwd())

/home/cscorley/git/triage/notebooks
/home/cscorley/git/triage


In [4]:
# Load two BookKeeper v4.3.0 feature-location rank files whose file names
# differ only by the "seed1" component, so they can be compared below.
d1 = pandas.read_csv('data/bookkeeper/v4.3.0/changeset-feature_location-lda-true-true-false-true-seed1-batch-0.002-0.5-0.002-1000-1000-500-1.0-1-file-ranks.csv.gz')
d2 = pandas.read_csv('data/bookkeeper/v4.3.0/changeset-feature_location-lda-true-true-false-true-batch-0.002-0.5-0.002-1000-1000-500-1.0-1-file-ranks.csv.gz')

In [5]:
# Per-column check: a column is True only if every value in d1 equals d2's.
(d1 == d2).all()

id          True
rank        True
distance    True
item        True
dtype: bool

In [6]:
def best_pair(s, t):
    """Return the first index that holds the same rank in both sequences.

    ``s`` and ``t`` are each ranked from largest to smallest value (equal
    values end up in reverse of their original order).  The two rankings are
    walked in parallel, and the original index is returned at the first rank
    position where both rankings refer to the same index.  The walk stops at
    the end of the shorter ranking; ``None`` is returned when no position
    matches.
    """
    by_value = itemgetter(1)
    # Ascending sort followed by a reversal, so ties keep the reversed order.
    ranked_s = sorted(enumerate(s), key=by_value)[::-1]
    ranked_t = sorted(enumerate(t), key=by_value)[::-1]
    shared = (i for (i, _), (j, _) in zip(ranked_s, ranked_t) if i == j)
    return next(shared, None)

In [7]:
# Load the project descriptors with level='file'; the bare name on the last
# line displays the resulting list as the cell output.
projects = src.common.load_projects(dict(level='file'))
projects

[Project(name='tika', printable_name='Tika v1.8', version='v1.8', ref='refs/tags/1.8', data_path='data/tika/', full_path='data/tika/v1.8/', src_path='data/tika/v1.8/src/', level='file'),
 Project(name='pig', printable_name='Pig v0.14.0', version='v0.14.0', ref='refs/tags/release-0.14.0', data_path='data/pig/', full_path='data/pig/v0.14.0/', src_path='data/pig/v0.14.0/src/', level='file'),
 Project(name='bookkeeper', printable_name='BookKeeper v4.3.0', version='v4.3.0', ref='refs/tags/release-4.3.0', data_path='data/bookkeeper/', full_path='data/bookkeeper/v4.3.0/', src_path='data/bookkeeper/v4.3.0/src/', level='file'),
 Project(name='openjpa', printable_name='OpenJPA v2.3.0', version='v2.3.0', ref='refs/tags/2.3.0', data_path='data/openjpa/', full_path='data/openjpa/v2.3.0/', src_path='data/openjpa/v2.3.0/src/', level='file'),
 Project(name='mahout', printable_name='Mahout v0.10.0', version='v0.10.0', ref='refs/tags/mahout-0.10', data_path='data/mahout/', full_path='data/mahout/v0.10.0

In [8]:
# Column renames applied to each sweep CSV, one mapping per sweep kind.  The
# 'score' entry is a placeholder that the table-building loop rewrites for
# every experiment it loads.
names = {
    'model': {
        'score': 'score',
        'model_base_alpha': 'alpha',
        'model_base_eta': 'eta',
        'num_topics': 'K',
    },
    'corpus': {
        'score': 'score',
        'changeset_include_additions': 'Additions',
        'changeset_include_context': 'Context',
        'changeset_include_message': 'Message',
        'changeset_include_removals': 'Removals',
    },
}

# Experiments whose results are merged into each table, in load order.
exps = ['triage', 'feature_location']

# Column order of the rendered table for each sweep kind.
table_headers = {
    'model': ['K', 'alpha', 'eta', 'Feature Location', 'Triage'],
    'corpus': ['Additions', 'Removals', 'Context', 'Message', 'Feature Location', 'Triage']
}


def include_fmt(x):
    """Render a truthy inclusion flag as "Included" and anything else as blank."""
    if x:
        return "Included"
    return ""


def _per_topic_fmt(x):
    """Render a prior as a multiple of 1/K, passing 'auto' through untouched."""
    if x != 'auto':
        return "$%s/K$" % x
    return x


def _math_fmt(x):
    """Wrap a value in LaTeX math delimiters."""
    return "$%s$" % x


def _bold_best_fmt(column):
    """Build a score formatter that bolds the maximum of ``column``.

    ``main_df`` is looked up when a value is formatted, not when the
    formatter is built, so the maximum always comes from whichever table is
    currently bound to that global name.
    """
    def fmt(x):
        if x == max(main_df[column]):
            return r"$\bm{%.4f}$" % x
        return "$%.4f$" % x
    return fmt


# Per-column cell formatters handed to DataFrame.to_latex.
formatters = {
    'Feature Location': _bold_best_fmt("Feature Location"),
    'alpha': _per_topic_fmt,
    'eta': _per_topic_fmt,
    'K': _math_fmt,
    'Additions': include_fmt,
    'Removals': include_fmt,
    'Context': include_fmt,
    'Message': include_fmt,
    'Triage': _bold_best_fmt("Triage"),
}

# Table wrapper; filled with (project name, sweep kind, label, tabular body).
full_tex = "\n".join([
    "",
    r"\begin{table}",
    r"\begin{spacing}{1.2}",
    r"\centering",
    r"\caption{MRR values of %s %s construction sweep}",
    r"\label{table:%s}",
    r"\vspace{0.2em}",
    r"%s",
    r"\end{spacing}",
    r"\end{table}",
    "",
])

# Build one LaTeX table per (project, sweep kind) and remember the best rows.
#
# For every project and every key of `names` ("model" / "corpus"), the
# per-experiment CSVs are loaded, merged into `main_df`, rendered with
# to_latex and written to `tex_dir`.  The rows with the maximum
# "Feature Location" and "Triage" score are collected in best_flt / best_dit
# as (project, rows) tuples.
best_flt = list()
best_dit = list()
# NOTE(review): hard-coded, user-specific output directory; nothing here
# creates it, so open() below presumably fails if it does not exist.
tex_dir = os.path.expanduser("~/git/dissertation/tables")
for project in projects:
    for rq in names.keys():
        # reset the placeholder that the inner loop rewrote on the last pass
        names[rq]['score'] = 'score'
        main_df = pandas.DataFrame(columns=names[rq])
        for exp in exps:
            path = os.path.join(project.full_path, 'optimized-%s-changeset-%s.csv' % (rq, exp))
            # 'feature_location' -> 'Feature Location'; note that `exp` is
            # rebound from the file-name form to the column-title form here
            exp = ' '.join(exp.title().split('_'))
            # the CSV's 'score' column is renamed to the experiment title, so
            # each experiment contributes its own score column
            names[rq]['score'] = exp
            exp_df = pandas.read_csv(path)
            exp_df = exp_df.rename(columns=names[rq])
            # main_df starts out empty, so the first experiment replaces it and
            # later ones are merged in (merge() is called with its defaults)
            if len(main_df):
                main_df = main_df.merge(exp_df)
            else:
                main_df = exp_df
        
        # filter out uninteresting rows, like there was no corpus
        # (a row is kept when at least one of the two scores is non-zero)
        main_df = main_df[(main_df["Feature Location"] != 0) | (main_df["Triage"] != 0)]
        # NOTE(review): DataFrame.sort is the pre-0.17 pandas spelling; newer
        # pandas versions expect sort_values -- confirm the pinned version.
        if rq == "model":
            main_df = main_df.sort(["K", "alpha", "eta"])
        else:
            main_df = main_df.sort(["Additions", "Removals", "Context", "Message"], ascending=False)
        
        # keep every row that ties for the maximum score of each experiment
        best_flt.append((project, main_df[main_df["Feature Location"] == main_df["Feature Location"].max()]))
        best_dit.append((project, main_df[main_df["Triage"] == main_df["Triage"].max()]))
        
        label = "%s_%s_sweep" % (project.name, rq)
        op = os.path.join(tex_dir, label + ".tex")
        
        # Tables with more than 24 rows are split into two halves that are
        # placed side by side.  The score formatters read the global `main_df`,
        # so the bolded maximum is taken over the whole table, not per half.
        if len(main_df) > 24:
            tex = r"\parbox{.45\linewidth}{\centering %s} \hfill \parbox{.45\linewidth}{\centering %s}"
            mid = len(main_df)//2
            tex = tex % (main_df[:mid].to_latex(index=False,
                                                escape=False, # needed so it doesn't screw up formatters
                                                formatters=formatters,
                                                columns=table_headers[rq]),
                         main_df[mid:].to_latex(index=False,
                                                escape=False, # needed so it doesn't screw up formatters
                                                formatters=formatters,
                                                columns=table_headers[rq]))
        else:
            tex = main_df.to_latex(index=False,
                                   escape=False, # needed so it doesn't screw up formatters
                                   formatters=formatters,
                                   columns=table_headers[rq],)
            
        #print(best_pair(main_df["Triage"], main_df["Feature Location"]))
        
        # and now the lazy
        # (plain text substitutions on the rendered LaTeX: Greek-letter and
        # abbreviated headers, a vertical rule before the score columns, and a
        # \myrowcolor prefix on two specific rows; these depend on the exact
        # spacing that to_latex emits)
        this_full_tex = full_tex % (project.printable_name, rq, label, tex)
        this_full_tex = this_full_tex.replace(" alpha ", r" $\alpha$ ")
        this_full_tex = this_full_tex.replace(" eta ", r" $\eta$ ")
        this_full_tex = this_full_tex.replace(" Feature Location ", " FLT ")
        this_full_tex = this_full_tex.replace(" Triage ", " DIT ")
        this_full_tex = this_full_tex.replace(r"\begin{tabular}{rllrr}", r"\begin{tabular}{rll|rr}")
        this_full_tex = this_full_tex.replace(r"\begin{tabular}{llllrr}", r"\begin{tabular}{llll|rr}")
        this_full_tex = this_full_tex.replace(r"$500$ &  $1/K$ &  $1/K$ &", r"\myrowcolor $500$ &  $1/K$ &  $1/K$ &")
        this_full_tex = this_full_tex.replace(r"Included &  Included &  Included &           &", r"\myrowcolor Included &  Included &  Included &           &")
        print("Writing to: %s\n%s\n" % (op, this_full_tex))
        with open(op, 'wt') as f:
            f.write(this_full_tex)

Writing to: /home/cscorley/git/dissertation/tables/tika_model_sweep.tex

\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of Tika v1.8 model construction sweep}
\label{table:tika_model_sweep}
\vspace{0.2em}
\parbox{.45\linewidth}{\centering \begin{tabular}{rll|rr}
\toprule
    K &  $\alpha$ &    $\eta$ & FLT &   DIT \\
\midrule
$100$ &  $1/K$ &  $1/K$ &         $0.3030$ & $0.2813$ \\
$100$ &  $1/K$ &  $2/K$ &         $0.3104$ & $0.2858$ \\
$100$ &  $1/K$ &  $5/K$ &         $0.3377$ & $0.2377$ \\
$100$ &  $1/K$ &   auto &         $0.2793$ & $0.2814$ \\
$100$ &  $2/K$ &  $1/K$ &         $0.3013$ & $0.2732$ \\
$100$ &  $2/K$ &  $2/K$ &         $0.3007$ & $0.2609$ \\
$100$ &  $2/K$ &  $5/K$ &         $0.3418$ & $0.2410$ \\
$100$ &  $2/K$ &   auto &         $0.2782$ & $0.2881$ \\
$100$ &  $5/K$ &  $1/K$ &         $0.3199$ & $0.2898$ \\
$100$ &  $5/K$ &  $2/K$ &         $0.3396$ & $0.2799$ \\
$100$ &  $5/K$ &  $5/K$ &         $0.3273$ & $0.2223$ \\
$100$ &  $5/K$ &   auto & 

Writing to: /home/cscorley/git/dissertation/tables/mahout_corpus_sweep.tex

\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of Mahout v0.10.0 corpus construction sweep}
\label{table:mahout_corpus_sweep}
\vspace{0.2em}
\begin{tabular}{llll|rr}
\toprule
Additions &  Removals &   Context &   Message & FLT &        DIT \\
\midrule
 Included &  Included &  Included &  Included &         $0.3119$ &      $0.2785$ \\
 \myrowcolor Included &  Included &  Included &           &         $0.2730$ &      $0.3340$ \\
 Included &  Included &           &  Included &         $0.2766$ &      $0.2797$ \\
 Included &  Included &           &           &         $0.2606$ &      $0.3052$ \\
 Included &           &  Included &  Included &         $0.3422$ &      $0.3124$ \\
 Included &           &  Included &           &         $0.3063$ &      $0.3492$ \\
 Included &           &           &  Included &         $0.2799$ &      $0.3116$ \\
 Included &           &           &           &      

In [9]:
# best_flt / best_dit hold one (project, rows) entry per sweep kind.  Split
# them into project-keyed dicts by which columns the rows carry: 'alpha' for
# the model sweep, 'Additions' for the corpus sweep.
# NOTE(review): hasattr() relies on pandas exposing columns as attributes.
best_flt_model = {p:x for p,x in best_flt if hasattr(x, 'alpha')}
best_flt_corpus = {p:x for p,x in best_flt if hasattr(x, 'Additions')}
best_dit_model = {p:x for p,x in best_dit if hasattr(x, 'alpha')}
best_dit_corpus = {p:x for p,x in best_dit if hasattr(x, 'Additions')}

In [10]:
# Display the best triage rows of the corpus sweep, keyed by project.
best_dit_corpus

{Project(name='bookkeeper', printable_name='BookKeeper v4.3.0', version='v4.3.0', ref='refs/tags/release-4.3.0', data_path='data/bookkeeper/', full_path='data/bookkeeper/v4.3.0/', src_path='data/bookkeeper/v4.3.0/src/', level='file'):      Triage Additions Context Message Removals  Feature Location
 0  0.721646      True    True    True     True            0.5246,
 Project(name='mahout', printable_name='Mahout v0.10.0', version='v0.10.0', ref='refs/tags/mahout-0.10', data_path='data/mahout/', full_path='data/mahout/v0.10.0/', src_path='data/mahout/v0.10.0/src/', level='file'):      Triage Additions Context Message Removals  Feature Location
 7  0.382731      True   False   False    False          0.260056,
 Project(name='openjpa', printable_name='OpenJPA v2.3.0', version='v2.3.0', ref='refs/tags/2.3.0', data_path='data/openjpa/', full_path='data/openjpa/v2.3.0/', src_path='data/openjpa/v2.3.0/src/', level='file'):      Triage Additions Context Message Removals  Feature Location
 5  0.4

In [11]:
# Base options passed to src.main.get_default_model_config and
# src.common.load_projects.  get_model_config / get_corpus_config update this
# dict in place on every call.
kwargs = dict(model='lda', level='file', source=['changeset'], force=False)

def get_model_config(table, project):
    """Reload ``project`` using the model settings of its best sweep row.

    ``table`` maps a project to the rows of its best model-sweep result.  The
    first row's ``K``, ``alpha`` and ``eta`` override the default model
    configuration; ``alpha`` and ``eta`` are divided by ``K`` unless they are
    the string ``'auto'``.  The changeset configuration stays at its defaults.

    Side effect: the module-level ``kwargs`` dict is updated in place with
    both configurations and their dash-joined config strings.

    Returns the first entry of ``src.common.load_projects(kwargs)`` whose
    ``name`` equals ``project.name`` (``IndexError`` if there is none).
    """
    def joined(config):
        # values only, ordered by sorted config items
        return '-'.join([unicode(value) for _key, value in sorted(config.items())])

    # the default config strings are discarded; they are rebuilt below
    model_config, _unused = src.main.get_default_model_config(kwargs)
    changeset_config, _unused = src.main.get_default_changeset_config()

    best = table[project]
    K = int(best['K'].values[0])
    alpha = best['alpha'].values[0]
    eta = best['eta'].values[0]

    if alpha != 'auto':
        alpha = int(alpha) / K
    model_config['alpha'] = alpha
    if eta != 'auto':
        eta = int(eta) / K
    model_config['eta'] = eta
    model_config['num_topics'] = K

    model_config_string = joined(model_config)
    changeset_config_string = joined(changeset_config)

    kwargs.update({'changeset_config': changeset_config,
                   'changeset_config_string': changeset_config_string,
                   'model_config': model_config,
                   'model_config_string': model_config_string})

    matching = [x for x in src.common.load_projects(kwargs) if x.name == project.name]
    return matching[0]

def get_corpus_config(table, project):
    """Reload ``project`` using the corpus settings of its best sweep row.

    ``table`` maps a project to the rows of its best corpus-sweep result.  The
    first row's ``Additions``, ``Removals``, ``Context`` and ``Message``
    values override the matching ``include_*`` entries of the default
    changeset configuration.  The model configuration stays at its defaults.

    Side effect: the module-level ``kwargs`` dict is updated in place with
    both configurations and their dash-joined config strings.

    Returns the first entry of ``src.common.load_projects(kwargs)`` whose
    ``name`` equals ``project.name`` (``IndexError`` if there is none).
    """
    def joined(config):
        # values only, ordered by sorted config items
        return '-'.join([unicode(value) for _key, value in sorted(config.items())])

    # the default config strings are discarded; they are rebuilt below
    model_config, _unused = src.main.get_default_model_config(kwargs)
    changeset_config, _unused = src.main.get_default_changeset_config()

    best = table[project]
    overrides = (('include_additions', 'Additions'),
                 ('include_removals', 'Removals'),
                 ('include_context', 'Context'),
                 ('include_message', 'Message'))
    for option, column in overrides:
        changeset_config[option] = best[column].values[0]

    model_config_string = joined(model_config)
    changeset_config_string = joined(changeset_config)

    kwargs.update({'changeset_config': changeset_config,
                   'changeset_config_string': changeset_config_string,
                   'model_config': model_config,
                   'model_config_string': model_config_string})

    matching = [x for x in src.common.load_projects(kwargs) if x.name == project.name]
    return matching[0]

In [12]:
# Reload every project once per "best" configuration: best model-sweep and
# best corpus-sweep settings, each for feature location (flt) and triage (dit).
# The four calls run per project in this order; both helpers update the global
# `kwargs` in place, so the call order determines what `kwargs` holds afterwards.
best_model_flt_projects = list()
best_model_dit_projects = list()
best_corpus_flt_projects = list()
best_corpus_dit_projects = list()
for project in projects:
    # load project info
    best_model_flt_projects.append(get_model_config(best_flt_model, project))
    best_model_dit_projects.append(get_model_config(best_dit_model, project))
    
    best_corpus_flt_projects.append(get_corpus_config(best_flt_corpus, project))
    best_corpus_dit_projects.append(get_corpus_config(best_dit_corpus, project))

In [13]:
# For each group of best configurations, print every project's printable name
# together with element 1 of src.common.check_ranks(...) for the experiment
# that the group was optimised for.
rank_reports = [
    ("best dit corpus", best_corpus_dit_projects, "triage"),
    ("best flt corpus", best_corpus_flt_projects, "feature_location"),
    ("best dit model", best_model_dit_projects, "triage"),
    ("best flt model", best_model_flt_projects, "feature_location"),
]

for position, (heading, chosen, experiment) in enumerate(rank_reports):
    if position > 0:
        print()  # blank line between groups, none before the first
    print(heading)
    for each in chosen:
        print(each.printable_name, src.common.check_ranks(each, "changeset", experiment)[1])

best dit corpus
Tika v1.8 changeset-triage-lda-true-true-false-false-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
Pig v0.14.0 changeset-triage-lda-false-true-false-false-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
BookKeeper v4.3.0 changeset-triage-lda-true-true-true-true-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
OpenJPA v2.3.0 changeset-triage-lda-true-true-false-false-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
Mahout v0.10.0 changeset-triage-lda-true-false-false-false-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
ZooKeeper v3.5.0 changeset-triage-lda-false-true-false-false-batch-0.002-0.5-0.002-1000-1000-500-1.0-1

best flt corpus
Tika v1.8 changeset-feature_location-lda-true-true-true-true-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
Pig v0.14.0 changeset-feature_location-lda-true-false-false-true-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
BookKeeper v4.3.0 changeset-feature_location-lda-true-true-true-false-batch-0.002-0.5-0.002-1000-1000-500-1.0-1
OpenJPA v2.3.0 changeset-feature_location-lda-t

KeyboardInterrupt: 

In [None]:
def generate_panels(best_dit, best_flt):
    """Collect triage and feature-location ranks for both sets of projects.

    ``best_dit`` and ``best_flt`` are iterables of project objects.  Every
    project is run through ``src.triage.run_experiment`` and
    ``src.feature_location.run_experiment``; the first element of each result
    triple under the ``'changeset'`` key is converted to ``int`` and stored
    as a ``pandas.Series``.

    For ``best_dit`` projects the triage ranks are labelled "Optimal" and the
    feature-location ranks "Alternate"; for ``best_flt`` projects the labels
    are swapped.  Results are keyed by ``printable_name``.

    Returns ``(dit_panel, flt_panel)``, two ``pandas.Panel`` objects.
    """
    def changeset_ranks(run_experiment, project):
        results = run_experiment(project)['changeset']
        return pandas.Series([int(rank) for rank, _, _ in results])

    # Both tables are keyed from best_dit only, so every project in best_flt
    # must share a printable_name with one in best_dit.
    flt_data_ranks = {each.printable_name: dict() for each in best_dit}
    dit_data_ranks = {each.printable_name: dict() for each in best_dit}

    for each in best_dit:
        label = each.printable_name
        dit_data_ranks[label]["Optimal"] = changeset_ranks(src.triage.run_experiment, each)
        flt_data_ranks[label]["Alternate"] = changeset_ranks(src.feature_location.run_experiment, each)

    for each in best_flt:
        label = each.printable_name
        dit_data_ranks[label]["Alternate"] = changeset_ranks(src.triage.run_experiment, each)
        flt_data_ranks[label]["Optimal"] = changeset_ranks(src.feature_location.run_experiment, each)

    dit_panel = pandas.Panel(dit_data_ranks)
    flt_panel = pandas.Panel(flt_data_ranks)
    return dit_panel, flt_panel

# Build the rank panels for the best model-sweep and best corpus-sweep
# configurations.  Each call runs both experiments for every project.
model_dit_panel, model_flt_panel = generate_panels(best_model_dit_projects, best_model_flt_projects)
corpus_dit_panel, corpus_flt_panel = generate_panels(best_corpus_dit_projects, best_corpus_flt_projects)

# THIS FIGURE GENERATION IS OUTDATED -- USE SWEEP ANALYSIS

In [None]:
# LaTeX figure wrapper with two side-by-side subfigures (with and without
# outliers).  It takes 13 values, in this order:
#   figure path, label stem,            (subfigure including outliers)
#   figure path, label stem,            (subfigure excluding outliers)
#   measure, config name, subject,      (short caption)
#   measure, optimal MRR, alternate MRR, config name, subject,  (full caption)
#   label stem                          (figure label)
# "%%" renders as a literal "%" (a LaTeX end-of-line comment).
#
# An earlier single-image template that was assigned to the same name and
# immediately overwritten has been removed: it was never used and its five
# placeholders did not match the values supplied when formatting.
FIG_TEX = r"""
\begin{figure}
    \centering
    \begin{subfigure}{.4\textwidth}
        \centering
        \includegraphics[height=0.4\textheight]{%s}
        \caption{Including outliers}\label{fig:combo:%s_outlier}
    \end{subfigure}%%
    \begin{subfigure}{.4\textwidth}
        \centering
        \includegraphics[height=0.4\textheight]{%s_no_outlier}
        \caption{Excluding outliers}\label{fig:combo:%s_no_outlier}
    \end{subfigure}
\caption[%s effectiveness measures of optimal and alternate %s configurations for %s]%%
{%s effectiveness measures of optimal ($MRR=%.4f$) and alternate ($MRR=%.4f$) %s configurations for %s}
\label{fig:combo:%s}
\end{figure}
"""

def plot_dataframe(df, each, name, kind):
    """Save box plots of the Optimal/Alternate columns plus a LaTeX figure stub.

    Two box plots of ``df``'s "Optimal" and "Alternate" columns are saved as
    PDFs under ``~/git/dissertation/figures/combo/``: one with outliers
    (``<kind>_<name>_<short>.pdf``) and one without
    (``<kind>_<name>_<short>_no_outlier.pdf``), where ``<short>`` is the
    lower-cased first word of ``each``.  A ``.tex`` file built from
    ``FIG_TEX`` is written next to the first PDF; its caption reports the MRR
    of each column as computed by ``src.utils.calculate_mrr`` on the non-null
    values.

    ``name`` selects the configuration wording ("rq1" -> "model", anything
    else -> "corpus"); ``kind`` must be "flt" or "dit".
    """
    order = ["Optimal", "Alternate"]
    size = (len(order)*1.6, 4.5)
    limitgrowth = 0.5
    kinds = {"flt": "Feature Location", "dit": "Developer Identification"}
    config_name = "model" if name == "rq1" else "corpus"
    dissertation_dir = os.path.expanduser('~/git/dissertation/')

    def draw_boxes(**extra):
        # Box plot with the y-range padded by `limitgrowth` on both sides;
        # the lower bound is clamped so it never goes below zero.
        axes = df.plot(kind='box',
                       fontsize=None,
                       figsize=size,
                       widths=0.3,
                       y=order,
                       **extra)
        bottom, top = axes.get_ylim()
        lower = bottom - limitgrowth
        if (lower < 0):
            lower = 0
        axes.set_ylim(lower, top + limitgrowth)
        plt.tight_layout()

    # with outliers
    draw_boxes()
    short_each = each.lower().split(' ')[0]
    fig_name = 'figures/combo/%s_%s_%s' % (kind, name, short_each)
    path = dissertation_dir + fig_name
    plt.savefig(path + ".pdf", dpi=300)

    optimal_mrr = src.utils.calculate_mrr(df["Optimal"].dropna())
    alternative_mrr = src.utils.calculate_mrr(df["Alternate"].dropna())

    with open(path + ".tex", "wt") as f:
        figlabel = ":".join([part.lower() for part in [kind, name, short_each]])
        template_values = (fig_name, figlabel,
                           fig_name, figlabel,
                           kinds[kind], config_name, each,  # toc caption
                           kinds[kind],  # full figure caption
                           optimal_mrr, alternative_mrr,
                           config_name, each,
                           figlabel)
        f.write(FIG_TEX % template_values)

    # no outliers
    draw_boxes(showfliers=False)
    fig_name = 'figures/combo/%s_%s_%s_no_outlier' % (kind, name, short_each)
    path = dissertation_dir + fig_name
    plt.savefig(path + ".pdf", dpi=300)

def plot_panel(panel, name, kind):
    """Plot every item of ``panel`` with ``plot_dataframe``.

    Prints ``name`` and ``kind``, then iterates ``panel`` and passes each
    item (``panel[each]``) together with its key, ``name`` and ``kind`` to
    ``plot_dataframe``.  Returns ``None``.

    A stale "tiny" overview plot that used to live here as a bare string
    literal has been removed: it was never executed and referenced names
    (``allt``, ``order``, ``kinds``) that are not defined in this function.
    """
    print(name, kind)
    for each in panel:
        plot_dataframe(panel[each], each, name, kind)

In [None]:
# "rq1" figures: model-configuration panels for triage (dit) and feature
# location (flt).
plot_panel(model_dit_panel, "rq1", "dit")
plot_panel(model_flt_panel, "rq1", "flt")

In [None]:
# "rq2" figures: corpus-configuration panels for triage (dit) and feature
# location (flt).
plot_panel(corpus_dit_panel, "rq2", "dit")
plot_panel(corpus_flt_panel, "rq2", "flt")

In [None]:
def stat_panel(panel):
    """Print a Wilcoxon signed-rank test of Optimal vs. Alternate per item.

    For every key of ``panel``, the "Optimal" and "Alternate" columns of
    ``panel[key]`` are stripped of missing values, printed, and compared with
    ``scipy.stats.wilcoxon(..., correction=True)``.  Returns ``None``.

    Raises ``AssertionError`` when the two columns cannot be paired: either
    they have a different number of non-missing values, or their missing
    values sit in different rows.
    """
    for each in panel:
        print(each)
        frame = panel[each]
        opt = frame["Optimal"].dropna()
        alt = frame["Alternate"].dropna()
        print(opt, alt)
        assert len(opt) == len(alt)
        # The test is paired, so equal lengths are not enough: if the two
        # columns dropped different rows, the remaining values would be
        # matched against the wrong partner.
        assert opt.index.equals(alt.index), \
            "Optimal and Alternate are missing values in different rows"
        print(each, scipy.stats.wilcoxon(opt, alt, correction=True))