In [1]:
%matplotlib inline

In [2]:
from __future__ import print_function, division
import pandas
import src
import os, os.path
from operator import itemgetter
# If there is no 'data' entry in the current working directory, step up one
# level. Presumably the notebook is started from a subdirectory of the
# repository and the relative 'data/...' paths used later must resolve from
# the repository root -- TODO confirm.
if not os.path.exists('data'):
    os.chdir('..')

In [3]:
def best_pair(s, t):
    """Return the first position on which both descending rankings agree.

    Each input is ranked by value, highest first. The ranking is produced by
    reversing an ascending stable sort, so equal values appear in reverse of
    their original order. The two rankings are then walked in lockstep and
    the original (0-based, enumerate-style) position of the first entry that
    holds the same rank in both is returned.

    Returns None when the rankings share no entry at any common rank
    (including when either input is empty).
    """
    def ranking(values):
        # Ascending stable sort, then flipped: do NOT replace with
        # sorted(..., reverse=True), which orders ties differently.
        ascending = sorted(enumerate(values), key=itemgetter(1))
        return [position for position, _ in ascending[::-1]]

    for pos_s, pos_t in zip(ranking(s), ranking(t)):
        if pos_s == pos_t:
            return pos_s
    return None

In [4]:
# Load the project descriptors with the `level` option set to 'file'; the
# bare name on the last line makes the notebook display the loaded value.
projects = src.common.load_projects({'level': 'file'})
projects

[Project(name='tika', printable_name='Tika v1.8', version='v1.8', ref='refs/tags/1.8', data_path='data/tika/', full_path='data/tika/v1.8/', src_path='data/tika/v1.8/src/', level='file'),
 Project(name='pig', printable_name='Pig v0.14.0', version='v0.14.0', ref='refs/tags/release-0.14.0', data_path='data/pig/', full_path='data/pig/v0.14.0/', src_path='data/pig/v0.14.0/src/', level='file'),
 Project(name='bookkeeper', printable_name='BookKeeper v4.3.0', version='v4.3.0', ref='refs/tags/release-4.3.0', data_path='data/bookkeeper/', full_path='data/bookkeeper/v4.3.0/', src_path='data/bookkeeper/v4.3.0/src/', level='file'),
 Project(name='openjpa', printable_name='OpenJPA v2.3.0', version='v2.3.0', ref='refs/tags/2.3.0', data_path='data/openjpa/', full_path='data/openjpa/v2.3.0/', src_path='data/openjpa/v2.3.0/src/', level='file'),
 Project(name='mahout', printable_name='Mahout v0.10.0', version='v0.10.0', ref='refs/tags/mahout-0.10', data_path='data/mahout/', full_path='data/mahout/v0.10.0

In [5]:
# Raw CSV column name -> display name, per sweep kind. The 'score' entry is a
# placeholder: each experiment's score column is renamed to the experiment's
# title when its CSV is loaded below. This dict itself is never mutated.
names = {'model': {'score': 'score',
                   'model_base_alpha': 'alpha',
                   'model_base_eta': 'eta',
                   'num_topics': 'K'
                  },
         'corpus': {'score': 'score',
                    'changeset_include_additions': 'Additions',
                    'changeset_include_context': 'Context',
                    'changeset_include_message': 'Message',
                    'changeset_include_removals': 'Removals',
                    },
        }
exps = ['triage', 'feature_location']
# Columns (and their order) emitted in the LaTeX table for each sweep kind.
table_headers = {
    'model': ['K', 'alpha', 'eta', 'Feature Location', 'Triage'],
    'corpus': ['Additions', 'Removals', 'Context', 'Message', 'Feature Location', 'Triage']
}


def _bold_best_formatters(df, score_columns=('Feature Location', 'Triage')):
    """Build ``to_latex`` formatters that bold-face each score column's maximum.

    The maximum of every column in ``score_columns`` is computed once from
    ``df``. The returned dict maps each column name to a callable that renders
    a value as ``%.4f``, wrapped in ``{\\bf ...}`` when it equals that
    maximum. Because the maxima are captured here, the same formatters can be
    applied to slices of ``df`` and still highlight the table-wide best value.
    """
    def bold_if_best(best):
        def fmt(x):
            if x == best:
                return r"{\bf %.4f}" % x
            return "%.4f" % x
        return fmt

    return dict((col, bold_if_best(df[col].max())) for col in score_columns)


# Table wrapper; slots are: project printable name, sweep kind, label, tabular.
full_tex = r"""
\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of %s %s construction sweep}
\label{table:%s}
\vspace{0.2em}
%s
\end{spacing}
\end{table}
"""

tex_dir = os.path.expanduser("~/git/dissertation/tables")
for project in projects:
    for rq in names.keys():
        # Merge the per-experiment sweep results into one frame per sweep
        # kind. `main_df` is deliberately left at module level: a later cell
        # displays the frame from the final iteration.
        main_df = None
        for exp in exps:
            path = os.path.join(project.full_path, 'optimized-%s-changeset-%s.csv' % (rq, exp))
            # 'feature_location' -> 'Feature Location'
            exp_title = ' '.join(exp.title().split('_'))
            # Per-experiment copy of the rename map, so `names` stays untouched.
            rename_map = dict(names[rq], score=exp_title)
            exp_df = pandas.read_csv(path).rename(columns=rename_map)
            if main_df is not None and len(main_df):
                # Joins on every column the two frames have in common.
                main_df = main_df.merge(exp_df)
            else:
                main_df = exp_df

        # filter out uninteresting rows, like there was no corpus
        main_df = main_df[(main_df["Feature Location"] != 0) | (main_df["Triage"] != 0)]
        if rq == "model":
            # DataFrame.sort was deprecated in pandas 0.17 and removed in
            # 0.20; sort_values is its replacement.
            main_df = main_df.sort_values(["K", "alpha", "eta"])

        label = "%s_%s_sweep" % (project.name, rq)
        op = os.path.join(tex_dir, label + ".tex")

        formatters = _bold_best_formatters(main_df)
        latex_kwargs = dict(index=False,
                            escape=False,  # needed so it doesn't screw up formatters
                            formatters=formatters,
                            columns=table_headers[rq])

        if len(main_df) > 24:
            # Long tables are split into two side-by-side halves.
            tex = r"\parbox{.45\linewidth}{\centering %s} \hfill \parbox{.45\linewidth}{\centering %s}"
            mid = len(main_df)//2
            tex = tex % (main_df[:mid].to_latex(**latex_kwargs),
                         main_df[mid:].to_latex(**latex_kwargs))
        else:
            tex = main_df.to_latex(**latex_kwargs)

        print(best_pair(main_df["Triage"], main_df["Feature Location"]))

        # Textual post-processing of the generated LaTeX: prettier column
        # headers and a vertical rule between parameter and score columns.
        this_full_tex = full_tex % (project.printable_name, rq, label, tex)
        for old, new in (
                (" alpha ", r" $\alpha$ "),
                (" eta ", r" $\eta$ "),
                (" Feature Location ", " FLT "),
                (" Triage ", " DIT "),
                (r"\begin{tabular}{rllrr}", r"\begin{tabular}{rll|rr}"),
                (r"\begin{tabular}{llllrr}", r"\begin{tabular}{llll|rr}"),
        ):
            this_full_tex = this_full_tex.replace(old, new)
        print("Writing to: %s\n%s\n" % (op, this_full_tex))
        with open(op, 'wt') as f:
            f.write(this_full_tex)

47
Writing to: /home/cscorley/git/dissertation/tables/tika_model_sweep.tex

\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of Tika v1.8 model construction sweep}
\label{table:tika_model_sweep}
\vspace{0.2em}
\parbox{.45\linewidth}{\centering \begin{tabular}{rll|rr}
\toprule
   K & $\alpha$ &   $\eta$ & FLT & DIT \\
\midrule
 100 &     1 &     1 &           0.2149 & 0.2912 \\
 100 &     1 &     2 &           0.2418 & 0.3040 \\
 100 &     1 &     5 &           0.3358 & 0.2462 \\
 100 &     1 &  auto &           0.2107 & 0.2645 \\
 100 &     2 &     1 &           0.2618 & 0.3548 \\
 100 &     2 &     2 &           0.3256 & 0.2350 \\
 100 &     2 &     5 &           0.3811 & 0.3009 \\
 100 &     2 &  auto &           0.3716 & 0.2599 \\
 100 &     5 &     1 &           0.3104 & 0.2910 \\
 100 &     5 &     2 &           0.3042 & 0.2662 \\
 100 &     5 &     5 &           0.2412 & 0.2847 \\
 100 &     5 &  auto &           0.2741 & 0.2676 \\
 100 &  auto &     1 &         

In [6]:
# Display the frame left in `main_df` by the final iteration of the loop in
# the previous cell (this cell depends on that cell having been run).
main_df

Unnamed: 0,Triage,Additions,Context,Message,Removals,Feature Location
0,0.376213,True,True,True,True,0.487292
1,0.344632,True,True,False,True,0.444076
2,0.349625,True,False,True,True,0.461062
3,0.356383,True,False,False,True,0.476825
4,0.358121,True,True,True,False,0.482711
5,0.352604,True,True,False,False,0.45247
6,0.365811,True,False,True,False,0.460786
7,0.330849,True,False,False,False,0.443428
8,0.333073,False,True,True,True,0.47878
9,0.359117,False,True,False,True,0.412791
