In [1]:
%matplotlib inline

In [2]:
from __future__ import print_function, division

from matplotlib import pyplot as plt
import pandas
import src
import gensim
import os
import os.path
import csv
import functools
import itertools
import collections
import scipy
import scipy.stats
from operator import itemgetter

2015-12-23 18:15:28 bocmaxima gensim.corpora.sharded_corpus[5056] INFO Could not import Theano, will use standard float for default ShardedCorpus dtype.
2015-12-23 18:15:28 bocmaxima summa.preprocessing.cleaner[5056] INFO 'pattern' package not found; tag filters are not available for English


In [71]:
# Baseline model settings. Each entry of the model sweep starts as a copy of
# this dict, and most of its values end up in the ranks file name.
model_config = dict(
    num_topics=500,
    alpha=1/500,
    eta=1/500,
    alpha_base=1,
    eta_base=1,
    decay=0.5,
    offset=1.0,
    iterations=1000,
    passes=1,
    max_bound_iterations=1000,  # special
    algorithm='batch',  # special
)

# Baseline choice of which changeset parts go into the corpus. Each entry of
# the corpus sweep starts as a copy of this dict.
changeset_config = dict(
    include_additions=True,
    include_context=True,
    include_message=False,
    include_removals=True,
)

def get_config_string(config):
    """
    Flatten a configuration dict into a '-'-separated string of its values.

    Values are emitted in the sorted order of their keys, so two dicts with
    the same contents always produce the same string. The keys themselves do
    not appear in the result. An empty dict yields an empty string.
    """
    # `unicode` only exists on Python 2; fall back to `str` on Python 3 so the
    # function no longer raises NameError there, while Python 2 callers keep
    # getting unicode strings.
    try:
        to_text = unicode
    except NameError:
        to_text = str
    return '-'.join(to_text(config[key]) for key in sorted(config))

# Values explored by the model sweep: one model configuration is generated
# for every (alpha base, eta base, topic count) combination of these lists.
num_topics = [100, 200, 500]
alpha_bases = ['auto', 1, 2, 5]
eta_bases = list(alpha_bases)  # same candidates as alpha, held in its own list

def get_rank_name(kind, experiment, changeset_config, model_config):
    """
    Build the name of the '.csv.gz' ranks file for one experiment run.

    kind = [changeset, release, temporal]
    experiment = [triage, feature_location]

    The name joins, with '-': kind, experiment, 'lda', the changeset config
    values, the model config values (leaving out 'alpha_base' and
    'eta_base'), 'file' and 'ranks'. The joined text is lower-cased before
    the '.csv.gz' suffix is added. The caller's model_config is left
    untouched; a KeyError is raised if it lacks either base key.
    """
    changeset_part = get_config_string(changeset_config)

    # Drop the base keys from a copy so the caller's dict is not modified.
    trimmed = dict(model_config)
    for base_key in ('alpha_base', 'eta_base'):
        del trimmed[base_key]
    model_part = get_config_string(trimmed)

    pieces = [kind, experiment, 'lda', changeset_part, model_part, 'file', 'ranks']
    stem = '-'.join(pieces).lower()
    return stem + '.csv.gz'

# One model configuration per (alpha base, eta base, topic count) combination.
model_sweep = []
for alpha_base, eta_base, topics in itertools.product(alpha_bases, eta_bases, num_topics):
    # 'auto' is passed through untouched; a numeric base is divided by the
    # topic count (true division, via the __future__ import at the top).
    alpha = alpha_base if alpha_base == 'auto' else alpha_base / topics
    eta = eta_base if eta_base == 'auto' else eta_base / topics
    model_sweep.append(dict(
        model_config,
        alpha_base=alpha_base,
        eta_base=eta_base,
        alpha=alpha,
        eta=eta,
        num_topics=topics,
    ))

# Every on/off combination of the four changeset parts, skipping any
# configuration in which no value at all is truthy.
corpus_sweep = []
for additions, context, message, removals in itertools.product([True, False], repeat=4):
    candidate = dict(
        changeset_config,
        include_additions=additions,
        include_context=context,
        include_message=message,
        include_removals=removals,
    )
    if not any(candidate.values()):
        continue
    corpus_sweep.append(candidate)

In [72]:
# Discover the projects under the data directory: a project is any directory
# that directly contains a file called 'ref'. `rankpath` and `config` are
# left empty here and filled in later with Project._replace.
projects = list()
Project = collections.namedtuple('Project', 'name version data rankpath config')
data_root = "../data"
for dirpath, dirnames, filenames in os.walk(data_root):
    # Sorting in place makes os.walk descend in a stable order, so the
    # project list does not depend on the filesystem's listing order.
    dirnames.sort()
    if 'ref' not in filenames:
        continue
    # Expected layout is <data_root>/<name>/<version>. Splitting the relative
    # path on os.sep works regardless of the platform's separator, and any
    # other depth still fails the unpacking loudly (ValueError).
    name, version = os.path.relpath(dirpath, data_root).split(os.sep)
    projects.append(Project(name, version, dirpath, '', dict()))
projects

[Project(name='bookkeeper', version='v4.3.0', data='../data/bookkeeper/v4.3.0', rankpath='', config={}),
 Project(name='mahout', version='v0.10.0', data='../data/mahout/v0.10.0', rankpath='', config={}),
 Project(name='openjpa', version='v2.3.0', data='../data/openjpa/v2.3.0', rankpath='', config={}),
 Project(name='pig', version='v0.14.0', data='../data/pig/v0.14.0', rankpath='', config={}),
 Project(name='tika', version='v1.8', data='../data/tika/v1.8', rankpath='', config={}),
 Project(name='zookeeper', version='v3.5.0', data='../data/zookeeper/v3.5.0', rankpath='', config={})]

In [73]:
def _existing_rank_files(projects, named_configs):
    """
    Pair every project with every (config, rank file name) entry and keep the
    combinations whose rank file exists inside the project's data directory.

    projects: iterable of Project tuples (only `data` is read).
    named_configs: list of (config dict, rank file name) pairs.

    Returns a list of Project tuples with `rankpath` and `config` filled in,
    in the order produced by itertools.product(projects, named_configs).
    """
    found = list()
    for project, (config, rankname) in itertools.product(projects, named_configs):
        rankpath = os.path.join(project.data, rankname)
        if os.path.exists(rankpath):
            found.append(project._replace(rankpath=rankpath, config=config))
    return found


# Corpus sweep: vary the changeset config, keep the baseline model config.
cs_dit = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'triage', c, model_config)) for c in corpus_sweep])

cs_flt = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'feature_location', c, model_config)) for c in corpus_sweep])

# Model sweep: vary the model config, keep the baseline changeset config.
ms_dit = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'triage', changeset_config, c)) for c in model_sweep])

ms_flt = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'feature_location', changeset_config, c)) for c in model_sweep])

In [74]:
def _subject_label(item):
    """Return the display label "<Name> <version>" for a project tuple."""
    name = item.name.title().replace("keeper", "Keeper").replace("Openjpa", "OpenJPA")
    return name + " " + item.version


def _load_ranks(item, task, columns, settings):
    """
    Read one rank file and reduce it to one row per issue id.

    item: Project tuple; `rankpath` is the CSV to read (it must provide the
        columns 'id', 'rank' and 'distance') and `config` supplies the
        setting values.
    task: label stored in the "Task" column.
    columns: output column order.
    settings: (output column, config key) pairs copied from item.config.

    "Rank" and "Distance" are the per-id minima of 'rank' and 'distance',
    each taken independently; "Issue" is the id.
    """
    # A single groupby yields both minima; the original recomputed it three
    # times per file.
    best = pandas.read_csv(item.rankpath).groupby("id")[["rank", "distance"]].min()
    frame = pandas.DataFrame({
        "Issue": best.index.values,
        "Rank": best["rank"].values,
        "Distance": best["distance"].values,
    })
    # Scalars broadcast over every row on assignment.
    frame["Subject"] = _subject_label(item)
    frame["Task"] = task
    for column, config_key in settings:
        frame[column] = item.config[config_key]
    return frame[columns]


def _collect_ranks(runs, columns, settings):
    """
    Stack the per-file frames of several runs into one DataFrame.

    runs: list of (task label, list of Project tuples) pairs, processed in
        order. Returns an empty frame with `columns` when there is no file.
    """
    frames = [_load_ranks(item, task, columns, settings)
              for task, items in runs
              for item in items]
    if not frames:
        # pandas.concat refuses an empty list; keep the original's result.
        return pandas.DataFrame(columns=columns)
    # Concatenate once rather than appending inside the loop, which copies
    # the accumulated frame on every iteration.
    return pandas.concat(frames, ignore_index=True)


corpus_df = _collect_ranks(
    [("DIT", cs_dit), ("FLT", cs_flt)],
    ["Subject", "Task", "Issue", "Rank", "Distance", "Additions", "Removals", "Context", "Message"],
    [("Additions", "include_additions"),
     ("Removals", "include_removals"),
     ("Context", "include_context"),
     ("Message", "include_message")])

model_df = _collect_ranks(
    [("DIT", ms_dit), ("FLT", ms_flt)],
    ["Subject", "Task", "Issue", "Rank", "Distance", "alpha", "eta", "K"],
    [("alpha", "alpha_base"),
     ("eta", "eta_base"),
     ("K", "num_topics")])

# Corpus analysis

In [59]:
# Preview the first ten rows of the corpus sweep results.
corpus_df.head(10)

Unnamed: 0,Subject,Task,Issue,Rank,Distance,Additions,Removals,Context,Message
0,BookKeeper v4.3.0,DIT,257,1,0.887216,True,True,True,True
1,BookKeeper v4.3.0,DIT,312,1,0.732501,True,True,True,True
2,BookKeeper v4.3.0,DIT,313,1,0.891555,True,True,True,True
3,BookKeeper v4.3.0,DIT,363,1,0.813099,True,True,True,True
4,BookKeeper v4.3.0,DIT,429,3,0.963592,True,True,True,True
5,BookKeeper v4.3.0,DIT,432,1,0.890814,True,True,True,True
6,BookKeeper v4.3.0,DIT,446,1,0.886618,True,True,True,True
7,BookKeeper v4.3.0,DIT,506,1,0.883021,True,True,True,True
8,BookKeeper v4.3.0,DIT,526,1,0.878225,True,True,True,True
9,BookKeeper v4.3.0,DIT,544,1,0.881726,True,True,True,True


In [8]:
# src.utils.calculate_mrr applied to the ranks of every (subject, task) pair,
# split by whether a changeset part was included. One Series per part, shown
# together as a tuple in the order Additions, Removals, Context, Message.
tuple(
    corpus_df.groupby(["Subject", "Task", part]).Rank.apply(src.utils.calculate_mrr)
    for part in ("Additions", "Removals", "Context", "Message")
)

(Subject            Task  Additions
 BookKeeper v4.3.0  DIT   False        0.617247
                          True         0.622891
                    FLT   False        0.478523
                          True         0.575344
 Mahout v0.10.0     DIT   False        0.280230
                          True         0.311753
                    FLT   False        0.671349
                          True         0.659577
 OpenJPA v2.3.0     DIT   False        0.315077
                          True         0.344113
                    FLT   False        0.316098
                          True         0.331543
 Pig v0.14.0        DIT   False        0.200080
                          True         0.176639
                    FLT   False        0.446399
                          True         0.480212
 Tika v1.8          DIT   False        0.355327
                          True         0.414457
                    FLT   False        0.457941
                          True         0.503806
 Zoo

In [9]:
def _inclusion_row(label, part, excluded, included):
    """
    Build one result row comparing ranks without and with a changeset part.

    label: value for the "Subject" column (a (subject, task) tuple or an
        "**Overall ...**" string).
    part: name of the changeset part being compared ("Config" column).
    excluded / included: ranks of the runs where the part was off / on.
    """
    # NOTE(review): the default `alternative` of mannwhitneyu has differed
    # between SciPy releases (one-sided vs two-sided p-value) - confirm which
    # one the 0.05 threshold applied to this table is meant for.
    stat, p = scipy.stats.mannwhitneyu(excluded, included)
    return {
        "Subject": label,
        "Config": part,
        "NotIncl": src.utils.calculate_mrr(excluded),
        "Incl": src.utils.calculate_mrr(included),
        "p": p,
    }


# Rows are gathered in a list and turned into a DataFrame once, instead of
# calling DataFrame.append on every iteration.
rows = list()
for k in ["Additions", "Removals", "Context", "Message"]:
    # Per (subject, task): ranks with the part off versus on.
    for key, group in corpus_df.groupby(["Subject", "Task"]):
        sub = group.groupby(k).groups
        # `groups` holds index labels of corpus_df, hence label-based .loc
        # (the .ix indexer used before has been removed from pandas).
        rows.append(_inclusion_row(
            key, k, corpus_df.loc[sub[False]].Rank, corpus_df.loc[sub[True]].Rank))

    # Same comparison over all subjects at once, one row per task.
    sub = corpus_df.groupby(["Task", k]).groups
    for task in ["DIT", "FLT"]:
        rows.append(_inclusion_row(
            "**Overall %s**" % task, k,
            corpus_df.loc[sub[(task, False)]].Rank,
            corpus_df.loc[sub[(task, True)]].Rank))

res = pandas.DataFrame(rows, columns=["Subject", "Config", "NotIncl", "Incl", "p"])

In [10]:
# Comparisons with p < 0.05 where the NotIncl score is greater than Incl.
higher_without = res.NotIncl > res.Incl
below_threshold = res.p < 0.05
res[higher_without & below_threshold]

Unnamed: 0,Subject,Config,NotIncl,Incl,p
15,"(BookKeeper v4.3.0, FLT)",Removals,0.541638,0.520118,0.03835963
16,"(Mahout v0.10.0, DIT)",Removals,0.31024,0.285494,0.0003012957
20,"(Pig v0.14.0, DIT)",Removals,0.212382,0.165875,7.141461e-13
24,"(ZooKeeper v3.5.0, DIT)",Removals,0.381525,0.344173,1.853858e-11
26,**Overall DIT**,Removals,0.365892,0.343557,2.612734e-14
35,"(Pig v0.14.0, FLT)",Context,0.478371,0.452237,0.02513671
44,"(Mahout v0.10.0, DIT)",Message,0.307949,0.287499,0.03774864


In [11]:
# Comparisons with p < 0.05 where the Incl score is greater than NotIncl.
higher_with = res.NotIncl < res.Incl
below_threshold = res.p < 0.05
res[higher_with & below_threshold]

Unnamed: 0,Subject,Config,NotIncl,Incl,p
1,"(BookKeeper v4.3.0, FLT)",Additions,0.478523,0.575344,1.090972e-09
2,"(Mahout v0.10.0, DIT)",Additions,0.28023,0.311753,0.0006962983
4,"(OpenJPA v2.3.0, DIT)",Additions,0.315077,0.344113,0.001438463
7,"(Pig v0.14.0, FLT)",Additions,0.446399,0.480212,0.007726905
10,"(ZooKeeper v3.5.0, DIT)",Additions,0.357364,0.365314,0.0003700157
12,**Overall DIT**,Additions,0.349374,0.35801,0.000226208
13,**Overall FLT**,Additions,0.51856,0.54997,7.531264e-06
28,"(BookKeeper v4.3.0, DIT)",Context,0.56957,0.664609,1.53474e-11
34,"(Pig v0.14.0, DIT)",Context,0.172989,0.200344,9.659076000000001e-33
38,"(ZooKeeper v3.5.0, DIT)",Context,0.349857,0.371883,9.960133e-07


In [17]:
# for key, group in corpus_df.groupby(["Subject", "Task"]):
#     ranks = dict()
#     for subkey, subgroup in group.groupby(["Additions", "Removals", "Context", "Message"]):
#         ranks[subkey] = subgroup.Rank

#     print(key, scipy.stats.friedmanchisquare(*ranks.values()))
#     for x, y in itertools.combinations(corpus_df.groupby(["Additions", "Removals", "Context", "Message"]).groups.keys(), r=2):
#         stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
#         if p < 0.01:
#             print(x, y, p, "******")
#         else:
#             print(x, y, p)
#     print()

In [90]:
# Friedman test over the ranks of every corpus configuration, first per task
# and then per (subject, task). The single-key grouping is passed as a plain
# string so the printed key stays a scalar ('DIT') on pandas versions that
# return 1-tuples for a list of length one.
# friedmanchisquare needs samples of equal length - assumes every
# configuration covers the same issues; TODO confirm.
for grouping in ("Task", ["Subject", "Task"]):
    for key, group in corpus_df.groupby(grouping):
        ranks = dict()
        for subkey, subgroup in group.groupby(["Additions", "Removals", "Context", "Message"]):
            ranks[subkey] = subgroup.Rank

        print(key, scipy.stats.friedmanchisquare(*ranks.values()))

DIT FriedmanchisquareResult(statistic=942.88611070531044, pvalue=2.7776415516204836e-192)
FLT FriedmanchisquareResult(statistic=105.8991068336115, pvalue=3.4773259820180311e-16)
('BookKeeper v4.3.0', 'DIT') FriedmanchisquareResult(statistic=237.24182745176435, pvalue=1.2404160564716079e-42)
('BookKeeper v4.3.0', 'FLT') FriedmanchisquareResult(statistic=109.54686260102889, pvalue=6.849009684746497e-17)
('Mahout v0.10.0', 'DIT') FriedmanchisquareResult(statistic=75.591854689939609, pvalue=1.8426793680244682e-10)
('Mahout v0.10.0', 'FLT') FriedmanchisquareResult(statistic=34.988044179604266, pvalue=0.0014760196886066115)
('OpenJPA v2.3.0', 'DIT') FriedmanchisquareResult(statistic=84.36907166975098, pvalue=4.3434668992647125e-12)
('OpenJPA v2.3.0', 'FLT') FriedmanchisquareResult(statistic=15.76841888027292, pvalue=0.32772036536870108)
('Pig v0.14.0', 'DIT') FriedmanchisquareResult(statistic=933.14654193117281, pvalue=3.4009366822314683e-190)
('Pig v0.14.0', 'FLT') FriedmanchisquareResult(s

# Model analysis

In [75]:
# Preview the first ten rows of the model sweep results.
model_df.head(10)

Unnamed: 0,Subject,Task,Issue,Rank,Distance,alpha,eta,K
0,BookKeeper v4.3.0,DIT,257,1,0.922109,auto,auto,100
1,BookKeeper v4.3.0,DIT,312,1,0.776521,auto,auto,100
2,BookKeeper v4.3.0,DIT,313,1,0.944556,auto,auto,100
3,BookKeeper v4.3.0,DIT,363,2,0.837609,auto,auto,100
4,BookKeeper v4.3.0,DIT,429,5,0.978392,auto,auto,100
5,BookKeeper v4.3.0,DIT,432,1,0.856682,auto,auto,100
6,BookKeeper v4.3.0,DIT,446,3,0.868081,auto,auto,100
7,BookKeeper v4.3.0,DIT,506,2,0.896509,auto,auto,100
8,BookKeeper v4.3.0,DIT,526,1,0.905376,auto,auto,100
9,BookKeeper v4.3.0,DIT,544,2,0.908101,auto,auto,100


In [18]:
# for key, group in model_df.groupby(["Subject", "Task"]):
#     ranks = dict()
#     for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
#         ranks[subkey] = subgroup.Rank

#     print(key, scipy.stats.friedmanchisquare(*ranks.values()))
#     for x, y in itertools.combinations(model_df.groupby(["alpha", "eta", "K"]).groups.keys(), r=2):
#         stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
#         if p < 0.01:
#             print(x, y, p, "******")
#         else:
#             print(x, y, p)
#     print()

In [76]:
# Friedman test over the ranks of every (alpha, eta, K) configuration, first
# per task and then per (subject, task). The single-key grouping is passed as
# a plain string so the printed key stays a scalar ('DIT') on pandas versions
# that return 1-tuples for a list of length one.
# friedmanchisquare needs samples of equal length - assumes every
# configuration covers the same issues; TODO confirm.
for grouping in ("Task", ["Subject", "Task"]):
    for key, group in model_df.groupby(grouping):
        ranks = dict()
        for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
            ranks[subkey] = subgroup.Rank

        print(key, scipy.stats.friedmanchisquare(*ranks.values()))

DIT FriedmanchisquareResult(statistic=2451.7193649520473, pvalue=0.0)
FLT FriedmanchisquareResult(statistic=836.58592171787814, pvalue=4.1273171489425937e-145)
('BookKeeper v4.3.0', 'DIT') FriedmanchisquareResult(statistic=699.43501411184604, pvalue=4.4954912006458822e-117)
('BookKeeper v4.3.0', 'FLT') FriedmanchisquareResult(statistic=121.00535731211414, pvalue=1.9072676752950246e-08)
('Mahout v0.10.0', 'DIT') FriedmanchisquareResult(statistic=327.62654973189512, pvalue=1.0329224953056032e-43)
('Mahout v0.10.0', 'FLT') FriedmanchisquareResult(statistic=140.14055852706161, pvalue=3.3747836389820856e-11)
('OpenJPA v2.3.0', 'DIT') FriedmanchisquareResult(statistic=201.52410662592223, pvalue=4.9284621782782372e-21)
('OpenJPA v2.3.0', 'FLT') FriedmanchisquareResult(statistic=247.77533730590784, pvalue=4.4283904720696155e-29)
('Pig v0.14.0', 'DIT') FriedmanchisquareResult(statistic=1057.482441605845, pvalue=8.5734191487907764e-191)
('Pig v0.14.0', 'FLT') FriedmanchisquareResult(statistic=32

In [82]:
# Pairwise Wilcoxon signed-rank tests between the values of one model
# parameter at a time, per (subject, task). Rows are gathered in a list and
# turned into a DataFrame once, instead of calling DataFrame.append on every
# iteration.
# NOTE: this rebinds `res`, which the corpus analysis also uses for its table.
rows = list()
for k in ["alpha", "eta", "K"]:
    for (subject, task), group in model_df.groupby(["Subject", "Task"]):
        ranks = dict()
        for subkey, subgroup in group.groupby(k):
            ranks[subkey] = subgroup.Rank

        for f, t in itertools.combinations(ranks.keys(), r=2):
            # wilcoxon pairs the two samples by position, so both must have
            # the same length and ordering.
            stat, p = scipy.stats.wilcoxon(ranks[f], ranks[t])
            rows.append({
                "Subject": subject,
                "Task": task,
                "Config": k + "=" + str(f),
                "Config2": k + "=" + str(t),
                "MRR": src.utils.calculate_mrr(ranks[f]),
                "MRR2": src.utils.calculate_mrr(ranks[t]),
                "p": p,
            })

res = pandas.DataFrame(rows, columns=["Subject", "Task", "Config", "Config2", "MRR", "MRR2", "p"])

In [87]:
# Number of comparisons with p below 0.05, then the number at or above it.
tuple(len(res[mask]) for mask in (res.p < 0.05, res.p >= 0.05))

(94, 86)

In [91]:
# Display the whole `res` table (the output below shows the pairwise model
# parameter comparisons).
res

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
0,BookKeeper v4.3.0,DIT,alpha=1,alpha=2,0.587483,0.589922,9.484692e-02
1,BookKeeper v4.3.0,DIT,alpha=1,alpha=5,0.587483,0.593699,3.997912e-01
2,BookKeeper v4.3.0,DIT,alpha=1,alpha=auto,0.587483,0.589160,2.681239e-01
3,BookKeeper v4.3.0,DIT,alpha=2,alpha=5,0.589922,0.593699,4.433888e-01
4,BookKeeper v4.3.0,DIT,alpha=2,alpha=auto,0.589922,0.589160,6.975340e-03
5,BookKeeper v4.3.0,DIT,alpha=5,alpha=auto,0.593699,0.589160,7.686273e-02
6,BookKeeper v4.3.0,FLT,alpha=1,alpha=2,0.527051,0.512845,3.179879e-02
7,BookKeeper v4.3.0,FLT,alpha=1,alpha=5,0.527051,0.520513,1.073050e-02
8,BookKeeper v4.3.0,FLT,alpha=1,alpha=auto,0.527051,0.518723,2.842989e-01
9,BookKeeper v4.3.0,FLT,alpha=2,alpha=5,0.512845,0.520513,9.420306e-01
