Copyright (C) 2020 Edouard Fouché

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

In [1]:
import math

import matplotlib as mpl
from matplotlib import cm
# Render figure text through LaTeX and load the `libertine` package in the preamble.
# NOTE(review): presumably needs a working LaTeX installation that provides libertine -- confirm.
mpl.rcParams['text.usetex'] = True 
mpl.rcParams['text.latex.preamble'] = r'\usepackage{libertine}' 
# Use a serif font family for all matplotlib text.
mpl.rc('font', family='serif')

import numpy as np
import pandas as pd
import seaborn as sns
import copy

import matplotlib.pyplot as plt
from matplotlib import rc

from cycler import cycler

# Distiller used when writing PostScript output.
mpl.rcParams['ps.usedistiller'] = 'xpdf'

# Newer matplotlib releases renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and eventually dropped the old names. Try the new name
# first and fall back to the legacy one so the cell runs on both old and new
# installations (an unknown style name raises OSError).
try:
    plt.style.use('seaborn-v0_8-notebook')
except OSError:
    plt.style.use('seaborn-notebook')

# Font sizes for titles, axis labels, legend and tick labels.
plt.rcParams['axes.titlesize'] = '25'
plt.rcParams['axes.labelsize'] = '25'
plt.rcParams['legend.fontsize'] = '15'
plt.rcParams['xtick.labelsize'] = '15'
plt.rcParams['ytick.labelsize'] = '15'

# 16-entry property cycle: the 8 "husl" colours repeated twice, paired
# element-wise with the markers v/s/o repeated 7 times and cut to 16 entries.
monochrome=(cycler('color', sns.color_palette("husl", 8))*2+(cycler('marker', ['v', 's', "o"])*7)[0:16])
plt.rc('axes', prop_cycle=monochrome)

# Let pandas display wide and long result tables without truncation.
pd.options.display.max_rows = 999
pd.options.display.max_columns = None

# pyplot.get_cmap is used instead of cm.get_cmap, which is deprecated and
# removed in recent matplotlib releases; it returns the same colormap.
cmap = plt.get_cmap('RdBu')

In [2]:
folder = "2020-02-10-17-37_Ablation_" # Name of the experiment output folder to analyse; replace with your own run

In [3]:
# Load the raw ablation results of the selected experiment folder.
csv_path = "../experiments/" + folder + "/Ablation.csv"
masterdata = pd.read_csv(csv_path)

In [4]:
# List the columns available in the loaded results table.
masterdata.columns

Index(['scoreId', 'dataset', 'bmId', 'k', 'j', 'nwords', 'ndocs', 'emb', 'cpu',
       'wall', 'cpurep', 'cpuindex', 'cpucache', 'c', 'p', 'entropythreshold',
       'miss', 'typeBrecall', 'typeBprecision', 'typeArecall',
       'typeAprecision', 'typeAauc', 'typeAap', 'typeBr1', 'typeBr2',
       'typeBr5', 'typeBr10', 'typeBr20', 'typeBr30', 'typeBp1', 'typeBp2',
       'typeBp5', 'typeBp10', 'typeBp20', 'typeBp30', 'typeBr1N', 'typeBr2N',
       'typeBr5N', 'typeBr10N', 'typeBr20N', 'typeBr30N', 'typeBp1N',
       'typeBp2N', 'typeBp5N', 'typeBp10N', 'typeBp20N', 'typeBp30N',
       'typeAr1', 'typeAr2', 'typeAr5', 'typeAr10', 'typeAr20', 'typeAr30',
       'typeAp1', 'typeAp2', 'typeAp5', 'typeAp10', 'typeAp20', 'typeAp30',
       'avgmiss', 'stdmiss', 'avgmatch', 'stdmatch', 'rep'],
      dtype='object')

In [5]:
# Distinct values of the "p" column present in the results.
masterdata["p"].unique() 

array([ 0.  ,  0.01,  0.02,  0.05,  0.1 ,  0.2 ,  0.3 ,  0.4 ,  0.5 ,
        0.6 ,  0.7 ,  0.8 ,  0.9 ,  0.95,  0.98,  1.  ])

In [6]:
# Distinct values of the "c" column present in the results.
masterdata["c"].unique() 

array([ 0.1 ,  0.2 ,  0.  ,  0.05,  0.3 ])

In [7]:
# Distinct values of the "rep" column (NOTE(review): presumably the repetition number of each run -- confirm).
masterdata["rep"].unique() 

array([1, 2, 4, 3, 5])

In [8]:
# Distinct dataset names present in the results.
masterdata["dataset"].unique() 

array(['arxiv_15', 'arxiv_25', 'arxiv_35', 'nyt_10', 'arxiv_45',
       'arxiv_54', 'arxiv_52', 'arxiv_53', 'arxiv_55', 'arxiv_51',
       'nyt_20', 'nyt_50', 'nyt_2', 'nyt_1', 'nyt_5'], dtype=object)

In [9]:
# Distinct values of the "k" column present in the results.
masterdata["k"].unique() 

array([ 0, 30])

In [10]:
# Distinct values of the "j" column present in the results.
masterdata["j"].unique() 

array([ 0, 30])

In [11]:
# Distinct values of the "scoreId" column present in the results.
masterdata["scoreId"].unique() 

array(['KJNN', 'WKJNN'], dtype=object)

In [12]:
# Keep only the runs with c == 0.2 and p in {0.9, 1.0}.
# .copy() makes subdata an independent frame: later cells add columns to it,
# and doing that on a plain boolean slice of masterdata triggers pandas'
# SettingWithCopyWarning. isin() replaces the per-element Python membership test.
subdata = masterdata[(masterdata["c"] == 0.2) & masterdata["p"].isin([0.9, 1.0])].copy()

In [13]:
# Per-row F1 (harmonic mean of precision and recall) for type A and type B.
# Rows whose precision + recall is 0 evaluate to 0/0 and therefore hold NaN.
# assign() returns a new frame, so no value is ever written into a slice of
# masterdata (the original column assignment raised SettingWithCopyWarning).
subdata = subdata.assign(**{
    "A-F1": 2 * (subdata["typeArecall"] * subdata["typeAprecision"])
            / (subdata["typeArecall"] + subdata["typeAprecision"]),
    "B-F1": 2 * (subdata["typeBrecall"] * subdata["typeBprecision"])
            / (subdata["typeBrecall"] + subdata["typeBprecision"]),
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
# Add the variant-label column "A", initialised to the placeholder "0" on every row.
# assign() rebinds subdata to a new frame instead of writing into a slice,
# which avoids the SettingWithCopyWarning of the original column assignment.
subdata = subdata.assign(A="0")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
def _variant_mask(score_id, k, j, p, c=0.2):
    """Return a boolean mask over subdata's rows matching exactly this configuration."""
    return (
        (subdata["scoreId"] == score_id)
        & (subdata["k"] == k)
        & (subdata["j"] == j)
        & (subdata["c"] == c)
        & (subdata["p"] == p)
    )


# One mask per ablation variant; each differs from the "kj-NN" configuration
# (WKJNN, k=30, j=30, p=0.9) in exactly one setting.
A4index = _variant_mask("KJNN", 30, 30, 0.9)   # scoreId KJNN instead of WKJNN
A3index = _variant_mask("WKJNN", 0, 30, 0.9)   # k = 0
A2index = _variant_mask("WKJNN", 30, 30, 1.0)  # p = 1.0
A1index = _variant_mask("WKJNN", 30, 0, 0.9)   # j = 0
kjNNind = _variant_mask("WKJNN", 30, 30, 0.9)  # reference configuration

# Work on an owned copy and write through .loc: the original chained form
# subdata["A"][mask] = ... assigns into a temporary Series, which pandas
# warns about and which is not guaranteed to update the frame.
subdata = subdata.copy()
for label, mask in [
    ("A4", A4index),
    ("A3", A3index),
    ("A2", A2index),
    ("A1", A1index),
    ("kj-NN", kjNNind),
]:
    subdata.loc[mask, "A"] = label

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the cave

In [16]:
# Discard every row that still carries the "0" placeholder, i.e. was not tagged with a variant.
subdata = subdata.loc[subdata["A"].ne("0")]

In [17]:
# Quality metrics to aggregate (stored as fractions, shown as percentages).
att = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1",
       "typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5",
       "typeBr10", "typeBr20"]
# Subset of the metrics displayed by this cell.
attsmall = ['typeAprecision', 'typeArecall', "A-F1",
            'typeBprecision', 'typeBrecall', "B-F1", "typeBr10", "typeBr20"]
# Timing columns (divided by 1000 below).
times = ['cpu', 'wall', 'cpurep', 'cpuindex', 'cpucache']

# Average every metric and timing per dataset / variant / configuration.
output = subdata.groupby(["dataset", "A", "scoreId", "p", "c", "j", "k"])[att + times].mean()

# Metrics as percentages and timings divided by 1000, both rounded to two decimals.
output[att] = (output[att] * 100).round(2)
output[times] = (output[times] / 1000).round(2)

output[attsmall]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,typeAprecision,typeArecall,A-F1,typeBprecision,typeBrecall,B-F1,typeBr10,typeBr20
dataset,A,scoreId,p,c,j,k,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
arxiv_15,A1,WKJNN,0.9,0.2,0,30,0.0,0.0,,0.0,0.0,,0.0,0.0
arxiv_15,A2,WKJNN,1.0,0.2,30,30,0.0,0.0,,0.0,0.0,,0.0,0.0
arxiv_15,A3,WKJNN,0.9,0.2,30,0,0.0,0.0,,0.0,0.0,,0.0,0.0
arxiv_15,A4,KJNN,0.9,0.2,30,30,0.0,0.0,,0.0,0.0,,0.0,0.0
arxiv_15,kj-NN,WKJNN,0.9,0.2,30,30,0.0,0.0,,0.0,0.0,,0.0,0.0
arxiv_25,A1,WKJNN,0.9,0.2,0,30,3.22,32.17,5.86,95.01,88.93,91.87,49.02,88.93
arxiv_25,A2,WKJNN,1.0,0.2,30,30,0.0,0.0,,91.22,98.17,94.56,49.22,95.48
arxiv_25,A3,WKJNN,0.9,0.2,30,0,5.01,50.0,9.11,97.3,88.16,92.5,49.9,88.16
arxiv_25,A4,KJNN,0.9,0.2,30,30,3.88,38.7,7.05,96.1,88.98,92.4,49.22,88.98
arxiv_25,kj-NN,WKJNN,0.9,0.2,30,30,3.88,38.7,7.05,96.09,88.85,92.33,49.22,88.85


In [18]:
# Column lists are defined before they are used, so this cell no longer
# depends on `att` / `times` left behind by a previously executed cell.
attsmall = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1"]
att = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1",
       "typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5",
       "typeBr10", "typeBr20"]
times = ['cpu', 'wall', 'cpurep', 'cpuindex', 'cpucache']

# Average every metric and timing per dataset / variant / configuration.
output = subdata.groupby(["dataset", "A", "scoreId", "p", "c", "j", "k"])[att + times].mean()

# Metrics as percentages and timings divided by 1000, both rounded to two decimals.
output[att] = (output[att] * 100).round(2)
output[times] = (output[times] / 1000).round(2)

# Show the type-A AUC and AP columns for all variants.
output[["typeAauc", "typeAap"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,typeAauc,typeAap
dataset,A,scoreId,p,c,j,k,Unnamed: 7_level_1,Unnamed: 8_level_1
arxiv_15,A1,WKJNN,0.9,0.2,0,30,54.21,1.17
arxiv_15,A2,WKJNN,1.0,0.2,30,30,54.21,1.17
arxiv_15,A3,WKJNN,0.9,0.2,30,0,54.21,1.17
arxiv_15,A4,KJNN,0.9,0.2,30,30,54.21,1.17
arxiv_15,kj-NN,WKJNN,0.9,0.2,30,30,54.21,1.17
arxiv_25,A1,WKJNN,0.9,0.2,0,30,67.75,3.7
arxiv_25,A2,WKJNN,1.0,0.2,30,30,70.64,5.1
arxiv_25,A3,WKJNN,0.9,0.2,30,0,81.23,5.93
arxiv_25,A4,KJNN,0.9,0.2,30,30,70.48,5.09
arxiv_25,kj-NN,WKJNN,0.9,0.2,30,30,70.64,5.1


In [19]:
# Column lists are defined before they are used, so this cell no longer
# depends on `att` / `times` left behind by a previously executed cell.
attsmall = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1"]
att = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1",
       "typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5",
       "typeBr10", "typeBr20"]
times = ['cpu', 'wall', 'cpurep', 'cpuindex', 'cpucache']

# Restrict to the rows labelled "kj-NN" and average per dataset / configuration.
kjnn_rows = subdata[subdata["A"] == "kj-NN"]
output = kjnn_rows.groupby(["dataset", "A", "scoreId", "p", "c", "j", "k"])[att + times].mean()

# Metrics as percentages and timings divided by 1000, both rounded to two decimals.
output[att] = (output[att] * 100).round(2)
output[times] = (output[times] / 1000).round(2)

# Show the type-A columns.
output[["typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,typeAauc,typeAap,typeAr1,typeAr2,typeAr5
dataset,A,scoreId,p,c,j,k,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
arxiv_15,kj-NN,WKJNN,0.9,0.2,30,30,54.21,1.17,0.0,0.0,0.0
arxiv_25,kj-NN,WKJNN,0.9,0.2,30,30,70.64,5.1,10.44,16.52,27.82
arxiv_35,kj-NN,WKJNN,0.9,0.2,30,30,73.58,3.27,6.45,10.97,20.97
arxiv_45,kj-NN,WKJNN,0.9,0.2,30,30,69.94,3.28,6.46,9.09,18.99
arxiv_51,kj-NN,WKJNN,0.9,0.2,30,30,65.24,2.19,4.68,7.9,14.36
arxiv_52,kj-NN,WKJNN,0.9,0.2,30,30,78.1,3.07,3.87,7.9,19.35
arxiv_53,kj-NN,WKJNN,0.9,0.2,30,30,79.16,5.1,7.74,15.65,32.42
arxiv_54,kj-NN,WKJNN,0.9,0.2,30,30,76.65,3.85,5.81,10.0,21.61
arxiv_55,kj-NN,WKJNN,0.9,0.2,30,30,76.74,3.22,5.49,9.84,20.48
nyt_1,kj-NN,WKJNN,0.9,0.2,30,30,92.51,17.57,25.0,39.2,61.4


In [20]:
# Column lists are defined before they are used, so this cell no longer
# depends on `att` / `times` left behind by a previously executed cell.
attsmall = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1"]
att = ['typeAprecision', 'typeArecall', "A-F1",
       'typeBprecision', 'typeBrecall', "B-F1",
       "typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5",
       "typeBr10", "typeBr20"]
times = ['cpu', 'wall', 'cpurep', 'cpuindex', 'cpucache']

# Restrict to the rows labelled "kj-NN" and average per dataset / configuration.
kjnn_rows = subdata[subdata["A"] == "kj-NN"]
output = kjnn_rows.groupby(["dataset", "A", "scoreId", "p", "c", "j", "k"])[att + times].mean()

# Metrics as percentages and timings divided by 1000, both rounded to two decimals.
output[att] = (output[att] * 100).round(2)
output[times] = (output[times] / 1000).round(2)

# Show the type-B columns.
output[["typeBprecision", "typeBrecall", "B-F1", "typeBr10", "typeBr20"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,typeBprecision,typeBrecall,B-F1,typeBr10,typeBr20
dataset,A,scoreId,p,c,j,k,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
arxiv_15,kj-NN,WKJNN,0.9,0.2,30,30,0.0,0.0,,0.0,0.0
arxiv_25,kj-NN,WKJNN,0.9,0.2,30,30,96.09,88.85,92.33,49.22,88.85
arxiv_35,kj-NN,WKJNN,0.9,0.2,30,30,80.04,86.98,83.36,48.25,83.4
arxiv_45,kj-NN,WKJNN,0.9,0.2,30,30,70.29,86.87,77.7,45.78,76.89
arxiv_51,kj-NN,WKJNN,0.9,0.2,30,30,70.91,88.63,78.79,46.25,78.18
arxiv_52,kj-NN,WKJNN,0.9,0.2,30,30,71.08,88.43,78.81,47.28,78.79
arxiv_53,kj-NN,WKJNN,0.9,0.2,30,30,70.77,88.46,78.63,46.69,78.24
arxiv_54,kj-NN,WKJNN,0.9,0.2,30,30,70.38,88.05,78.23,46.83,77.78
arxiv_55,kj-NN,WKJNN,0.9,0.2,30,30,70.7,88.23,78.5,46.77,78.67
nyt_1,kj-NN,WKJNN,0.9,0.2,30,30,95.93,90.02,92.88,50.19,90.02


In [21]:
# Select the runs with k == 30, j == 30, c == 0.2 and p == 0.9.
# (To also include the j == 0 runs, OR the j condition with masterdata["j"] == 0.)
# .copy() makes subdata an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
subdata = masterdata[
    (masterdata["k"] == 30)
    & (masterdata["j"] == 30)
    & (masterdata["c"] == 0.2)
    & (masterdata["p"] == 0.9)
].copy()

In [22]:
# Divide the cpu column by 1000 and round to two decimals
# (NOTE(review): presumably milliseconds -> seconds -- confirm the unit).
# The value is recomputed from the untouched masterdata rows with the same
# index labels, so re-running this cell does not divide by 1000 a second time.
# This relies on subdata being a row filter of masterdata with its index
# preserved. assign() also avoids writing into a slice (SettingWithCopyWarning).
subdata = subdata.assign(cpu=(masterdata.loc[subdata.index, "cpu"] / 1000).round(2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
# Per-row type-B F1 (harmonic mean of precision and recall); rows whose
# precision + recall is 0 evaluate to 0/0 and therefore hold NaN.
# assign() returns a new frame rather than writing into a slice, which
# avoids the SettingWithCopyWarning of the original column assignment.
subdata = subdata.assign(**{
    "B-F1": 2 * (subdata["typeBrecall"] * subdata["typeBprecision"])
            / (subdata["typeBrecall"] + subdata["typeBprecision"]),
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
# Type-B metrics averaged per (dataset, p, c, j) group, expressed as
# percentages and rounded to two decimals.
att = ["typeBprecision", "typeBrecall", "B-F1", "typeBr10", "typeBr20"]
a = (
    subdata.groupby(["dataset", "p", "c", "j"])[att]
    .mean()
    .mul(100)
    .round(2)
)
a[att]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,typeBprecision,typeBrecall,B-F1,typeBr10,typeBr20
dataset,p,c,j,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
arxiv_15,0.9,0.2,30,0.0,0.0,,0.0,0.0
arxiv_25,0.9,0.2,30,96.09,88.91,92.36,49.22,88.91
arxiv_35,0.9,0.2,30,79.98,86.99,83.33,48.24,83.36
arxiv_45,0.9,0.2,30,70.19,86.88,77.65,45.75,76.79
arxiv_51,0.9,0.2,30,70.86,88.65,78.76,46.22,78.14
arxiv_52,0.9,0.2,30,70.99,88.44,78.76,47.25,78.68
arxiv_53,0.9,0.2,30,70.71,88.49,78.6,46.67,78.22
arxiv_54,0.9,0.2,30,70.29,88.03,78.16,46.81,77.71
arxiv_55,0.9,0.2,30,70.59,88.21,78.42,46.74,78.58
nyt_1,0.9,0.2,30,95.91,90.01,92.87,50.18,90.01


In [25]:
# Per-row type-A F1 (harmonic mean of precision and recall); rows whose
# precision + recall is 0 evaluate to 0/0 and therefore hold NaN.
# assign() returns a new frame rather than writing into a slice, which
# avoids the SettingWithCopyWarning of the original column assignment.
subdata = subdata.assign(**{
    "A-F1": 2 * (subdata["typeArecall"] * subdata["typeAprecision"])
            / (subdata["typeArecall"] + subdata["typeAprecision"]),
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
# Type-A metrics (plus k, ndocs and cpu) averaged per (dataset, p, c, j) group.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple (gb["a", "b"]) is deprecated and raises an error
# in pandas >= 2.0.
subdata.groupby(["dataset", "p", "c", "j"])[[
    "k", "ndocs", "cpu",
    "typeArecall", "typeAprecision", "A-F1",
    "typeAr1", "typeAr2", "typeAr5",
]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,k,ndocs,cpu,typeArecall,typeAprecision,A-F1,typeAr1,typeAr2,typeAr5
dataset,p,c,j,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
arxiv_15,0.9,0.2,30,30,3919,124.765,0.0,0.0,,0.0,0.0,0.0
arxiv_25,0.9,0.2,30,30,4596,151.603,0.38696,0.03878,0.070495,0.10219,0.16306,0.27607
arxiv_35,0.9,0.2,30,30,6233,213.726,0.30323,0.03018,0.054896,0.06452,0.10807,0.20805
arxiv_45,0.9,0.2,30,30,10003,342.264,0.31613,0.0313,0.05696,0.06363,0.09292,0.18887
arxiv_51,0.9,0.2,30,30,12564,377.076,0.25161,0.02486,0.045249,0.0468,0.07822,0.14276
arxiv_52,0.9,0.2,30,30,12564,395.242,0.35162,0.0347,0.063166,0.03872,0.0758,0.19031
arxiv_53,0.9,0.2,30,30,12564,375.692,0.50161,0.04954,0.090174,0.07742,0.15648,0.32178
arxiv_54,0.9,0.2,30,30,12564,315.69,0.37177,0.03668,0.066772,0.05889,0.09836,0.21451
arxiv_55,0.9,0.2,30,30,12564,354.318,0.35403,0.03494,0.063603,0.05567,0.09676,0.20241
nyt_1,0.9,0.2,30,30,10100,210.351,0.776,0.0768,0.139767,0.246,0.389,0.611
