Copyright (C) 2020 Edouard Fouché

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

In [1]:
import math

import matplotlib as mpl
from matplotlib import cm
mpl.rcParams['text.usetex'] = True 
mpl.rcParams['text.latex.preamble'] = r'\usepackage{libertine}' 
mpl.rc('font', family='serif')

import numpy as np
import pandas as pd
import seaborn as sns
import copy

import matplotlib.pyplot as plt
from matplotlib import rc

from cycler import cycler

mpl.rcParams['ps.usedistiller'] = 'xpdf' 
plt.style.use('seaborn-notebook')

plt.rcParams['axes.titlesize'] = '25'
plt.rcParams['axes.labelsize'] = '25'
plt.rcParams['legend.fontsize'] = '15'
plt.rcParams['xtick.labelsize'] = '15'
plt.rcParams['ytick.labelsize'] = '15'

monochrome=(cycler('color', sns.color_palette("husl", 8))*2+(cycler('marker', ['v', 's', "o"])*7)[0:16])
plt.rc('axes', prop_cycle=monochrome)

pd.options.display.max_rows = 999
pd.options.display.max_columns = None
cmap = cm.get_cmap('RdBu')

In [2]:
folder = "2020-02-11-17-22_Evaluate_KJNN_" # replace

In [3]:
masterdata = pd.read_csv("../experiments/" + folder + "/Evaluate_KJNN.csv")

In [4]:
masterdata.columns

Index(['scoreId', 'dataset', 'bmId', 'k', 'j', 'nwords', 'ndocs', 'emb', 'cpu',
       'wall', 'cpurep', 'cpuindex', 'cpucache', 'c', 'p', 'entropythreshold',
       'miss', 'typeBrecall', 'typeBprecision', 'typeArecall',
       'typeAprecision', 'typeAauc', 'typeAap', 'typeBr1', 'typeBr2',
       'typeBr5', 'typeBr10', 'typeBr20', 'typeBr30', 'typeBp1', 'typeBp2',
       'typeBp5', 'typeBp10', 'typeBp20', 'typeBp30', 'typeBr1N', 'typeBr2N',
       'typeBr5N', 'typeBr10N', 'typeBr20N', 'typeBr30N', 'typeBp1N',
       'typeBp2N', 'typeBp5N', 'typeBp10N', 'typeBp20N', 'typeBp30N',
       'typeAr1', 'typeAr2', 'typeAr5', 'typeAr10', 'typeAr20', 'typeAr30',
       'typeAp1', 'typeAp2', 'typeAp5', 'typeAp10', 'typeAp20', 'typeAp30',
       'avgmiss', 'stdmiss', 'avgmatch', 'stdmatch', 'rep'],
      dtype='object')

In [5]:
masterdata["p"].unique() 

array([ 0.  ,  0.01,  0.02,  0.05,  0.1 ,  0.2 ,  0.3 ,  0.4 ,  0.5 ,
        0.6 ,  0.7 ,  0.8 ,  0.9 ,  0.95,  0.98,  1.  ])

In [6]:
masterdata["rep"].unique() 

array([1, 3, 2, 5, 4])

In [7]:
masterdata["k"].unique() 

array([30])

In [8]:
subdata = masterdata[(masterdata["k"] == 30) & (masterdata["j"] == 30) 
                    & (masterdata["c"] == 0.2)& 
                     ([x in [0.9] for x in masterdata["p"]])]

In [9]:
subdata["cpu_combo"] = subdata['cpu'] + subdata['cpuindex'] + subdata['cpucache']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
att = ["typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5"]

In [11]:
a = subdata.groupby(["dataset", "p", "c", "j"])["typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5", "cpu"].mean()
a[a.columns] = round(a[a.columns]*100,2)
a

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,typeAauc,typeAap,typeAr1,typeAr2,typeAr5,cpu
dataset,p,c,j,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
arxiv_15,0.9,0.2,30,54.21,1.17,0.0,0.0,0.0,5585520.64
arxiv_25,0.9,0.2,30,70.61,4.38,6.96,14.35,26.96,6743119.38
arxiv_35,0.9,0.2,30,72.91,3.19,5.16,9.35,20.97,9246284.39
arxiv_45,0.9,0.2,30,68.6,3.6,6.06,10.1,19.39,14538948.49
arxiv_51,0.9,0.2,30,63.52,2.12,3.39,7.26,17.9,18404953.02
arxiv_52,0.9,0.2,30,77.82,3.0,3.23,7.26,19.68,18333612.72
arxiv_53,0.9,0.2,30,78.05,4.24,6.29,12.74,28.06,18459928.68
arxiv_54,0.9,0.2,30,76.44,3.31,3.87,9.52,19.03,18472948.32
arxiv_55,0.9,0.2,30,76.76,3.35,5.65,10.32,21.94,18342332.98
nyt_1,0.9,0.2,30,92.7,20.31,29.6,43.0,66.6,12187760.68


In [12]:
times = ['cpu', 'cpurep', 'cpuindex', 'cpucache', 'cpu_combo']
a = subdata.groupby(["dataset", "p", "c", "j"])[times].mean()
a[times] = round(a[times] / 1000,2)
a

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu,cpurep,cpuindex,cpucache,cpu_combo
dataset,p,c,j,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
arxiv_15,0.9,0.2,30,55.86,26.46,15.62,8.36,79.84
arxiv_25,0.9,0.2,30,67.43,38.19,21.4,13.26,102.09
arxiv_35,0.9,0.2,30,92.46,57.47,25.17,25.11,142.75
arxiv_45,0.9,0.2,30,145.39,62.49,48.9,76.32,270.61
arxiv_51,0.9,0.2,30,184.05,142.94,60.26,119.8,364.11
arxiv_52,0.9,0.2,30,183.34,141.97,58.32,118.95,360.6
arxiv_53,0.9,0.2,30,184.6,144.38,63.8,119.82,368.22
arxiv_54,0.9,0.2,30,184.73,142.15,60.5,120.23,365.47
arxiv_55,0.9,0.2,30,183.42,142.98,66.87,119.91,370.21
nyt_1,0.9,0.2,30,121.88,2307.14,109.74,143.29,374.91


In [13]:
subdata = masterdata[(masterdata["k"] == 30) & ((masterdata["j"] == 30) | (masterdata["j"] == 0))
                    & (masterdata["c"] == 0.2)& 
                     ([x in [0.9, 1.0] for x in masterdata["p"]])]

In [14]:
subdata["cpu"] = round(subdata["cpu"] / 1000,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
att = [ "typeBrecall", "typeBprecision", "typeBr1", "typeBr2", "typeBr5", "typeBr10", "typeBr20",
       "typeArecall", "typeAprecision", "typeAr1", "typeAr2", "typeAr5"]
subdata[att] = round(subdata[att]*100,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [16]:
subdata["B-F1"] = 2* (subdata["typeBrecall"]*subdata["typeBprecision"]) / (subdata["typeBrecall"]+subdata["typeBprecision"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
subdata.groupby(["dataset", "p", "c", "j"])[
                                       "typeBrecall", "typeBprecision", "B-F1",
                                            "typeBr1", "typeBr2", "typeBr5", "typeBr10", "typeBr20",
                                         "typeArecall", "typeAprecision", 
                                            "typeAr1", "typeAr2", "typeAr5"].max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,typeBrecall,typeBprecision,B-F1,typeBr1,typeBr2,typeBr5,typeBr10,typeBr20,typeArecall,typeAprecision,typeAr1,typeAr2,typeAr5
dataset,p,c,j,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
arxiv_15,0.9,0.2,30,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arxiv_15,1.0,0.2,30,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arxiv_25,0.9,0.2,30,90.12,96.36,93.135598,4.96,9.84,24.78,49.56,90.12,43.48,4.36,13.04,21.74,34.78
arxiv_25,1.0,0.2,30,98.57,91.78,94.797425,4.96,9.84,24.78,49.56,96.27,0.0,0.0,0.0,0.0,0.0
arxiv_35,0.9,0.2,30,88.23,81.1,83.979033,5.03,10.06,24.92,48.99,84.17,37.1,3.69,8.06,12.9,24.19
arxiv_35,1.0,0.2,30,95.96,73.59,83.299279,5.03,10.06,24.92,48.99,84.17,0.0,0.0,0.0,0.0,0.0
arxiv_45,0.9,0.2,30,88.01,70.63,78.306362,4.93,9.78,24.23,46.46,77.88,32.32,3.2,7.07,12.12,25.25
arxiv_45,1.0,0.2,30,95.97,64.22,76.80869,4.93,9.78,24.23,46.46,77.88,0.0,0.0,0.0,0.0,0.0
arxiv_51,0.9,0.2,30,89.37,71.7,79.534971,4.93,9.89,24.09,46.64,79.42,33.06,3.26,4.03,9.68,24.19
arxiv_51,1.0,0.2,30,97.55,64.42,77.545923,4.93,9.89,24.09,46.64,79.42,0.0,0.0,0.0,0.0,0.0


In [18]:
subdata.groupby(["dataset", "p", "c", "j"])["k", "ndocs", "cpu", 
                                            "typeArecall", "typeAprecision", 
                                       "typeAr1", "typeAr2", "typeAr5"].max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,k,ndocs,cpu,typeArecall,typeAprecision,typeAr1,typeAr2,typeAr5
dataset,p,c,j,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
arxiv_15,0.9,0.2,30,30,3919,57.22,0.0,0.0,0.0,0.0,0.0
arxiv_15,1.0,0.2,30,30,3919,57.22,0.0,0.0,0.0,0.0,0.0
arxiv_25,0.9,0.2,30,30,4596,68.62,43.48,4.36,13.04,21.74,34.78
arxiv_25,1.0,0.2,30,30,4596,68.62,0.0,0.0,0.0,0.0,0.0
arxiv_35,0.9,0.2,30,30,6233,93.98,37.1,3.69,8.06,12.9,24.19
arxiv_35,1.0,0.2,30,30,6233,93.98,0.0,0.0,0.0,0.0,0.0
arxiv_45,0.9,0.2,30,30,10003,149.07,32.32,3.2,7.07,12.12,25.25
arxiv_45,1.0,0.2,30,30,10003,149.07,0.0,0.0,0.0,0.0,0.0
arxiv_51,0.9,0.2,30,30,12564,186.55,33.06,3.26,4.03,9.68,24.19
arxiv_51,1.0,0.2,30,30,12564,186.55,0.0,0.0,0.0,0.0,0.0
