Copyright (C) 2020 Edouard Fouché

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

In [1]:
import math

import matplotlib as mpl
from matplotlib import cm
# Typeset figure text with LaTeX (loading the `libertine` package in the
# preamble) and select the serif font family.
mpl.rcParams.update({
    'text.usetex': True,
    'text.latex.preamble': r'\usepackage{libertine}',
    'font.family': 'serif',
})

import numpy as np
import pandas as pd
import seaborn as sns
import copy

import matplotlib.pyplot as plt
from matplotlib import rc

from cycler import cycler


# Global plotting / display configuration for the notebook.

# Run PostScript output through the xpdf distiller.
mpl.rcParams['ps.usedistiller'] = 'xpdf'

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this
# installation actually provides.
notebook_style = 'seaborn-notebook'
if 'seaborn-v0_8-notebook' in plt.style.available:
    notebook_style = 'seaborn-v0_8-notebook'
plt.style.use(notebook_style)

# Font sizes for titles, axis labels, legend and tick labels.
plt.rcParams['axes.titlesize'] = '25'
plt.rcParams['axes.labelsize'] = '25'
plt.rcParams['legend.fontsize'] = '15'
plt.rcParams['xtick.labelsize'] = '15'
plt.rcParams['ytick.labelsize'] = '15'

# Default property cycle: the 8 "husl" colours repeated twice (16 entries)
# paired element-wise with a marker cycle. The marker cycle is truncated to
# 16 entries because cyclers can only be added when they have equal length.
color_cycle = cycler('color', sns.color_palette("husl", 8)) * 2
marker_cycle = (cycler('marker', ['v', 's', "o"]) * 7)[0:16]
monochrome = color_cycle + marker_cycle
plt.rc('axes', prop_cycle=monochrome)

# Show (almost) complete DataFrames when displaying them.
pd.options.display.max_rows = 999
pd.options.display.max_columns = None

# `cm.get_cmap` is deprecated (and removed in recent Matplotlib releases);
# `plt.get_cmap` is the supported lookup and exists in older versions too.
cmap = plt.get_cmap('RdBu')

In [2]:
# Name of the experiment run directory (looked up under ../experiments/ when
# the results are loaded); change it to the run that should be analysed.
folder = "2020-02-11-21-23_Evaluate_AvgNegCosSim_" # replace

In [3]:
# Load the evaluation results of the selected experiment run into a DataFrame.
results_csv = f"../experiments/{folder}/Evaluate_AvgNegCosSim.csv"
masterdata = pd.read_csv(results_csv)

In [4]:
# Display the column names of the loaded results table.
masterdata.columns

Index(['scoreId', 'dataset', 'k', 'bmId', 'ndocs', 'emb', 'cpuindex', 'cpu',
       'wall', 'typeAauc', 'typeAap', 'typeAr1', 'typeAr2', 'typeAr5',
       'typeAr10', 'typeAr20', 'typeAr30', 'typeAp1', 'typeAp2', 'typeAp5',
       'typeAp10', 'typeAp20', 'typeAp30'],
      dtype='object')

In [5]:
# Display the distinct values of the "k" column present in the results.
masterdata["k"].unique() 

array([    1,     2,     3,     4,     5,     6,     7,     9,    11,
          14,    18,    23,    29,    37,    48,    62,    80,   100,
       10099, 10199, 10499,  1009,  2019,  5049,  3918,  4595,  6232,
       10002, 12563])

In [6]:
# Combined cost per row: the "cpu" column plus the "cpuindex" column.
masterdata["cpu_combo"] = masterdata["cpu"].add(masterdata["cpuindex"])

In [7]:
# Subset of the score columns of `masterdata`.
# NOTE(review): this list is not referenced by any of the cells visible here.
att = ["typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5"]

In [8]:
# For every dataset, show the row with the highest typeAauc among the runs
# with k <= 100.
times = ["cpu", "cpu_combo"]
shown_columns = ["dataset", "k", "typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5", "cpu", "cpu_combo"]

subdata = masterdata[masterdata["k"] <= 100]
best_auc_rows = subdata.groupby(["dataset"])["typeAauc"].idxmax()
a = subdata.loc[best_auc_rows][shown_columns]

# Divide the timing columns by 1000 and keep two decimals
# (presumably milliseconds -> seconds; TODO confirm the unit).
a[times] = a[times].div(1000).round(2)
a

Unnamed: 0,dataset,k,typeAauc,typeAap,typeAr1,typeAr2,typeAr5,cpu,cpu_combo
116,arxiv_15,3,0.6714,0.0198,0.0,0.0769,0.1026,0.08,9.75
133,arxiv_25,1,0.6787,0.02,0.0217,0.0435,0.1087,13.99,25.2
162,arxiv_35,18,0.7343,0.0236,0.0161,0.0484,0.1613,0.54,16.97
171,arxiv_45,1,0.7011,0.0223,0.0101,0.0303,0.1717,67.37,100.25
274,arxiv_51,11,0.5419,0.0168,0.0403,0.0726,0.1129,0.43,35.1
249,arxiv_52,3,0.6626,0.0161,0.0081,0.0323,0.0887,0.2,30.92
228,arxiv_53,1,0.6356,0.0154,0.0242,0.0403,0.0968,72.78,112.75
223,arxiv_54,48,0.6893,0.0183,0.0,0.0161,0.0887,1.77,41.0
192,arxiv_55,3,0.6759,0.0225,0.0323,0.0645,0.1452,0.25,44.1
9,nyt_1,14,0.8389,0.0365,0.03,0.07,0.19,0.61,40.66


In [9]:
# For every dataset, show the row with the highest typeAauc among the runs
# with k > 100.
times = ["cpu", "cpu_combo"]
shown_columns = ["dataset", "k", "typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5", "cpu", "cpu_combo"]

subdata = masterdata[masterdata["k"] > 100]
best_auc_rows = subdata.groupby(["dataset"])["typeAauc"].idxmax()
a = subdata.loc[best_auc_rows][shown_columns]

# Divide the timing columns by 1000 and keep two decimals
# (presumably milliseconds -> seconds; TODO confirm the unit).
a[times] = a[times].div(1000).round(2)
a

Unnamed: 0,dataset,k,typeAauc,typeAap,typeAr1,typeAr2,typeAr5,cpu,cpu_combo
132,arxiv_15,3918,0.4744,0.0095,0.0,0.0,0.0513,63.49,73.15
151,arxiv_25,4595,0.4787,0.0109,0.0217,0.0435,0.0652,92.44,103.65
170,arxiv_35,6232,0.5295,0.0107,0.0,0.0,0.0323,167.66,184.09
189,arxiv_45,10002,0.515,0.0132,0.0101,0.0101,0.0202,460.24,493.12
284,arxiv_51,12563,0.6591,0.0236,0.0565,0.1048,0.1532,439.46,474.13
265,arxiv_52,12563,0.4502,0.0084,0.0,0.0,0.0081,478.85,509.58
246,arxiv_53,12563,0.4754,0.0093,0.0081,0.0161,0.0323,461.4,501.38
227,arxiv_54,12563,0.5046,0.0101,0.0081,0.0081,0.0323,433.79,473.03
208,arxiv_55,12563,0.5169,0.018,0.0161,0.0323,0.0645,505.42,549.27
18,nyt_1,10099,0.6663,0.0257,0.06,0.1,0.21,444.31,484.36


In [14]:
# Per-dataset mean of the score columns and of "cpu", computed over the rows
# currently held in `subdata` (set by whichever filter cell ran last).
#
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names (`gb["a", "b"]`) has been deprecated since pandas 1.0 and raises
# in pandas 2.0.
mean_columns = ["typeAauc", "typeAap", "typeAr1", "typeAr2", "typeAr5", "cpu"]
a = subdata.groupby(["dataset"])[mean_columns].mean()

# Multiply every column by 100 and keep two decimals.
# NOTE(review): "cpu" is scaled by 100 together with the scores, which is
# probably unintended for a timing column -- confirm before relying on it.
a[a.columns] = round(a[a.columns]*100,2)
a

Unnamed: 0_level_0,typeAauc,typeAap,typeAr1,typeAr2,typeAr5,cpu
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arxiv_15,47.44,0.95,0.0,0.0,5.13,6348724.92
arxiv_25,47.87,1.09,2.17,4.35,6.52,9243781.45
arxiv_35,52.95,1.07,0.0,0.0,3.23,16766129.3
arxiv_45,51.5,1.32,1.01,1.01,2.02,46023874.21
arxiv_51,65.91,2.36,5.65,10.48,15.32,43945992.2
arxiv_52,45.02,0.84,0.0,0.0,0.81,47885059.59
arxiv_53,47.54,0.93,0.81,1.61,3.23,46140457.29
arxiv_54,50.46,1.01,0.81,0.81,3.23,43379060.79
arxiv_55,51.69,1.8,1.61,3.23,6.45,50541516.87
nyt_1,66.63,2.57,6.0,10.0,21.0,44431114.77


In [17]:
# Per-dataset mean of the timing columns over the rows in `subdata`,
# divided by 1000 and rounded to two decimals
# (presumably milliseconds -> seconds; TODO confirm the unit).
times = ['cpu', 'cpuindex', 'cpu_combo']
a = subdata.groupby(["dataset"])[times].mean().div(1000).round(2)
a

Unnamed: 0_level_0,cpu,cpuindex,cpu_combo
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arxiv_15,63.49,9.67,73.15
arxiv_25,92.44,11.22,103.65
arxiv_35,167.66,16.43,184.09
arxiv_45,460.24,32.88,493.12
arxiv_51,439.46,34.67,474.13
arxiv_52,478.85,30.72,509.58
arxiv_53,461.4,39.97,501.38
arxiv_54,433.79,39.24,473.03
arxiv_55,505.42,43.86,549.27
nyt_1,444.31,40.05,484.36
