## Rank Rows
* `rank`: Select the max row per group. Or the min.<br>
   Or mark the rows instead of selecting them. 

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

In [3]:
import numpy      as np
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

import time
from pandas._testing import assert_frame_equal

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 10.8 s


In [3]:
# Create test data
# Grouping columns that the examples can use:
#   cols_group = ['AA']     # Very few very large groups
#   cols_group = ['A']      # Few large groups
#   cols_group = ['B']      # Many small groups

np.random.seed(10)  # fixed seed for NumPy's global random generator
anz = 100000 # size argument handed to pak.zufallsdaten below
df = pak.zufallsdaten(anz)
df['AA'] = df.A.apply(lambda x: int(x/10)*10 )  # int(x/10)*10, e.g. 33 -> 30: a coarser grouping key than A
df['AA'] = pak.change_datatype(df['AA'] )  # NOTE(review): presumably picks a more suitable dtype for AA -- confirm in pandasklar
df = pak.move_cols(df,'AA')  # in the displayed frame AA is the first column afterwards
df = pak.rename_col(df,'C','other_data')
df = pak.rename_col(df,'D','score')  # 'score' is the column that gets ranked in the cells below
df
#pak.analyse_cols(df)

Unnamed: 0,AA,A,B,other_data,score,E
0,40,40,97438,0.369,9.97,0.000000e+00
1,30,33,87940,0.186,5.87,5.091812e-09
2,30,32,95268,0.063,3.27,8.532549e-09
3,20,27,61350,0.551,5.84,2.380759e-08
4,30,33,32778,0.393,1.92,3.068985e-08
...,...,...,...,...,...,...
99995,30,33,52307,0.320,8.20,1.000000e+00
99996,30,32,17225,0.366,6.87,1.000000e+00
99997,30,31,72053,0.583,4.13,1.000000e+00
99998,20,29,97320,0.443,7.72,1.000000e+00


time: 12.9 s


## rank()

In [8]:
# IPython help syntax: show the signature and docstring of pak.rank.
?pak.rank

time: 90.3 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mrank[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_score[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcols_group[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind[0m[0;34m=[0m[0;34m'max'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_target[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon_conflict[0m[0;34m=[0m[0;34m'first'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Select the max row per group. Or the min.
Or mark the rows instead of selecting them. 
* cols_group:   Name of the columns to be grouped by.
* col_score:    Name of the column whose minimum or maximum is to be found.
* find:         'min' or 'max', default is 'max'.
* col_target:   Should a ranking column be added? If yes, then give the name here.
                If col_target is given, then the dataframe 

In [10]:
# No grouping: pick the ONE row holding the maximum score
ranked = pak.rank(
    df,
    col_score='score',
)
ranked

Unnamed: 0,AA,A,B,other_data,score,E
567,20,26,54706,0.479,9.99,0.000547


time: 51.9 ms


In [11]:
# Grouped by AA: pick the row with the maximum score of each group
ranked = pak.rank(
    df,
    col_score='score',
    cols_group=['AA'],
)
ranked

Unnamed: 0,AA,A,B,other_data,score,E
567,20,26,54706,0.479,9.99,0.000547
2072,30,36,48311,0.574,9.99,0.007293
2298,40,40,55534,0.408,9.99,0.008966


time: 63.8 ms


In [12]:
# Grouped by AA: pick the row with the minimum score of each group
ranked = pak.rank(
    df,
    col_score='score',
    cols_group=['AA'],
    find='min',
)
ranked

Unnamed: 0,AA,A,B,other_data,score,E
886,30,32,12017,0.863,0.0,0.001337
1058,20,25,63125,0.128,0.0,0.001908
50593,40,40,78253,0.662,0.0,0.36644


time: 73.7 ms


### mark instead of select

In [18]:
# Instead of selecting the top rows, we can mark all rows by their rank.
# If no group is given, the one with max score in the whole DataFrame gets rank 1.
# Here no group is given, so the ranking runs over the whole DataFrame.

df = pak.rank( df, col_score='score', col_target='rank')
df.sort_values(['rank'])

Unnamed: 0,AA,A,B,other_data,score,E,rank
567,20,26,54706,0.479,9.99,0.000547,1
2072,30,36,48311,0.574,9.99,0.007293,2
2298,40,40,55534,0.408,9.99,0.008966,3
3865,20,20,21622,0.781,9.99,0.025184,4
5834,20,28,24588,0.672,9.99,0.056606,5
...,...,...,...,...,...,...,...
95513,30,30,74955,0.691,0.00,0.976408,99996
96222,30,32,85761,0.432,0.00,0.983219,99997
98138,30,32,30890,0.036,0.00,0.995898,99998
99439,30,37,83082,0.964,0.00,0.999627,99999


time: 89.6 ms


In [19]:
# Instead of selecting the top rows, we can mark all rows by their rank.
# The one with max score in the group gets rank 1.
# This time we group by A.

df = pak.rank( df, cols_group=['A'], col_score='score', col_target='rank')
df.sort_values(['A','score'], ascending=[True,False])

Unnamed: 0,AA,A,B,other_data,score,E,rank
3865,20,20,21622,0.781,9.99,0.025184,1
566,20,20,94494,0.075,9.98,0.000547,2
77384,20,20,52557,0.081,9.98,0.563056,3
44876,20,20,76498,0.624,9.97,0.364598,4
35531,20,20,82368,0.454,9.96,0.350346,5
...,...,...,...,...,...,...,...
44926,40,42,31006,0.253,0.11,0.364608,681
56713,40,42,25721,0.053,0.11,0.367975,682
46278,40,42,26719,0.590,0.06,0.364954,683
68928,40,42,14264,0.344,0.04,0.396120,684


time: 112 ms


### Deal with ambiguous results 

In [None]:
# Many records of this dataset share the maximum score; show a few of them for AA == 20.
mask = (df['AA'] == 20) & (df['score'] >= 9.99)
df.loc[mask].head(5)

In [None]:
# on_conflict is not passed here, so its default 'first' applies.
df = pak.rank(
    df,
    col_score='score',
    cols_group=['AA'],
    col_target='rank',
)
df.sort_values(by=['AA', 'score'], ascending=[True, False])

In [None]:
# If you sort the dataframe beforehand, you get secondary ranking criteria.
# In this example the ranking criteria are score and B.
df = pak.rank( df.sort_values('B'), cols_group=['AA'], col_score='score', col_target='rank')
df.sort_values(['AA','score'], ascending=[True,False])

In [None]:
# Rank inside each A group, with on_conflict='average'
df = pak.rank(
    df,
    col_score='score',
    cols_group=['A'],
    col_target='rank',
    on_conflict='average',
)
a = df.sort_values(by=['A', 'score'], ascending=[True, False])
a

In [None]:
# Rank inside each A group, with on_conflict='min'
df = pak.rank(
    df,
    col_score='score',
    cols_group=['A'],
    col_target='rank',
    on_conflict='min',
)
a = df.sort_values(by=['A', 'score'], ascending=[True, False])
a