# Create data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({
    'category': ['TV', 'TV',  'TV', 'TV', 'TV',  'SHIRT', 'SHIRT',  'SHIRT'],
    'marca':      [0.99, 0.99, 0.95, 0.5, 0.2, 0.8, 0.7, 0.9],
    'aesthetics': [0.8, 0.7,  0.4, 0.3, 0.6, 0.8, 0.9, 0.7],
    'stars':      [4.2, 4.1, 4.5, 4.0, 3.5, 4.0, 4.3, 3.0],
})

df = df.sort_values("marca", ascending=False).reset_index(drop=True)

df2 = pd.DataFrame({
    'category':   ['TV','TV', 'TV', 'TV', 'TV', 'TV', 'SHIRT', 'SHIRT', 'SHIRT', 'SHIRT', 'SHIRT'],
    'marca':      [0.95, 0.9, 0.90, 0.9, 0.5, 0.5, 0.8, 0.7, 0.9, 0.5, 0.7],
    'aesthetics': [0.8, 0.80, 0.70, 0.4, 0.3, 0.6, 0.8, 0.9, 0.7, 0.5, 0.7],
    'stars':      [5.2, 4.2, 4.1, 4.5, 4.0, 3.5, 4.0, 4.3, 3.0, 3, 0.7],
})

df2 = df2.sort_values("marca", ascending=False).reset_index(drop=True)

signals = ['marca', 'aesthetics', 'stars']
transformed_signals = [f'φ({s})' for s in signals]

# Different ranking strategies

The following ranking functions are considered
```
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        How to rank the group of records that have the same value (i.e. ties):
    
        *🚨 average: average rank of the group
        *🚨 min: lowest rank in the group
        *   max: highest rank in the group
        *   dense: like 'min', but rank always increases by 1 between groups.
```

Let us define `phi(vector)` to the the percentile rank of the vector.

## Understand `.rank`  with `method = "min", pct=True`


-  🚨 The highest `phi(signal)` might not be 1, see `# EXAMPLE UPPER BOUND NOT ONE`


In [3]:
col = "marca"
print(df2[col].values[::-1])
rank_int = df2[col].rank(method="min", pct=False)
rank_int

[0.5  0.5  0.5  0.7  0.7  0.8  0.9  0.9  0.9  0.9  0.95]


0     11.0
1      7.0
2      7.0
3      7.0
4      7.0
5      6.0
6      4.0
7      4.0
8      1.0
9      1.0
10     1.0
Name: marca, dtype: float64

The explanation for rank_ink is the following

- 0.5 has rank_int = 1 = min(1,  2 , 3) 
- 0.7 has rank_int = 4 = min(4, 5)
- 0.8 has rank_int = 6 = min(6)
- 0.9 has rank_int = 7 = min(7, 8, 9, 10)
- 0.95 has rank_int = 11 = min(11)

In [4]:
def find_rank_dict_min(col_values, verbose=False):
    sorted_inds = np.argsort(col_values)
    sorted_vals = col_values[sorted_inds]
    n_values = len(col_values)
        
    rank_dict = {}
    pre_val = 0
    min_val = 0
    prev_val = sorted_vals[0]
    
    for i,val in enumerate(sorted_vals, start=1):        
        if prev_val == val :
            min_val += 1
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, min={min_val} ')    
            rank_dict[val] = min_val 
        else:
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, min={min_val}, rank({prev_val})={min_val}')

            min_val = i 
            rank_dict[val] = min_val 
        
        if i == len(col_values):
            rank_dict[val] = min_val 
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, min={min_val}, rank({prev_val})={min_val}')

        prev_val = val

    return rank_dict

def vector_rank_values(col_values, rank_dict):
    return np.array([rank_dict[x] for x in col_values])

def rank_pct_min_custom_core(col_values, rank_dict):
    rank_dict = find_rank_dict_min(col_values, verbose=False)
    rank_vector = vector_rank_values(col_values, rank_dict)
    percentile_rank_vector = rank_vector/len(col_values)
    return percentile_rank_vector

def rank_pct_min_custom(df, col):
    rank_dict = find_rank_dict_min(df[col].values)
    percentile_rank_vector = rank_pct_min_custom_core(df[col], rank_dict)
    return percentile_rank_vector

Looking at our custom implementation

In [5]:
col_values = df["marca"].values
rank_dict = find_rank_dict_min(col_values, verbose=True)
vector_custom = vector_rank_values(col_values, rank_dict)

print("\nSorted Values and rank_dicts found:")
print(col_values[::-1])
print(vector_custom[::-1])
print(rank_dict)

i=1, prev_val=0.2, val=0.2, min=1 
i=2, prev_val=0.2, val=0.5, min=1, rank(0.2)=1
i=3, prev_val=0.5, val=0.7, min=2, rank(0.5)=2
i=4, prev_val=0.7, val=0.8, min=3, rank(0.7)=3
i=5, prev_val=0.8, val=0.9, min=4, rank(0.8)=4
i=6, prev_val=0.9, val=0.95, min=5, rank(0.9)=5
i=7, prev_val=0.95, val=0.99, min=6, rank(0.95)=6
i=8, prev_val=0.99, val=0.99, min=8 
i=8, prev_val=0.99, val=0.99, min=8, rank(0.99)=8

Sorted Values and rank_dicts found:
[0.2  0.5  0.7  0.8  0.9  0.95 0.99 0.99]
[1 2 3 4 5 6 8 8]
{0.2: 1, 0.5: 2, 0.7: 3, 0.8: 4, 0.9: 5, 0.95: 6, 0.99: 8}


In [6]:
col_values = df2["marca"].values
rank_dict = find_rank_dict_min(col_values, verbose=True)
vector_custom = vector_rank_values(col_values, rank_dict)

print("\nSorted Values and rank_dicts found:")
print(col_values[::-1])
print(vector_custom[::-1])
print(rank_dict)

i=1, prev_val=0.5, val=0.5, min=1 
i=2, prev_val=0.5, val=0.5, min=2 
i=3, prev_val=0.5, val=0.5, min=3 
i=4, prev_val=0.5, val=0.7, min=3, rank(0.5)=3
i=5, prev_val=0.7, val=0.7, min=5 
i=6, prev_val=0.7, val=0.8, min=5, rank(0.7)=5
i=7, prev_val=0.8, val=0.9, min=6, rank(0.8)=6
i=8, prev_val=0.9, val=0.9, min=8 
i=9, prev_val=0.9, val=0.9, min=9 
i=10, prev_val=0.9, val=0.9, min=10 
i=11, prev_val=0.9, val=0.95, min=10, rank(0.9)=10
i=11, prev_val=0.9, val=0.95, min=11, rank(0.9)=11

Sorted Values and rank_dicts found:
[0.5  0.5  0.5  0.7  0.7  0.8  0.9  0.9  0.9  0.9  0.95]
[ 3  3  3  5  5  6 10 10 10 10 11]
{0.5: 3, 0.7: 5, 0.8: 6, 0.9: 10, 0.95: 11}


In [7]:
print(df2[col].values[::-1])
rank_int = df2[col].rank(method="min", pct=False)
rank_int

[0.5  0.5  0.5  0.7  0.7  0.8  0.9  0.9  0.9  0.9  0.95]


0     11.0
1      7.0
2      7.0
3      7.0
4      7.0
5      6.0
6      4.0
7      4.0
8      1.0
9      1.0
10     1.0
Name: marca, dtype: float64

The explanation for rank_ink is the following

- 0.5 has rank_int = 1 = max(1,  2 , 3) 
- 0.7 has rank_int = 4 = max(4, 5)
- 0.8 has rank_int = 6 = max(6)
- 0.9 has rank_int = 7 = max(7, 8, 9, 10)
- 0.95 has rank_int = 11 = max(11)

In [8]:
# EXAMPLE UPPER BOUND NOT ONE
phi_marca_1 = df2["marca"].rank(method="min",pct=True)

rank_int = df2["marca"].rank(method="min",pct=False)
phi_marca_2 = df2["marca"].rank(method="min",pct=False) / len(df["marca"])

pd.DataFrame({"marca":df["marca"] ,"ϕ(marca)_1":phi_marca_1, "rank_int":rank_int, "ϕ(marca)_2":phi_marca_2}).round(3)

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2
0,0.99,1.0,11.0,1.375
1,0.99,0.636,7.0,0.875
2,0.95,0.636,7.0,0.875
3,0.9,0.636,7.0,0.875
4,0.8,0.636,7.0,0.875
5,0.7,0.545,6.0,0.75
6,0.5,0.364,4.0,0.5
7,0.2,0.364,4.0,0.5
8,,0.091,1.0,0.125
9,,0.091,1.0,0.125


## Understand `.rank`  with `method = "max", pct=True`


In [9]:
print(df2[col].values[::-1])
rank_int = df2[col].rank(method="average", pct=False)
rank_int

[0.5  0.5  0.5  0.7  0.7  0.8  0.9  0.9  0.9  0.9  0.95]


0     11.0
1      8.5
2      8.5
3      8.5
4      8.5
5      6.0
6      4.5
7      4.5
8      2.0
9      2.0
10     2.0
Name: marca, dtype: float64

The explanation for rank_ink is the following

- 0.5 has rank_int = 3 = max(1,  2 , 3) 
- 0.7 has rank_int = 5 = max(4, 5)
- 0.8 has rank_int = 6 = max(6)
- 0.9 has rank_int = 10 = max(7, 8, 9, 10)
- 0.95 has rank_int = 11 = max(11)

In [10]:
def find_rank_dict_max(col_values, verbose=False):
    sorted_inds = np.argsort(col_values)
    sorted_vals = col_values[sorted_inds]
    n_values = len(col_values)
        
    rank_dict = {}
    pre_val = 0
    max_val = 0
    prev_val = sorted_vals[0]
    
    for i,val in enumerate(sorted_vals, start=1):        
        if prev_val == val :
            max_val += 1
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, max={max_val} ')    
            rank_dict[val] = max_val 
        else:
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, max={max_val}, rank({prev_val})={max_val}')

            max_val = i 
            rank_dict[val] = max_val 
        
        if i == len(col_values):
            rank_dict[val] = max_val 
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, max={max_val}, rank({prev_val})={max_val}')

        prev_val = val

    return rank_dict

def vector_rank_values(col_values, rank_dict):
    return np.array([rank_dict[x] for x in col_values])

def rank_pct_max_custom_core(col_values, rank_dict):
    rank_dict = find_rank_dict_max(col_values, verbose=False)
    rank_vector = vector_rank_values(col_values, rank_dict)
    percentile_rank_vector = rank_vector/len(col_values)
    return percentile_rank_vector

def rank_pct_max_custom(df, col):
    rank_dict = find_rank_dict_max(df[col].values)
    percentile_rank_vector = rank_pct_max_custom_core(df[col], rank_dict)
    return percentile_rank_vector

Looking at our custom implementation

In [11]:
col_values = df["marca"].values
rank_dict = find_rank_dict_max(col_values, verbose=True)
vector_custom = vector_rank_values(col_values, rank_dict)

print("\nSorted Values and rank_dicts found:")
print(col_values[::-1])
print(vector_custom[::-1])
print(rank_dict)

i=1, prev_val=0.2, val=0.2, max=1 
i=2, prev_val=0.2, val=0.5, max=1, rank(0.2)=1
i=3, prev_val=0.5, val=0.7, max=2, rank(0.5)=2
i=4, prev_val=0.7, val=0.8, max=3, rank(0.7)=3
i=5, prev_val=0.8, val=0.9, max=4, rank(0.8)=4
i=6, prev_val=0.9, val=0.95, max=5, rank(0.9)=5
i=7, prev_val=0.95, val=0.99, max=6, rank(0.95)=6
i=8, prev_val=0.99, val=0.99, max=8 
i=8, prev_val=0.99, val=0.99, max=8, rank(0.99)=8

Sorted Values and rank_dicts found:
[0.2  0.5  0.7  0.8  0.9  0.95 0.99 0.99]
[1 2 3 4 5 6 8 8]
{0.2: 1, 0.5: 2, 0.7: 3, 0.8: 4, 0.9: 5, 0.95: 6, 0.99: 8}


In [12]:
col_values = df2["marca"].values
rank_dict = find_rank_dict_max(col_values, verbose=True)
vector_custom = vector_rank_values(col_values, rank_dict)

print("\nSorted Values and rank_dicts found:")
print(col_values[::-1])
print(vector_custom[::-1])
print(rank_dict)

i=1, prev_val=0.5, val=0.5, max=1 
i=2, prev_val=0.5, val=0.5, max=2 
i=3, prev_val=0.5, val=0.5, max=3 
i=4, prev_val=0.5, val=0.7, max=3, rank(0.5)=3
i=5, prev_val=0.7, val=0.7, max=5 
i=6, prev_val=0.7, val=0.8, max=5, rank(0.7)=5
i=7, prev_val=0.8, val=0.9, max=6, rank(0.8)=6
i=8, prev_val=0.9, val=0.9, max=8 
i=9, prev_val=0.9, val=0.9, max=9 
i=10, prev_val=0.9, val=0.9, max=10 
i=11, prev_val=0.9, val=0.95, max=10, rank(0.9)=10
i=11, prev_val=0.9, val=0.95, max=11, rank(0.9)=11

Sorted Values and rank_dicts found:
[0.5  0.5  0.5  0.7  0.7  0.8  0.9  0.9  0.9  0.9  0.95]
[ 3  3  3  5  5  6 10 10 10 10 11]
{0.5: 3, 0.7: 5, 0.8: 6, 0.9: 10, 0.95: 11}


#### Verify custom implementation is the same as pandas one

In [13]:
phi_marca_1 = df["marca"].rank(method="max",pct=True)

rank_int = df["marca"].rank(method="max",pct=False)
phi_marca_2 = df["marca"].rank(method="max",pct=False) / len(df["marca"])

phi_custom = rank_pct_max_custom(df, col)

df1_max = pd.DataFrame({"marca":df["marca"],
              "ϕ(marca)_1":phi_marca_1,
              "rank_int":rank_int,
              "ϕ(marca)_2":phi_marca_2,
              "ϕ(marca)_custom":phi_custom,              
             }).round(3)

df1_max

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2,ϕ(marca)_custom
0,0.99,1.0,8.0,1.0,1.0
1,0.99,1.0,8.0,1.0,1.0
2,0.95,0.75,6.0,0.75,0.75
3,0.9,0.625,5.0,0.625,0.625
4,0.8,0.5,4.0,0.5,0.5
5,0.7,0.375,3.0,0.375,0.375
6,0.5,0.25,2.0,0.25,0.25
7,0.2,0.125,1.0,0.125,0.125


In [14]:
phi_marca_1 = df2["marca"].rank(method="max",pct=True)

rank_int = df2["marca"].rank(method="max",pct=False)
phi_marca_2 = df2["marca"].rank(method="max",pct=False) / len(df2["marca"])

phi_custom = rank_pct_max_custom(df2, col)

df2_max = pd.DataFrame({"marca":df["marca"],
              "ϕ(marca)_1":phi_marca_1,
              "rank_int":rank_int,
              "ϕ(marca)_2":phi_marca_2,
              "ϕ(marca)_custom":phi_custom,              
             }).round(3)

df2_max

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2,ϕ(marca)_custom
0,0.99,1.0,11.0,1.0,1.0
1,0.99,0.909,10.0,0.909,0.909
2,0.95,0.909,10.0,0.909,0.909
3,0.9,0.909,10.0,0.909,0.909
4,0.8,0.909,10.0,0.909,0.909
5,0.7,0.545,6.0,0.545,0.545
6,0.5,0.455,5.0,0.455,0.455
7,0.2,0.455,5.0,0.455,0.455
8,,0.273,3.0,0.273,0.273
9,,0.273,3.0,0.273,0.273


In [15]:
df2

Unnamed: 0,category,marca,aesthetics,stars
0,TV,0.95,0.8,5.2
1,TV,0.9,0.8,4.2
2,TV,0.9,0.7,4.1
3,TV,0.9,0.4,4.5
4,SHIRT,0.9,0.7,3.0
5,SHIRT,0.8,0.8,4.0
6,SHIRT,0.7,0.9,4.3
7,SHIRT,0.7,0.7,0.7
8,TV,0.5,0.3,4.0
9,TV,0.5,0.6,3.5


## Understand  `.rank`  with `method = "dense", pct=True`


The rank function in pandas with `method="dense", pct=True` is the same as with `method="dense",pct=False` but dividing by the number of unique elements in the dataframe

In [16]:
phi_marca_1 = df["marca"].rank(method="dense",pct=True)
rank_int = df["marca"].rank(method="dense",pct=False)
phi_marca_2 = rank_int / len(np.unique(df["marca"]))

df1_dense = pd.DataFrame({"marca":df["marca"],
              "ϕ(marca)_1":phi_marca_1,
              "rank_int":rank_int, 
              "ϕ(marca)_2":phi_marca_2}).round(3)

df1_dense

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2
0,0.99,1.0,7.0,1.0
1,0.99,1.0,7.0,1.0
2,0.95,0.857,6.0,0.857
3,0.9,0.714,5.0,0.714
4,0.8,0.571,4.0,0.571
5,0.7,0.429,3.0,0.429
6,0.5,0.286,2.0,0.286
7,0.2,0.143,1.0,0.143


In [17]:
phi_marca_1 = df2["marca"].rank(method="dense", pct=True)
rank_int = df2["marca"].rank(method="dense", pct=False)
phi_marca_2 = rank_int / len(np.unique(df2["marca"]))

df2_dense = pd.DataFrame({"marca":df2["marca"],
              "ϕ(marca)_1":phi_marca_1,
              "rank_int":rank_int,
              "ϕ(marca)_2":phi_marca_2}).round(3)
df2_dense

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2
0,0.95,1.0,5.0,1.0
1,0.9,0.8,4.0,0.8
2,0.9,0.8,4.0,0.8
3,0.9,0.8,4.0,0.8
4,0.9,0.8,4.0,0.8
5,0.8,0.6,3.0,0.6
6,0.7,0.4,2.0,0.4
7,0.7,0.4,2.0,0.4
8,0.5,0.2,1.0,0.2
9,0.5,0.2,1.0,0.2


## Understand  `.rank`  with `method = "average", pct=True`

Note that by default if method = "average".


Note:


-  🚨 The highest `phi(signal)` might not be 1, see `# EXAMPLE UPPER BOUND NOT ONE`


In [18]:
def rank_pct_average(df, column):
    """
    Rank the values in a specified column of a DataFrame, using average ranking for ties
    and returning the percentile rank of each entry.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        column (str): The column name to rank.

    Returns:
        pd.Series: A series with percentile ranks.
    """
    ranks = df[column].rank(method="average") / len(df)
    return ranks

#### Rank average from scratch

Let us start looking at the output values of `.rank(method="average", pct=False)`

In [19]:
print(df2[col].values[::-1])
rank_int = df2[col].rank(method="average", pct=False)
rank_int

[0.5  0.5  0.5  0.7  0.7  0.8  0.9  0.9  0.9  0.9  0.95]


0     11.0
1      8.5
2      8.5
3      8.5
4      8.5
5      6.0
6      4.5
7      4.5
8      2.0
9      2.0
10     2.0
Name: marca, dtype: float64

The explanation for rank_ink is the following

- 0.5 has rank_int = 2 = (1 + 2 + 3)/3
- 0.7 has rank_int = 4.5 =(4 + 5)/2
- 0.8 has rank_int = 6 = 6/1
- 0.9 has rank_int = 8.5 = (7 + 8 + 9 + 10)/4
- 0.95 has rank_int = 11 = 11/1

In [20]:
def find_rank_dict_average(col_values, verbose=False):
    sorted_inds = np.argsort(col_values)
    sorted_vals = col_values[sorted_inds]
    n_values = len(col_values)
    
    result = np.zeros(n_values)
    ranks = np.zeros(n_values)
    
    rank_dict = {}
    pre_val = 0
    acum = 0
    counter = 0
    prev_val = sorted_vals[0]
    for i,val in enumerate(sorted_vals, start=1):        
        
        if prev_val == val :
            acum += i
            counter += 1
            
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, acum={acum}, counter={counter} ')
    
        else:
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, acum={acum}, counter={counter}, rank({prev_val})={acum/counter}')

            if counter !=0:
                rank_dict[prev_val] = acum / counter
            counter = 1
            acum = i
            
        if i == len(sorted_vals):
            rank_dict[val] = acum / counter
            if verbose:
                print(f'i={i}, prev_val={prev_val}, val={val}, acum={acum}, counter={counter}, rank({prev_val})={acum/counter}')
    
        prev_val = val

    return rank_dict

def vector_rank_values(col_values, rank_dict):
    return np.array([rank_dict[x] for x in col_values])

def rank_pct_average_custom_core(col_values, rank_dict):
    rank_dict = find_rank_dict_average(col_values, verbose=False)
    rank_vector = vector_rank_values(col_values, rank_dict)
    percentile_rank_vector = rank_vector/len(col_values)
    return percentile_rank_vector

def rank_pct_average_custom(df, col):
    rank_dict = find_rank_dict_average(df[col].values)
    percentile_rank_vector = rank_pct_average_custom_core(df[col], rank_dict)
    return percentile_rank_vector

In [21]:
col_values = df["marca"].values
rank_dict = find_rank_dict_average(col_values, verbose=True)

i=1, prev_val=0.2, val=0.2, acum=1, counter=1 
i=2, prev_val=0.2, val=0.5, acum=1, counter=1, rank(0.2)=1.0
i=3, prev_val=0.5, val=0.7, acum=2, counter=1, rank(0.5)=2.0
i=4, prev_val=0.7, val=0.8, acum=3, counter=1, rank(0.7)=3.0
i=5, prev_val=0.8, val=0.9, acum=4, counter=1, rank(0.8)=4.0
i=6, prev_val=0.9, val=0.95, acum=5, counter=1, rank(0.9)=5.0
i=7, prev_val=0.95, val=0.99, acum=6, counter=1, rank(0.95)=6.0
i=8, prev_val=0.99, val=0.99, acum=15, counter=2 
i=8, prev_val=0.99, val=0.99, acum=15, counter=2, rank(0.99)=7.5


In [22]:
pd.Series(rank_dict)

0.20    1.0
0.50    2.0
0.70    3.0
0.80    4.0
0.90    5.0
0.95    6.0
0.99    7.5
dtype: float64

We can see that both implementations are actually the same

In [23]:
col = "marca"
phi_marca_1 = df[col].rank(method="average", pct=True)
phi_marca_custom = rank_pct_average(df, col)

pd.DataFrame({"marca": df["marca"], 
              "ϕ(marca)_1": phi_marca_1,
              "ϕ(marca)_custom": phi_marca_custom }).round(3)

Unnamed: 0,marca,ϕ(marca)_1,ϕ(marca)_custom
0,0.99,0.938,0.938
1,0.99,0.938,0.938
2,0.95,0.75,0.75
3,0.9,0.625,0.625
4,0.8,0.5,0.5
5,0.7,0.375,0.375
6,0.5,0.25,0.25
7,0.2,0.125,0.125


In [24]:
col = "marca"
phi_marca_1 = df[col].rank(method="average", pct=True)
rank_int = df[col].rank(method="average", pct=False) 
phi_marca_2 = rank_pct_average(df, col)
phi_marca_custom = rank_pct_average_custom(df, col)

pd.DataFrame({"marca":df["marca"],
              "ϕ(marca)_1":phi_marca_1,
              "rank_int":rank_int,
              "ϕ(marca)_2":phi_marca_2,
              "ϕ(marca)_custom":phi_marca_custom }).round(3)

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2,ϕ(marca)_custom
0,0.99,0.938,7.5,0.938,0.938
1,0.99,0.938,7.5,0.938,0.938
2,0.95,0.75,6.0,0.75,0.75
3,0.9,0.625,5.0,0.625,0.625
4,0.8,0.5,4.0,0.5,0.5
5,0.7,0.375,3.0,0.375,0.375
6,0.5,0.25,2.0,0.25,0.25
7,0.2,0.125,1.0,0.125,0.125


In [25]:
col = "marca"
phi_marca_1 = df2[col].rank(method="average", pct=True)
rank_int = df2[col].rank(method="average", pct=False) 
phi_marca_2 = rank_pct_average(df2, col)
phi_marca_custom = rank_pct_average_custom(df2, col)

pd.DataFrame({"marca":df2["marca"] ,
              "ϕ(marca)_1":phi_marca_1,
              "rank_int":rank_int,
              "ϕ(marca)_2":phi_marca_2,
              "ϕ(marca)_custom":phi_marca_custom }).round(3)

Unnamed: 0,marca,ϕ(marca)_1,rank_int,ϕ(marca)_2,ϕ(marca)_custom
0,0.95,1.0,11.0,1.0,1.0
1,0.9,0.773,8.5,0.773,0.773
2,0.9,0.773,8.5,0.773,0.773
3,0.9,0.773,8.5,0.773,0.773
4,0.9,0.773,8.5,0.773,0.773
5,0.8,0.545,6.0,0.545,0.545
6,0.7,0.409,4.5,0.409,0.409
7,0.7,0.409,4.5,0.409,0.409
8,0.5,0.182,2.0,0.182,0.182
9,0.5,0.182,2.0,0.182,0.182


Now let us see the equivalence with df2

In [26]:
rank_int = df2[col].rank(method="average", pct=False) 
rank_int

0     11.0
1      8.5
2      8.5
3      8.5
4      8.5
5      6.0
6      4.5
7      4.5
8      2.0
9      2.0
10     2.0
Name: marca, dtype: float64

In [27]:
col_values = df2["marca"].values
rank_dict = find_rank_dict_average(col_values, verbose=True)

i=1, prev_val=0.5, val=0.5, acum=1, counter=1 
i=2, prev_val=0.5, val=0.5, acum=3, counter=2 
i=3, prev_val=0.5, val=0.5, acum=6, counter=3 
i=4, prev_val=0.5, val=0.7, acum=6, counter=3, rank(0.5)=2.0
i=5, prev_val=0.7, val=0.7, acum=9, counter=2 
i=6, prev_val=0.7, val=0.8, acum=9, counter=2, rank(0.7)=4.5
i=7, prev_val=0.8, val=0.9, acum=6, counter=1, rank(0.8)=6.0
i=8, prev_val=0.9, val=0.9, acum=15, counter=2 
i=9, prev_val=0.9, val=0.9, acum=24, counter=3 
i=10, prev_val=0.9, val=0.9, acum=34, counter=4 
i=11, prev_val=0.9, val=0.95, acum=34, counter=4, rank(0.9)=8.5
i=11, prev_val=0.9, val=0.95, acum=11, counter=1, rank(0.9)=11.0


In [28]:
vector_rank_values(df2[col], rank_dict) / len(df2)

array([1.        , 0.77272727, 0.77272727, 0.77272727, 0.77272727,
       0.54545455, 0.40909091, 0.40909091, 0.18181818, 0.18181818,
       0.18181818])