In [1]:
import pandas as pd
import numpy as np

This tutorial is based on this Stackoverflow entry https://stackoverflow.com/questions/54357300/bayesian-averaging-in-a-dataframe

In [2]:
d = {
    'Bar': ['Snickers', 'Mars Bars', 'Milky Way', 'Almond Joy', 'Babe Ruth'],
    'User1': [0.01, 0.25, 0.9, np.nan, 0.5],
    'User2': [np.nan, 0.4, 1.0, np.nan, 0.1],
    'User3': [0.7, 0.1, np.nan, np.nan, 0.3]
}

df = pd.DataFrame(data=d)

In [3]:
df

Unnamed: 0,Bar,User1,User2,User3
0,Snickers,0.01,,0.7
1,Mars Bars,0.25,0.4,0.1
2,Milky Way,0.9,1.0,
3,Almond Joy,,,
4,Babe Ruth,0.5,0.1,0.3


Create a list of all columns that had user reviews:

In [4]:
user_cols = []
for col in df.columns.values:
    if 'User' in col:
        user_cols.append(col)

In [5]:
user_cols

['User1', 'User2', 'User3']

1. Calculate the value of v for each bar

In [6]:
df['v'] = df[user_cols].count(axis=1)

In [7]:
df

Unnamed: 0,Bar,User1,User2,User3,v
0,Snickers,0.01,,0.7,2
1,Mars Bars,0.25,0.4,0.1,3
2,Milky Way,0.9,1.0,,2
3,Almond Joy,,,,0
4,Babe Ruth,0.5,0.1,0.3,3


2. Calculate the value of m (equals 2.0 in this example):

In [8]:
m = np.mean(df['v'])

In [9]:
m

2.0

3. Calculate the value of w for each bar:

In [10]:
df['w'] = df['v']/(df['v'] + m)

In [11]:
df

Unnamed: 0,Bar,User1,User2,User3,v,w
0,Snickers,0.01,,0.7,2,0.5
1,Mars Bars,0.25,0.4,0.1,3,0.6
2,Milky Way,0.9,1.0,,2,0.5
3,Almond Joy,,,,0,0.0
4,Babe Ruth,0.5,0.1,0.3,3,0.6


4. And calculate the value of R for each bar:

In [12]:
df['R'] = np.mean(df[user_cols], axis=1)

In [13]:
df

Unnamed: 0,Bar,User1,User2,User3,v,w,R
0,Snickers,0.01,,0.7,2,0.5,0.355
1,Mars Bars,0.25,0.4,0.1,3,0.6,0.25
2,Milky Way,0.9,1.0,,2,0.5,0.95
3,Almond Joy,,,,0,0.0,
4,Babe Ruth,0.5,0.1,0.3,3,0.6,0.3


5. Finally, get the value of C (equals 0.426 in this example):

In [14]:
C = np.nanmean(df[user_cols].values.flatten())

In [15]:
test = df[user_cols].values.flatten()

In [21]:
test

array([0.01,  nan, 0.7 , 0.25, 0.4 , 0.1 , 0.9 , 1.  ,  nan,  nan,  nan,
        nan, 0.5 , 0.1 , 0.3 ])

In [16]:
C

0.426

And now we're ready to calculate the Bayesian Average score, S, for each candy bar:

In [17]:
df['S'] = df['w']*df['R'] + (1 - df['w'])*C

In [18]:
df

Unnamed: 0,Bar,User1,User2,User3,v,w,R,S
0,Snickers,0.01,,0.7,2,0.5,0.355,0.3905
1,Mars Bars,0.25,0.4,0.1,3,0.6,0.25,0.3204
2,Milky Way,0.9,1.0,,2,0.5,0.95,0.688
3,Almond Joy,,,,0,0.0,,
4,Babe Ruth,0.5,0.1,0.3,3,0.6,0.3,0.3504


Where the final column S contains all the S-scores for the candy bars. If you want you could then delete the v, w, and R temporary columns: df = df.drop(['v', 'w', 'R'], axis=1):

In [19]:
df = df.drop(['v', 'w', 'R'], axis=1)

In [20]:
df

Unnamed: 0,Bar,User1,User2,User3,S
0,Snickers,0.01,,0.7,0.3905
1,Mars Bars,0.25,0.4,0.1,0.3204
2,Milky Way,0.9,1.0,,0.688
3,Almond Joy,,,,
4,Babe Ruth,0.5,0.1,0.3,0.3504
