In [33]:
import numpy as np
import pandas as pd
import pickle

In [34]:
data = None

column_names = ['fruit', 'long', 'sweet', 'yellow', 'seed', 'Brazil']

with open('../fruit_data.pkl', 'rb') as fin:
    data = pickle.load(fin)

In [35]:
data

array([['others', '0', '1', '0', '1', '0'],
       ['bananas', '1', '0', '1', '0', '0'],
       ['bananas', '1', '0', '1', '0', '0'],
       ...,
       ['bananas', '1', '1', '1', '0', '0'],
       ['oranges', '0', '1', '0', '1', '0'],
       ['oranges', '0', '1', '0', '1', '0']], dtype='<U21')

In [36]:
df = pd.DataFrame(data, columns=column_names)
df

Unnamed: 0,fruit,long,sweet,yellow,seed,Brazil
0,others,0,1,0,1,0
1,bananas,1,0,1,0,0
2,bananas,1,0,1,0,0
3,bananas,0,1,1,0,1
4,oranges,0,1,0,1,0
...,...,...,...,...,...,...
1395,oranges,0,1,0,1,0
1396,mangoes,0,0,0,0,1
1397,bananas,1,1,1,0,0
1398,oranges,0,1,0,1,0


In [37]:
df.long = pd.to_numeric(df.long)
df.sweet = pd.to_numeric(df.sweet)
df.yellow = pd.to_numeric(df.yellow)
df.seed = pd.to_numeric(df.seed)
df.Brazil = pd.to_numeric(df.Brazil)

## Prior

In [38]:
df.fruit

0        others
1       bananas
2       bananas
3       bananas
4       oranges
         ...   
1395    oranges
1396    mangoes
1397    bananas
1398    oranges
1399    oranges
Name: fruit, Length: 1400, dtype: object

In [39]:
priors = df.fruit.value_counts()/df.shape[0]
priors

oranges    0.357143
bananas    0.285714
mangoes    0.214286
others     0.142857
Name: fruit, dtype: float64

This was extremely easy 👌 😁

## Posterior

In [40]:
bananas = df[df.fruit == 'bananas'] # no need to go through banana indices to find the banana rows like in numpy
bananas

Unnamed: 0,fruit,long,sweet,yellow,seed,Brazil
1,bananas,1,0,1,0,0
2,bananas,1,0,1,0,0
3,bananas,0,1,1,0,1
5,bananas,1,1,1,0,0
6,bananas,1,0,1,0,0
...,...,...,...,...,...,...
1384,bananas,1,1,1,0,0
1385,bananas,0,1,1,0,0
1390,bananas,1,1,1,0,0
1394,bananas,1,1,1,0,0


In [41]:
oranges = df[df.fruit == 'oranges']i 
oranges

SyntaxError: invalid syntax (<ipython-input-41-1eb7ebbdf2c7>, line 1)

In [42]:
mangoes = df[df.fruit == 'mangoes']
mangoes

Unnamed: 0,fruit,long,sweet,yellow,seed,Brazil
8,mangoes,0,1,1,0,1
12,mangoes,0,0,0,0,1
23,mangoes,1,1,1,0,1
27,mangoes,0,0,0,0,1
31,mangoes,0,1,0,0,1
...,...,...,...,...,...,...
1367,mangoes,0,1,1,0,1
1372,mangoes,0,0,0,0,1
1377,mangoes,0,1,1,0,1
1381,mangoes,1,1,1,0,1


In [43]:
others = df[df.fruit == 'others']
others

Unnamed: 0,fruit,long,sweet,yellow,seed,Brazil
0,others,0,1,0,1,0
9,others,1,1,0,1,0
14,others,1,1,0,1,0
17,others,1,1,1,1,0
24,others,0,1,1,1,0
...,...,...,...,...,...,...
1382,others,0,1,0,1,0
1383,others,1,1,1,1,0
1386,others,0,1,0,1,0
1388,others,0,1,1,1,0


In [68]:
# let's create our posterio matrix
# row names -> bananas, mangoes, oranges, others
# column indices -> long, NOT long, sweet, NOT sweet, yellow, NOT yellow, sweet, NOT sweet, Brazil, NOT Brazil
# a 4x10 matrix


column_names = ['long', 'NOT_long', 'sweet', 'NOT_sweet', 'yellow', 'NOT_yellow', 'seed', 'NOT_seed', 'Brazil', 'NOT_Brazil']
posteriors = pd.DataFrame(np.zeros([4, 10]), index=priors.index, columns=column_names)
posteriors

Unnamed: 0,long,NOT_long,sweet,NOT_sweet,yellow,NOT_yellow,seed,NOT_seed,Brazil,NOT_Brazil
oranges,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bananas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mangoes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
others,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# populate the posterior matrix

# 1. bananas, e.g. posteriors matrix COLUMN 0
print(df[df.fruit == 'bananas'].long.sum()) # <- finds the long bananas
df[df.fruit == 'bananas'].shape[0] - df[df.fruit == 'bananas'].long.sum() # <- finds the NOT long bananas

360


40

In [86]:
df.columns[0]

'fruit'

Pandas is great for column-wise and row-wise operations versus element-wise operations

In [87]:
# iterating over every fruit and calculating its posterior for the LONG and NOT LONG features
for feature in df.columns:
    
    if feature != 'fruit':

        feature_count = []
        not_feature_count = []

        for fruit in posteriors.index:

            total_fruit_count = df[df.fruit == fruit].shape[0]
            feature_count.append(df[df.fruit == fruit][feature].sum() / total_fruit_count)
            not_feature_count.append((df[df.fruit == fruit].shape[0] - df[df.fruit == fruit][feature].sum()) / total_fruit_count)

        posteriors[feature] = feature_count
        posteriors['NOT_' + feature] = not_feature_count

In [88]:
posteriors

Unnamed: 0,long,NOT_long,sweet,NOT_sweet,yellow,NOT_yellow,seed,NOT_seed,Brazil,NOT_Brazil
oranges,0.006,0.994,0.92,0.08,0.06,0.94,0.94,0.06,0.02,0.98
bananas,0.9,0.1,0.625,0.375,0.975,0.025,0.0,1.0,0.1625,0.8375
mangoes,0.366667,0.633333,0.666667,0.333333,0.4,0.6,0.0,1.0,1.0,0.0
others,0.225,0.775,1.0,0.0,0.5,0.5,0.95,0.05,0.0,1.0
