In [145]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [31]:
df = pd.read_csv("../datasets/bays/mushrooms.csv")

In [32]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [33]:
le = LabelEncoder()

In [34]:
df = df.apply(le.fit_transform)

In [35]:
X = df.drop(["type"], axis=1)
y = df["type"]

In [36]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
cap_shape                   8124 non-null int64
cap_surface                 8124 non-null int64
cap_color                   8124 non-null int64
bruises                     8124 non-null int64
odor                        8124 non-null int64
gill_attachment             8124 non-null int64
gill_spacing                8124 non-null int64
gill_size                   8124 non-null int64
gill_color                  8124 non-null int64
stalk_shape                 8124 non-null int64
stalk_root                  8124 non-null int64
stalk_surface_above_ring    8124 non-null int64
stalk_surface_below_ring    8124 non-null int64
stalk_color_above_ring      8124 non-null int64
stalk_color_below_ring      8124 non-null int64
veil_type                   8124 non-null int64
veil_color                  8124 non-null int64
ring_number                 8124 non-null int64
ring_type                   8

In [37]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [38]:
le.classes_

array(['d', 'g', 'l', 'm', 'p', 'u', 'w'], dtype=object)

In [39]:
set(df["habitat"])

{0, 1, 2, 3, 4, 5, 6}

In [40]:
set(df["type"])

{0, 1}

In [74]:
def check_prior(data, value):
    return np.sum(data == value) / len(data)

In [75]:
X.loc[0]

cap_shape                   5
cap_surface                 2
cap_color                   4
bruises                     1
odor                        6
gill_attachment             1
gill_spacing                0
gill_size                   1
gill_color                  4
stalk_shape                 0
stalk_root                  3
stalk_surface_above_ring    2
stalk_surface_below_ring    2
stalk_color_above_ring      7
stalk_color_below_ring      7
veil_type                   0
veil_color                  2
ring_number                 1
ring_type                   4
spore_print_color           2
population                  3
habitat                     5
Name: 0, dtype: int64

In [77]:
prior

{0: 0.517971442639094, 1: 0.48202855736090594}

In [140]:
class NB:
    
    def __init__(self):
        pass

    def fit(self, X, y):
        model = {}   
        prior = {}
        klasses = set(y)
        for klass in klasses:
            model[klass] = {}
            for col in X.columns:
                model[klass][col] = {}
                for val in set(X[col]):
                    select = X.loc[y==klass]
                    prob = np.sum(select[col] == val)/len(select)
                    model[klass][col][val] = prob
        for klass in klasses:
            prior[klass] = np.sum(y == klass) / len(y)
    
        self.model = model
        self.prior = prior
        
    def predict_point(self, point):
        
        probs = []
        
        for klass in self.prior:
            p = self.prior[klass]
            for col in self.model[klass]:
                p *= self.model[klass][col][point[col]]
            
            probs.append(p)
        return np.argmax(probs)
    
    def predict(self, X):
        y = []
        for index, row in X.iterrows():
            y.append(self.predict_point(row))
        
        return np.array(y)
    
    def score(self, X, y):
        y_p = self.predict(X)
        return np.mean(y == y_p)
    

In [147]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [148]:
nb = NB()

In [149]:
nb.fit(X_train, y_train)

In [150]:
# 

In [151]:
nb.score(X_test, y_test)

0.9973890339425587

In [153]:
# nb.model[0]['cap_shape']

In [152]:
# nb.model[1]['cap_shape']

In [139]:
! push "l 18"

[master ebb6a20] l 18
 2 files changed, 2739 insertions(+)
 create mode 100644 lecture_18/.ipynb_checkpoints/Bay's-checkpoint.ipynb
 create mode 100644 lecture_18/Bay's.ipynb
Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 4 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 6.61 KiB | 3.30 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/coding-blocks-archives/ML-Noida-2019-June-One.git
   bf1e14b..ebb6a20  master -> master
