In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math

In [2]:
# data
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, :])
    # print(data)
    return data[:,:-1], data[:,-1]

In [3]:
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
X_test[0], y_test[0]

(array([4.6, 3.1, 1.5, 0.2]), 0.0)

In [32]:
def mean(x):
    return sum(x) / float(len(x))

def stdev(x):
    avg=mean(x)
    return np.sqrt(sum((x1-avg)**2 for x1 in x)) / float(len(x))

def gaussian_proba(x,mean,stdev):
    exp=math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1/(math.pow(2*math.pi,1/2)*stdev)) * exp


In [14]:
def summariez(train_data):
    summaries=[(mean(i),stdev(i)) for i in zip(*train_data)]
    return summaries

In [15]:
X_train[:5]

array([[4.7, 3.2, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.3],
       [5. , 3. , 1.6, 0.2],
       [5.9, 3. , 4.2, 1.5],
       [6.3, 3.3, 4.7, 1.6]])

In [17]:
def fit(x,y):
    labels=list(set(y))
    data={label:[] for label in labels}
    model={}
    for f ,label in zip(x,y):
        data[label].append(f)
    model={label:summariez(value for label,value in data.items())}
    return model
        

In [37]:
def calculate_proba(input_data,model):
    probability={}
    for label,value in model.items():
        probability[label]=1
        for i in range(len(value)):
            mean,stdev=value[i]
            print(mean,stdev)
            probability[label]*=gaussian_proba(input_data[i],mean,stdev)
    return probability

In [38]:
def predict(x,y,x_test):
    label=sorted(calculate_proba(x_test,fit(x,y)).items(),key=lambda x:x[-1])[-1][0]
    return label

In [39]:
def score(x_test,y_test):
    right=0
    for x,y in zip(x_test,y_test):
        label=predict(x)
        if label==y:
            right+=1
    return right / float(len(x_test))

In [40]:
predict(X_train,y_train,X_test)

[5.3  3.1  2.9  0.85] [0.42426407 0.07071068 0.91923882 0.45961941]


TypeError: only size-1 arrays can be converted to Python scalars

In [41]:
fit(X_train,y_train)

{0.0: [(array([5.3 , 3.1 , 2.9 , 0.85]),
   array([0.42426407, 0.07071068, 0.91923882, 0.45961941])),
  (array([5.55, 3.15, 3.05, 0.95]),
   array([0.53033009, 0.10606602, 1.16672619, 0.45961941])),
  (array([5.35, 2.95, 2.9 , 0.75]),
   array([0.24748737, 0.03535534, 0.91923882, 0.38890873])),
  (array([5.55, 3.25, 2.7 , 0.75]),
   array([0.03535534, 0.1767767 , 0.98994949, 0.38890873])),
  (array([5.55, 3.1 , 3.1 , 1.  ]),
   array([0.31819805, 0.14142136, 0.98994949, 0.35355339])),
  (array([5.45, 3.45, 3.  , 0.7 ]),
   array([0.1767767 , 0.45961941, 1.06066017, 0.42426407])),
  (array([5.6 , 2.95, 2.85, 0.7 ]),
   array([0.56568542, 0.03535534, 1.02530483, 0.42426407])),
  (array([5.6 , 3.3 , 2.95, 0.85]),
   array([0.35355339, 0.35355339, 0.74246212, 0.31819805])),
  (array([5.45, 3.  , 3.15, 0.8 ]),
   array([0.45961941, 0.07071068, 1.09601551, 0.42426407])),
  (array([4.95, 2.85, 2.65, 0.7 ]),
   array([0.45961941, 0.10606602, 1.09601551, 0.42426407])),
  (array([5.4 , 2.3 , 2.8