In [9]:
from sklearn import datasets
import pandas as pd
import numpy as np
import math

In [10]:
iris = datasets.load_iris()

In [11]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [12]:
def abc(k, *val):
    if k < val[0]:
        return 0
    else:
        return 1

In [13]:
df.sl.apply(abc, args=(5,))

0      1
1      0
2      0
3      0
4      1
5      1
6      0
7      1
8      0
9      0
10     1
11     0
12     0
13     0
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     0
23     1
24     0
25     1
26     1
27     1
28     1
29     0
      ..
120    1
121    1
122    1
123    1
124    1
125    1
126    1
127    1
128    1
129    1
130    1
131    1
132    1
133    1
134    1
135    1
136    1
137    1
138    1
139    1
140    1
141    1
142    1
143    1
144    1
145    1
146    1
147    1
148    1
149    1
Name: sl, Length: 150, dtype: int64

In [14]:
def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [15]:
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
5,5.4,3.9,1.7,0.4,b,d,a,a
6,4.6,3.4,1.4,0.3,a,c,a,a
7,5.0,3.4,1.5,0.2,a,c,a,a
8,4.4,2.9,1.4,0.2,a,b,a,a
9,4.9,3.1,1.5,0.1,a,c,a,a


In [16]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [17]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [18]:
df["output"] = iris.target

In [19]:
def fit(data):
    output_name = data.columns[-1]
    features = data.columns[0:-1]
    counts = {}
    counts["total_count"]=len(data)
    possible_outputs = set(data[output_name])
    for output in possible_outputs:
        counts[output] = {}
        smallData = data[data[output_name] == output]
        counts[output]["total_count"] = len(smallData)
        for i in range(len(features)):
            f=features[i]
            counts[output][i] = {}
            possible_values = set(data[f])
            for value in possible_values:
                val_count = len(smallData[smallData[f] == value])
                counts[output][i][value] = val_count
    return counts

In [20]:
store=fit(df)
store

{'total_count': 150,
 0: {'total_count': 50,
  0: {'c': 0, 'd': 0, 'b': 22, 'a': 28},
  1: {'c': 32, 'd': 10, 'b': 7, 'a': 1},
  2: {'c': 0, 'd': 0, 'b': 0, 'a': 50},
  3: {'c': 0, 'd': 0, 'b': 0, 'a': 50}},
 1: {'total_count': 50,
  0: {'c': 24, 'd': 2, 'b': 21, 'a': 3},
  1: {'c': 8, 'd': 0, 'b': 29, 'a': 13},
  2: {'c': 43, 'd': 0, 'b': 7, 'a': 0},
  3: {'c': 40, 'd': 0, 'b': 10, 'a': 0}},
 2: {'total_count': 50,
  0: {'c': 29, 'd': 15, 'b': 5, 'a': 1},
  1: {'c': 15, 'd': 2, 'b': 28, 'a': 5},
  2: {'c': 20, 'd': 30, 'b': 0, 'a': 0},
  3: {'c': 16, 'd': 34, 'b': 0, 'a': 0}}}

In [21]:
def predict(store,x_test):
    y_pred=[]
    for x in x_test:
        x_class=predictsinglepoint(x,store)
        y_pred.append(x_class)
    return y_pred
    

In [22]:
def predictsinglepoint(x,store):
    classes=store.keys()
    best_prob= -1
    best_class= -1
    first_run=True
    for currentclass in classes:
        if(currentclass=='total_count'):
            continue
        p_currentclass = probability(store,x,currentclass)
        if (first_run or p_currentclass>best_prob):
            best_prob=p_currentclass
            best_class=currentclass
        first_run=False
    return best_class


In [23]:
def probability(store,x,currentclass):
    output = 0
    features=store[currentclass].keys()
    for j in range(len(features)-1):
        current_x_j=x[j]
        count_class_and_feature = store[currentclass][j][current_x_j]+1 #laplace correction
        possi=len(store[currentclass][j].keys())
        count_class=store[currentclass]['total_count'] + possi #laplace correction
        p=np.log(count_class_and_feature)-np.log(count_class)
        output=output+p
    count_class=store[currentclass]['total_count']
    total_count=store['total_count']
    class_prob=np.log(count_class)-np.log(total_count)
    ouput=output+class_prob
    return output
    
            

In [29]:
predict(store,ndf)

TypeError: 'int' object is not subscriptable

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
xtrain,xtest,ytrain,ytest=train_test_split(np.asarray(df.iloc[:,:-1]),np.asarray(df.iloc[:,-1:]))

In [27]:
ndf=pd.DataFrame(xtrain)
ndf['output']=ytrain

In [28]:
ndf

Unnamed: 0,0,1,2,3,output
0,b,c,a,a,0
1,c,c,c,d,2
2,a,c,a,a,0
3,c,a,c,d,2
4,b,b,c,c,1
5,c,c,c,c,1
6,a,c,a,a,0
7,b,a,b,b,1
8,d,d,d,d,2
9,b,c,a,a,0
