### Data Mining and Machine Learning
### Edgar Acuna
### Naive Bayes Classifier
### March 2021

In [39]:
import numpy as np
import pandas as pd
import math as m
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import KBinsDiscretizer
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

### Example: Naive Bayes for mixed type of attributes
### Option 1: Discretizing feature x4

In [44]:
x1=[0,0,1,0,1,0,1]
x2=[0,1,1,0,1,0,1]
x3=[1,0,0,1,1,1,0]
x4=[3.15,8.17,5.72,7.16,9.32,12.81,15.48]
y=[0,0,0,1,1,1,1]
data=[x1,x2,x3,x4,y]
data=np.transpose(data)
df=pd.DataFrame(data,columns=['x1','x2','x3','x4d','y'])
df1=df.copy()
#discretizando x4 en dos bins
bins = [0, 9, 50]
labels =[0,1]
df['x4d'] = pd.cut(df['x4d'], bins,labels=labels)
df = df.astype(int)
df['y'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   x1      7 non-null      int32
 1   x2      7 non-null      int32
 2   x3      7 non-null      int32
 3   x4d     7 non-null      int32
 4   y       7 non-null      int32
dtypes: int32(5)
memory usage: 268.0 bytes


In [45]:
# Calculo de las probabilidades posteriores y  las clases predichas
y=df['y']
Xd=df.iloc[:,0:4]
clf = BernoulliNB()
clf.fit(Xd,y)
proba=pd.DataFrame(clf.predict_proba(Xd))
pred=clf.predict(Xd)
proba['pred']=pred
print("the accuracy is", (pred==y).sum()/len(Xd))
proba

the accuracy is 0.8571428571428571


Unnamed: 0,0,1,pred
0,0.509034,0.490966,0
1,0.823496,0.176504,0
2,0.756715,0.243285,0
3,0.509034,0.490966,0
4,0.114731,0.885269,1
5,0.114731,0.885269,1
6,0.279954,0.720046,1


### Option 2: without discretizing  feature x4 and using the mixed naive Bayes module

In [48]:
from mixed_naive_bayes import MixedNB
X = [[0, 0, 1, 3.15],
     [0, 1, 0, 8.17],
     [1, 1, 0, 5.72],
     [0, 0, 1, 7.16],
     [1, 1, 1, 9.32],
     [0, 0,1, 12.81],
     [1, 1, 0, 15.48]]
y = [0, 0, 0, 1, 1,1,1]
clf = MixedNB(categorical_features=[0,1,2])
clf.fit(X,y)
clf.predict(X)
proba1=pd.DataFrame(clf.predict_proba(X))
pred1=clf.predict(X)

[2 2 2]


In [49]:
proba1['pred']=pred1
print("the accuracy is", (pred1==y).sum()/len(X))
proba1

the accuracy is 0.8571428571428571


Unnamed: 0,0,1,pred
0,0.867185,0.132562,0
1,0.739939,0.259985,0
2,0.908308,0.091618,0
3,0.500911,0.499,0
4,0.125832,0.874085,1
5,0.001569,0.99834,1
6,6.1e-05,0.999479,1


Notar que haya una mejora en la precision del clasificador

In [50]:
#Prediciendo una nueva instancia
xnew=[[0,0,1,4.25]]
clf.predict(xnew)

array([0], dtype=int64)

### Ejemplo: Naive Bayes aplicado a Pima-Diabetes

In [51]:
#Leyendo el conjunto de datos pima-diabetes 
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg','plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
print(data.shape)
data.head()

(768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,2
1,1,85,66,29,0,26.6,0.351,31,1
2,8,183,64,0,0,23.3,0.672,32,2
3,1,89,66,23,94,28.1,0.167,21,1
4,0,137,40,35,168,43.1,2.288,33,2


In [52]:
#Hallando la columna de clases y la matriz de predictoras
y=data['class']
X=data.iloc[:,0:8]
y1=y.to_numpy()
X1=X.to_numpy()

### Debido a que las predictoras son continuas usaremos Gaussian Naive Bayes, donde la distribucion probabilistica de cada predictora es aproximada como una Normal univariada

In [53]:
# Calculo de las probabilidades posteriores y de las clases predichas
clf = GaussianNB()
clf.fit(X1,y1)
proba=pd.DataFrame(clf.predict_proba(X1))
pred=clf.predict(X1)
proba['pred']=pred
print ( (pred==y1).sum()/len(X1))
proba.head()

0.7630208333333334


Unnamed: 0,0,1,pred
0,0.328506,0.671494,2
1,0.980506,0.019494,1
2,0.198911,0.801089,2
3,0.986822,0.013178,1
4,0.000217,0.999783,2


In [54]:
#Tasa de precision usando validacion cruzada usando 10 repeticiones con 10 folds cada una
clf = GaussianNB()
pred1=[]
for i in range (0,11):
    cv = ShuffleSplit()
    scores = cross_val_score(clf, X1,y1, cv=cv)
    pred1.append(scores)
print("Estimado de la precision por validacion cruzada=", np.mean(pred1),"+/-",np.std(pred1))    

Estimado de la precision por validacion cruzada= 0.7584415584415584 +/- 0.04712563229967936


In [55]:
#Estimando la precision por el metodo holdout con 50 muestras
pred=[]
for i in range(0,51):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.33)
    naivediab = GaussianNB().fit(X_train, y_train)
    pred.append(naivediab.score(X_test, y_test))
print ("Estimado de la precision=",np.mean(pred),"+/-",np.std(pred))

Estimado de la precision= 0.7540528022232516 +/- 0.023818759719964947


### Aplicando Bernoulli Naive Bayes, donde cada atributo es discretizado en dos valores, determinado por la mediana de cada atributo (Binarizacion)

In [56]:
from sklearn.model_selection import cross_val_score
#Usando la mediana como punto de corte
medX=X.median(axis=0)
predictors=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
Xd=X.copy()
for col in predictors:
    Xd[col] =  (X[col] < medX[col]).astype(int)
Xd.head()
X2=Xd.to_numpy()
clf = BernoulliNB()
scores = cross_val_score(clf, X2, y1, cv=10)
scores     

array([0.71428571, 0.71428571, 0.71428571, 0.62337662, 0.7012987 ,
       0.75324675, 0.7012987 , 0.81818182, 0.67105263, 0.73684211])

In [57]:
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std()))

Precision: 0.715 (+/- 0.05)


### Aplicando Multinomial Naive Bayes, donde la distribucion de cada predictora es considerada que es una Multinomial

In [58]:
from sklearn.model_selection import cross_val_score
clf = MultinomialNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([0.62337662, 0.58441558, 0.7012987 , 0.48051948, 0.61038961,
       0.61038961, 0.5974026 , 0.62337662, 0.55263158, 0.52631579])

In [59]:
print("Precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Precision: 0.591 (+/- 0.057)


Comentario: Muy Mal resultado

### Discretizando las predictoras y luego aplicando Bayes Multinomial, esto mejora el anterior resultado

In [60]:
diab_disc=X.apply(lambda x: pd.cut(x,4,precision=2,labels=False), axis=0)
diab_disc.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,1,2,2,1,0,2,0,1
1,0,1,2,1,0,1,0,0
2,1,3,2,0,0,1,1,0
3,0,1,2,0,0,1,0,0
4,0,2,1,1,0,2,3,0


In [61]:
X1=diab_disc.to_numpy()
clf = MultinomialNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([0.68831169, 0.67532468, 0.68831169, 0.63636364, 0.63636364,
       0.66233766, 0.62337662, 0.68831169, 0.67105263, 0.67105263])

In [62]:
print("Precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Precision: 0.664 (+/- 0.023)


## Applying  Naive Bayes to the  Auto-mpg dataset

In [63]:
names = ['mpg','cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
df=pd.read_table("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",sep="\s+",names=names,header=None,na_values="?")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [23]:
df1=df.dropna()

In [24]:
df1.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [25]:
df2=df1.copy()
#discretizando x4 en dos bins
bins1 = [8, 25, 47]
bins2=[66,200,460]
bins3=[45, 90, 230]
bins4=[1610,3000,5140]
bins5=[7,15,25]
labels =[0,1]
df2['mpg'] = pd.cut(df2['mpg'], bins1,labels=labels)
df2 = df2.astype('category')
df2['displacement']=pd.cut(df2['displacement'],bins2,labels=labels)
df2['horsepower']=pd.cut(df2['horsepower'],bins3,labels=labels)
df2['weight']=pd.cut(df2['weight'],bins4,labels=labels)
df2['acceleration']=pd.cut(df2['acceleration'],bins5,labels=labels)
bins6=[69,74,77,82]
labels1=[0,1,2]
df2['model_year']=pd.cut(df2['model_year'],bins6,labels=labels1)
df3=df2.drop('car_name',axis=1)
df3["origin"] = df3["origin"].apply(lambda x: x - 1)
df3['horsepower'].value_counts()/392


1    0.520408
0    0.479592
Name: horsepower, dtype: float64

In [26]:
#Hallando la columna de clases y la matriz de predictoras
y=df3['mpg']
X=df3.iloc[:,1:8]
X = X.astype(int)
X['displacement'].value_counts()

0    234
1    158
Name: displacement, dtype: int64

In [27]:
#Hallando la columna de clases y la matriz de predictoras
clf=MultinomialNB()
clf.fit(X,y)
proba1=pd.DataFrame(clf.predict_proba(X))
pred1=clf.predict(X)
#proba1['pred']=pred
print("the accuracy is", (pred1==y).sum()/len(X))
pred1

the accuracy is 0.8545918367346939


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,

In [28]:
#Tasa de precision usando validacion cruzada usando 10 repeticiones con 10 folds cada una
clf = MultinomialNB()
pred1=[]
for i in range (0,11):
    cv = ShuffleSplit()
    scores = cross_val_score(clf, X,y, cv=cv)
    pred1.append(scores)
print("Estimado de la precision por validacion cruzada=", np.mean(pred1),"+/-",np.std(pred1))    

Estimado de la precision por validacion cruzada= 0.8525 +/- 0.05518048897605352


In [82]:
#Estimando la precision por el metodo holdout con 50 muestras
pred=[]
for i in range(0,51):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    naivediab = MultinomialNB().fit(X_train, y_train)
    pred.append(naivediab.score(X_test, y_test))
print ("Estimado de la precision=",np.mean(pred),"+/-",np.std(pred))

Estimado de la precision= 0.8567119155354448 +/- 0.022072709659036772
