### ESMA 4016
### Edgar Acuna
### Clasificacion usando Naive Bayes
#### Marzo 2018

In [2]:
import numpy as np
import pandas as pd
import math as m
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import seaborn as sns
%matplotlib inline

In [3]:
#Leyendo el conjunto de datos pima-diabetes 
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
print(data.shape)
data.head()

(768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,2
1,1,85,66,29,0,26.6,0.351,31,1
2,8,183,64,0,0,23.3,0.672,32,2
3,1,89,66,23,94,28.1,0.167,21,1
4,0,137,40,35,168,43.1,2.288,33,2


In [4]:
#Hallando la columna de clases y la matriz de predictoras
y=data['class']
X=data.iloc[:,0:8]
medX=X.median(axis=0)
y1=y.as_matrix()
X1=X.as_matrix()

### Debido a que las predictoras son continuas usaremos Gaussian Bayes, donde la distribucion probabilistica de cada predictora es aproximada como una Normal univariada

In [8]:
# Calculo de las probabilidades posteriores y de las clases predichas
clf = GaussianNB()
clf.fit(X1,y1)
proba=pd.DataFrame(clf.predict_proba(X1))
pred=clf.predict(X1)
proba['pred']=pred
print (pred==y1).sum()/float(768)
proba.head()

0.763020833333


Unnamed: 0,0,1,pred
0,0.328506,0.671494,2
1,0.980506,0.019494,1
2,0.198911,0.801089,2
3,0.986822,0.013178,1
4,0.000217,0.999783,2


In [10]:
#Estimando la precision por el metodo holdout con 50 muestras
pred=[]
for i in range(0,51):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.33)
    naivediab = GaussianNB().fit(X_train, y_train)
    pred.append(naivediab.score(X_test, y_test))
print "Estimado de la precision=",np.mean(pred)

Estimado de la precision= 0.749807009418


In [11]:
#Tasa de precision usando validacion cruzada usando 10 repeticiones con 10 folds cada una
clf = GaussianNB()
pred1=[]
for i in range (0,11):
    cv = ShuffleSplit()
    scores = cross_val_score(clf, X1,y1, cv=cv)
    pred1.append(scores)
print "Estimado de la precision por validacion cruzada=", np.mean(pred1),"+/-",np.std(pred1)    

Estimado de la precision por validacion cruzada= 0.755962219599 +/- 0.0458690981226


### Aplicando Bernoulli Naive Bayes, donde cada atributo es discretizado en dos valores, detreminado por la mediana de cada atributo (Binarizacion)

In [12]:
from sklearn.model_selection import cross_val_score
#Usando la mediana como punto de corte
medX=X.median(axis=0)
predictors=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
Xd=X.copy()
for col in predictors:
    Xd[col] =  (X[col] < medX[col]).astype(int)
Xd.head()
X2=Xd.as_matrix()
clf = BernoulliNB()
scores = cross_val_score(clf, X2, y1, cv=10)
scores     

array([ 0.71428571,  0.71428571,  0.71428571,  0.62337662,  0.7012987 ,
        0.75324675,  0.7012987 ,  0.81818182,  0.67105263,  0.73684211])

In [13]:
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std()))

Precision: 0.715 (+/- 0.05)


### Aplicando Multinomial Naive Bayes, donde la distribucion de cada predictora es considerada que es una Multinomial

In [14]:
from sklearn.model_selection import cross_val_score
clf = MultinomialNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([ 0.62337662,  0.58441558,  0.7012987 ,  0.48051948,  0.61038961,
        0.61038961,  0.5974026 ,  0.62337662,  0.55263158,  0.52631579])

In [15]:
print("Precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Precision: 0.591 (+/- 0.057)


### Discretizando las predictoras y luego aplicando Bayes Multinomial, esto mejora el anterior resultado

In [16]:
# Funcion auxiliar para discretizar cualquier columna de un dataframe
def disc_col_ew(df,str,k,out):
    df1=df[str]
    bins=np.linspace(df1.min(), df1.max(),k)
    if out=="num":
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True,labels=False)
    else:
        bins[0]=float('-inf')
        bins[k-1]=float('inf')
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True)  
    return df1

In [17]:
# funcion auxiliar para detreminar el numero optimo de intervalos segun la formula de scott
def nclass_scott(x):
    h=3.5*(np.var(x,ddof=1)**.5)*len(x)**(-.3333)
    intervals=m.ceil((max(x)-min(x))/h)
    return int(intervals)
print "El numero optimo de intervalos para el attribute pedi es:", nclass_scott(X['pedi'])

El numero optimo de intervalos para el attribute pedi es: 19


In [18]:
#Funcion para discretizar todas las colunan de un dataframe
def disc_ew(df,out):
    name=df.columns.tolist()
    disc=pd.DataFrame()
    for name in df.columns.tolist():
        k=nclass_scott(df[name])
        disc[name]=disc_col_ew(df,name,k,out)
    return disc  

In [19]:
#discretizando las columnas de la matriz predictora X de diabetes
diab_disc=disc_ew(X,out="num")

In [20]:
X1=diab_disc.as_matrix()
clf = MultinomialNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([ 0.68831169,  0.7012987 ,  0.68831169,  0.55844156,  0.61038961,
        0.67532468,  0.58441558,  0.68831169,  0.67105263,  0.67105263])

In [21]:
print("Precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Precision: 0.654 (+/- 0.048)
