### ESMA 4016
### EDgar Acuna
### Clasificacion usando Naive Bayes

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import seaborn as sns
%matplotlib inline

In [3]:
#Leyendo el conjunto de datos pima-diabetes 
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
print(data.shape)
data.head()

(768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,2
1,1,85,66,29,0,26.6,0.351,31,1
2,8,183,64,0,0,23.3,0.672,32,2
3,1,89,66,23,94,28.1,0.167,21,1
4,0,137,40,35,168,43.1,2.288,33,2


In [4]:
#Hallando la columna de clases y la matriz de predictoras
y=data['class']
X=data.iloc[:,0:8]
y1=y.as_matrix()
X1=X.as_matrix()

### Debido a que las predictoras son continuas usaremos Gaussian Bayes, donde la distribucion probabilistica de cada predictora es aproximada como una Normal univariada

In [5]:
# Calculo de las probabilidades posteriores y de las clases predichas
clf = GaussianNB()
clf.fit(X1,y1)
proba=pd.DataFrame(clf.predict_proba(X1))
pred=clf.predict(X1)
proba['pred']=pred
proba.head()

Unnamed: 0,0,1,pred
0,0.328506,0.671494,2
1,0.980506,0.019494,1
2,0.198911,0.801089,2
3,0.986822,0.013178,1
4,0.000217,0.999783,2


In [6]:
#Estimando la precision por el metodo holdout con 50 muestras
pred=[]
for i in range(0,51):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.33)
    naivediab = GaussianNB().fit(X_train, y_train)
    pred.append(naivediab.score(X_test, y_test))
print "Estimado de la precison=",np.mean(pred)

Estimado de la precison= 0.755982708044


In [7]:
#Tasa de precision usando validacion cruzada usando 10 repeticiones con 10 folds cada una
clf = GaussianNB()
pred1=[]
for i in range (0,11):
    cv = ShuffleSplit()
    scores = cross_val_score(clf, X1,y1, cv=cv)
    pred1.append(scores)
print "Estimado de la precison por validacion cruzada=", np.mean(pred1),"+/-",np.std(pred1)    

Estimado de la precison por validacion cruzada= 0.751357733176 +/- 0.0487767652565


### Aplicando Bernoulli Naive Bayes, donde cada atributo es discretizado en dos valores

In [8]:
from sklearn.model_selection import cross_val_score
clf = BernoulliNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([ 0.63636364,  0.64935065,  0.64935065,  0.64935065,  0.64935065,
        0.62337662,  0.66233766,  0.64935065,  0.64473684,  0.63157895])

In [127]:
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std()))

Precision: 0.645 (+/- 0.01)


### Aplicando Multinomial Naive Bayes, donde la distribucion de cada predictora es considerada que es una Multinomial

In [9]:
from sklearn.model_selection import cross_val_score
clf = MultinomialNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([ 0.62337662,  0.58441558,  0.7012987 ,  0.48051948,  0.61038961,
        0.61038961,  0.5974026 ,  0.62337662,  0.55263158,  0.52631579])

In [10]:
print("Precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Precision: 0.591 (+/- 0.057)


### Discretizando las predictoras y luego aplicando Bayes multinomial

In [20]:
def disc_col_ew(df,str,k,out):
    df1=df[str]
    bins=np.linspace(df1.min(), df1.max(),k)
    if out=="num":
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True,labels=False)
    else:
        bins[0]=float('-inf')
        bins[k-1]=float('inf')
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True)  
    return df1

In [21]:
def disc_ew(df,k,out):
    name=df.columns.tolist()
    disc=pd.DataFrame()
    for name in df.columns.tolist():
        disc[name]=disc_col_ew(df,name,k,out)
    return disc  

In [22]:
#discretizando los datos en 10 intervalos
diab_disc=disc_ew(X,10,out="num")

In [23]:
X1=diab_disc.as_matrix()
clf = MultinomialNB()
scores = cross_val_score(clf, X1, y1, cv=10)
scores     

array([ 0.68831169,  0.66233766,  0.68831169,  0.58441558,  0.58441558,
        0.7012987 ,  0.61038961,  0.71428571,  0.59210526,  0.65789474])

In [24]:
print("Precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Precision: 0.648 (+/- 0.048)
