## Data Mining  and Machine Learning 
### Edgar Acuna
### Naive Bayes Classifier
#### March 2019

In [2]:
import numpy as np
import pandas as pd
import math as m
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import seaborn as sns
%matplotlib inline

In [3]:
#Leyendo el conjunto de datos pima-diabetes 
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
print(data.shape)
data.head()

(768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,2
1,1,85,66,29,0,26.6,0.351,31,1
2,8,183,64,0,0,23.3,0.672,32,2
3,1,89,66,23,94,28.1,0.167,21,1
4,0,137,40,35,168,43.1,2.288,33,2


In [4]:
#Hallando la columna de clases y la matriz de predictoras
y=data['class']
X=data.iloc[:,0:8]
medX=X.median(axis=0)
y1=y.as_matrix()
X1=X.as_matrix()

  """
  


### First Method: Discretizing all the features in several categories and applying Mutlinomial Naive Bayes

In [5]:
#Discretizing all the features in 10 values using equal width 
from sklearn.preprocessing import KBinsDiscretizer
est = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
est.fit(X)  
Xt = est.transform(X)
clf=BernoulliNB()
clf.fit(Xt,y1)
scores = cross_val_score(clf, X1, y1, cv=10)
print("Accuracy by cross validation: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Accuracy by cross validation: 0.645 (+/- 0.011)


Discretizing for equal width using my functions

In [6]:
# Funcion auxiliar para discretizar cualquier columna de un dataframe
def disc_col_ew(df,str,k,out):
    df1=df[str]
    bins=np.linspace(df1.min(), df1.max(),k)
    if out=="num":
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True,labels=False)
    else:
        bins[0]=float('-inf')
        bins[k-1]=float('inf')
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True)  
    return df1

In [7]:
# funcion auxiliar para determinar el numero optimo de intervalos segun la formula de scott
def nclass_scott(x):
    h=3.5*(np.var(x,ddof=1)**.5)*len(x)**(-.3333)
    intervals=m.ceil((max(x)-min(x))/h)
    return int(intervals)

In [8]:
#Funcion para discretizar todas las columnas de un dataframe
def disc_ew(df,out):
    name=df.columns.tolist()
    disc=pd.DataFrame()
    for name in df.columns.tolist():
        k=nclass_scott(df[name])
        disc[name]=disc_col_ew(df,name,k,out)
    return disc  

In [9]:
#discretizando las columnas de la matriz predictora X de diabetes
diab_disc=disc_ew(X,out="num")

In [10]:
X1d=diab_disc.as_matrix()
clf = MultinomialNB()
scores = cross_val_score(clf, X1d, y1, cv=10)
scores     

  """Entry point for launching an IPython kernel.


array([0.68831169, 0.7012987 , 0.68831169, 0.55844156, 0.61038961,
       0.67532468, 0.58441558, 0.68831169, 0.67105263, 0.67105263])

In [11]:
print("Accuracy by CV: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Accuracy by CV: 0.654 (+/- 0.048)


### Second Method:  Applying Bernoulli Naive Bayes. First, each attribute of the dataset is discretized in two values using its median as the cut point(Binarization)

In [12]:
from sklearn.model_selection import cross_val_score
#Usando la mediana como punto de corte
medX=X.median(axis=0)
predictors=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
Xd=X.copy()
for col in predictors:
    Xd[col] =  (X[col] < medX[col]).astype(int)
Xd.head()
X2=Xd.as_matrix()
clf = BernoulliNB()
scores = cross_val_score(clf, X2, y1, cv=10)
scores     

  if __name__ == '__main__':


array([0.71428571, 0.71428571, 0.71428571, 0.62337662, 0.7012987 ,
       0.75324675, 0.7012987 , 0.81818182, 0.67105263, 0.73684211])

In [13]:
print("Accuracy by CV : %0.3f (+/- %0.2f)" % (scores.mean(), scores.std()))

Accuracy by CV : 0.715 (+/- 0.05)



### Third Method: Considering that each attribute is continuous and applying Gaussian Naive Bayes

In [14]:
# Calculo de las probabilidades posteriores y de las clases predichas
clf = GaussianNB()
clf.fit(X1,y1)
proba=pd.DataFrame(clf.predict_proba(X1))
pred=clf.predict(X1)
proba['pred']=pred
print('Accuracy by Resubstitution',(pred==y1).sum()/float(768))
print("Posterior probabilities")
proba.head()

Accuracy by Resubstitution 0.7630208333333334
Posterior probabilities


Unnamed: 0,0,1,pred
0,0.328506,0.671494,2
1,0.980506,0.019494,1
2,0.198911,0.801089,2
3,0.986822,0.013178,1
4,0.000217,0.999783,2


In [15]:
#Estimando la precision por el metodo holdout con 50 muestras
pred=[]
for i in range(0,51):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.33)
    naivediab = GaussianNB().fit(X_train, y_train)
    pred.append(naivediab.score(X_test, y_test))
print("Accuracy by holdout: %0.3f (+/- %0.3f)" % (np.mean(pred), np.std(pred)))

Accuracy by holdout: 0.749 (+/- 0.021)


In [16]:
#Tasa de precision usando validacion cruzada usando 10 repeticiones con 10 folds cada una
clf = GaussianNB()
pred1=[]
for i in range (0,11):
    cv = ShuffleSplit()
    scores = cross_val_score(clf, X1,y1, cv=cv)
    pred1.append(scores)
print("Accuracy by cross validacion=", np.mean(pred1),"+/-",np.std(pred1))    

Accuracy by cross validacion= 0.7576151121605667 +/- 0.04696519166761703


In [29]:
names = ['mpg', 'cyl', 'disp', 'hp', 'wt', 'acc', 'model_year', 'origin', 'car_name']
a=pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',header=None, names=names,delim_whitespace=True)
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg           398 non-null float64
cyl           398 non-null int64
disp          398 non-null float64
hp            398 non-null object
wt            398 non-null float64
acc           398 non-null float64
model_year    398 non-null int64
origin        398 non-null int64
car_name      398 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [51]:
a.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [52]:
a1=a.iloc[:,0:8]

In [53]:
a1.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [59]:
meda=a1.median(axis=0)
names1=names[0:8]
a1d=a1.copy()
#for col in names1:
#    a1d[col] =  (a1[col] < meda[col]).astype(int)
#ad2=ad1.as_matrix()
#clf = BernoulliNB()
#scores = cross_val_score(clf, ad2, y1, cv=10)
#scores     