## Data Mining and Machine Learning
### Edgar Acuna
### Naive Bayes Classifier
#### October 2018

In [None]:
import numpy as np
import pandas as pd
import math as m
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import seaborn as sns
%matplotlib inline

In [None]:
# Read the pima-diabetes data set
url = "http://academic.uprm.edu/eacuna/diabetes.dat"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = pd.read_table(url, header=None, names=names)
print(data.shape)
data.head()

In [None]:
# Extract the class column and the matrix of predictors
y = data['class']
X = data.iloc[:, 0:8]
medX = X.median(axis=0)
# .values is used instead of as_matrix(), which was deprecated and later
# removed from pandas; .values returns the same NumPy array on every version.
y1 = y.values
X1 = X.values

### First Method: Discretizing all the features into several categories and applying Multinomial Naive Bayes

In [None]:
# Discretize every feature into 10 equal-width bins (ordinal codes 0..9)
from sklearn.preprocessing import KBinsDiscretizer
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
est.fit(X)
Xt = est.transform(X)
clf = MultinomialNB()
clf.fit(Xt, y1)
# Cross-validate on the discretized matrix Xt, not the raw matrix X1, so the
# reported accuracy actually reflects the discretization step of this method.
scores = cross_val_score(clf, Xt, y1, cv=10)
print("Accuracy by cross validation: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Equal-width discretization using custom helper functions

In [None]:
# Helper to discretize a single column of a DataFrame into equal-width intervals
def disc_col_ew(df,str,k,out):
    """Discretize column ``str`` of ``df`` into ``k`` equal-width intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to discretize.
    str : column label
        Name of the column (the parameter name is kept for backward
        compatibility; it shadows the builtin only inside this function).
    k : int
        Number of intervals to create; must be at least 1.
    out : str
        ``"num"`` returns integer bin codes ``0 .. k-1``. Any other value
        returns interval labels whose outermost edges are widened to
        ``-inf`` / ``+inf``.

    Returns
    -------
    pandas.Series
        The discretized column, indexed like ``df``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    column = df[str]
    # k intervals need k + 1 edges; passing k to linspace would create only
    # k - 1 intervals and no interval at all when k == 1.
    edges = np.linspace(column.min(), column.max(), k + 1)
    if out == "num":
        return pd.cut(column, bins=edges, include_lowest=True, right=True, labels=False)
    # Open up the outer edges so the interval labels cover the whole real line.
    edges[0] = float('-inf')
    edges[-1] = float('inf')
    return pd.cut(column, bins=edges, include_lowest=True, right=True)

In [None]:
# Helper that chooses the number of intervals with Scott's rule
def nclass_scott(x):
    """Return the number of histogram intervals suggested by Scott's rule.

    The bin width is ``h = 3.5 * s * n**(-1/3)``, where ``s`` is the sample
    standard deviation (``ddof=1``) and ``n`` the number of observations; the
    result is ``ceil(range / h)``.

    Parameters
    ----------
    x : array-like of numbers
        One-dimensional sample (list, NumPy array or pandas Series).

    Returns
    -------
    int
        Number of intervals, always at least 1. A single interval is returned
        when the width cannot be estimated (fewer than two observations, zero
        variance, or a NaN width).
    """
    values = np.asarray(x, dtype=float)
    n = values.size
    if n < 2:
        return 1
    # Float literals keep the exponent at exactly -1/3 on Python 2 as well,
    # where -1/3 would be integer division.
    h = 3.5 * np.std(values, ddof=1) * n ** (-1.0 / 3.0)
    # A constant column gives h == 0; fall back to one interval instead of
    # dividing by zero. "not h > 0" also catches NaN.
    if not h > 0:
        return 1
    return int(m.ceil((values.max() - values.min()) / h))

In [None]:
# Discretize every column of a DataFrame
def disc_ew(df,out):
    """Discretize each column of ``df`` with ``disc_col_ew``.

    The interval-count argument for each column comes from ``nclass_scott``;
    ``out`` is forwarded unchanged to ``disc_col_ew``. Returns a new DataFrame
    with one discretized column per input column.
    """
    disc = pd.DataFrame()
    for col in df.columns:
        n_bins = nclass_scott(df[col])
        disc[col] = disc_col_ew(df, col, n_bins, out)
    return disc

In [None]:
# Discretize the columns of the diabetes predictor matrix X (numeric bin codes)
diab_disc = disc_ew(X, "num")

In [None]:
# 10-fold cross-validation of Multinomial Naive Bayes on the discretized predictors.
# .values is used instead of as_matrix(), which was deprecated and later
# removed from pandas.
X1d = diab_disc.values
clf = MultinomialNB()
scores = cross_val_score(clf, X1d, y1, cv=10)
scores

In [None]:
# Mean and standard deviation of the fold accuracies
print("Accuracy by CV: %0.3f (+/- %0.3f)" % (np.mean(scores), np.std(scores)))

### Second Method: Applying Bernoulli Naive Bayes. First, each attribute of the dataset is discretized into two values using its median as the cut point (binarization)

In [None]:
# Binarize each predictor using its median as the cut point
medX = X.median(axis=0)
predictors = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
# Column-wise comparison against the medians: 1 where the value is strictly
# below the column median, 0 otherwise.
Xd = (X[predictors] < medX[predictors]).astype(int)
# .values is used instead of as_matrix(), which was deprecated and later
# removed from pandas.
X2 = Xd.values
clf = BernoulliNB()
scores = cross_val_score(clf, X2, y1, cv=10)
scores

In [None]:
# Mean and standard deviation of the fold accuracies
print("Accuracy by CV : %0.3f (+/- %0.2f)" % (np.mean(scores), np.std(scores)))

### Third Method: Considering that each attribute is continuous and applying Gaussian Naive Bayes

In [None]:
# Compute the posterior probabilities and the predicted classes on the training data
clf = GaussianNB()
clf.fit(X1, y1)
proba = pd.DataFrame(clf.predict_proba(X1))
pred = clf.predict(X1)
proba['pred'] = pred
# The mean of the element-wise matches is the resubstitution accuracy; this
# avoids hard-coding the number of samples. Single-argument print(...) calls
# run on both Python 2 and Python 3.
print("Accuracy by Resubstitution %0.3f" % np.mean(pred == y1))
print("Posterior probabilities")
proba.head()

In [None]:
# Estimate the accuracy by the holdout method using 50 random splits
pred = []
for i in range(50):
    # Seeding each split with its index keeps the estimate reproducible across
    # reruns while still drawing a different split on every iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        X1, y1, test_size=0.33, random_state=i)
    naivediab = GaussianNB().fit(X_train, y_train)
    pred.append(naivediab.score(X_test, y_test))
print("Accuracy by holdout: %0.3f (+/- %0.3f)" % (np.mean(pred), np.std(pred)))

In [None]:
# Accuracy from 10 repetitions of cross-validation with 10 splits each
clf = GaussianNB()
pred1 = []
for i in range(10):
    # ShuffleSplit draws independent random train/test splits rather than
    # disjoint folds. n_splits=10 is its default, spelled out here; the seed
    # makes every repetition reproducible yet distinct.
    cv = ShuffleSplit(n_splits=10, random_state=i)
    scores = cross_val_score(clf, X1, y1, cv=cv)
    pred1.append(scores)
# Single-argument print(...) runs on both Python 2 and Python 3.
print("Accuracy by cross validacion= %0.3f +/- %0.3f" % (np.mean(pred1), np.std(pred1)))

In [None]:
# Load the UCI auto-mpg data set.
# The file has no header row, so column names are supplied explicitly;
# otherwise the first record would be consumed as the header. Missing
# horsepower values are encoded as '?' in the file.
# NOTE(review): column names follow the UCI attribute list - confirm against auto-mpg.names.
a = pd.read_table(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
    sep=r'\s+',
    header=None,
    names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model_year', 'origin', 'car_name'],
    na_values='?')
a.info()