In [1]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import RandomizedSearchCV, train_test_split, ShuffleSplit
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, auc, roc_curve, precision_score
from sklearn.metrics import recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, LabelBinarizer

from sklearn import datasets

In [2]:
# Load the "play tennis" data set from the data directory next to the notebook.

folderName = '../data/'
fileName   = 'tennis.csv'

csv_path = folderName + fileName
df = pd.read_csv(csv_path)

# Show the first rows as the cell output.
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [3]:
# Integer-encode every categorical column of the raw frame into df2.
#
# One LabelEncoder is fitted per column and kept in `encoders`, so each
# column's string <-> integer mapping stays available afterwards
# (encoders['Outlook'].classes_, encoders['Outlook'].inverse_transform(...)).
# Sharing a single encoder and re-fitting it for every column would leave
# only the last column's mapping behind.
# LabelEncoder numbers the classes in sorted order.
features = ["Outlook", "Temperature", "Humidity", "Wind"]
target = "Play Tennis"

df2 = df.copy()

encoders = {}
for column in features + [target]:
    encoders[column] = LabelEncoder()
    df2[column] = encoders[column].fit_transform(df[column])

# Kept for backward compatibility: `number` is the encoder fitted on the target.
number = encoders[target]

In [4]:
# Multinomial Naive Bayes
# NOTE(review): MultinomialNB is meant for count-like features; here it is fed
# the integer category codes from the label-encoding cell.
clf = MultinomialNB()

clf.fit(df2[features], df2[target])

# New day to classify: Outlook=Sunny, Temperature=Hot, Humidity=Normal, Wind=Weak.
# The codes follow LabelEncoder's sorted class order:
#   Outlook:  Overcast=0, Rain=1, Sunny=2    Temperature: Cool=0, Hot=1, Mild=2
#   Humidity: High=0, Normal=1               Wind:        Strong=0, Weak=1
new = np.array([2, 1, 1, 1])

# Wrap the sample in a one-row frame with the training column names, so that
# scikit-learn versions that validate feature names do not warn at predict time.
new_sample = pd.DataFrame(new.reshape(1, -1), columns=features)

print('Predicted Class:', clf.predict(new_sample))
print('Probability:', clf.predict_proba(new_sample))

Predicted Class: [1]
Probability: [[ 0.34426593  0.65573407]]


In [5]:
# Gaussian Naive Bayes
# NOTE(review): GaussianNB models each feature as a continuous Gaussian; here
# it is fed the integer category codes from the label-encoding cell.
clf = GaussianNB()

clf.fit(df2[features], df2[target])

# New day to classify: Outlook=Sunny, Temperature=Hot, Humidity=Normal, Wind=Weak.
# The codes follow LabelEncoder's sorted class order:
#   Outlook:  Overcast=0, Rain=1, Sunny=2    Temperature: Cool=0, Hot=1, Mild=2
#   Humidity: High=0, Normal=1               Wind:        Strong=0, Weak=1
new = np.array([2, 1, 1, 1])

# Wrap the sample in a one-row frame with the training column names, so that
# scikit-learn versions that validate feature names do not warn at predict time.
new_sample = pd.DataFrame(new.reshape(1, -1), columns=features)

print('Predicted Class:', clf.predict(new_sample))
print('Probability:', clf.predict_proba(new_sample))

Predicted Class: [1]
Probability: [[ 0.22541203  0.77458797]]


In [6]:
# Build the binary feature matrix df3:
#   * three-valued features (Outlook, Temperature) become one indicator
#     column per category;
#   * two-valued features (Humidity, Wind) become a single 0/1 column each.
binar = LabelBinarizer()

outlook, temp = (
    pd.get_dummies(df[name]) for name in ('Outlook', 'Temperature')
)

humid, wind = (
    pd.DataFrame(binar.fit_transform(df[name]), columns=[name])
    for name in ('Humidity', 'Wind')
)

# Column order: outlook indicators, temperature indicators, Humidity, Wind.
binary_parts = [outlook, temp, humid, wind]
df3 = pd.concat(binary_parts, axis=1)

In [7]:
# Preview the first five rows of the binary feature matrix.
df3.head(5)

Unnamed: 0,Overcast,Rain,Sunny,Cool,Hot,Mild,Humidity,Wind
0,0,0,1,0,1,0,0,1
1,0,0,1,0,1,0,0,0
2,1,0,0,0,1,0,0,1
3,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,1,1


In [8]:
# Bernoulli Naive Bayes
clf = BernoulliNB()

clf.fit(df3, df2[target])

# New day to classify: Outlook=Sunny, Temperature=Hot, Humidity=Normal, Wind=Weak,
# laid out in df3's column order:
#   Overcast, Rain, Sunny, Cool, Hot, Mild, Humidity (Normal=1), Wind (Weak=1)
new = np.array([0,0,1,0,1,0,1,1])

# Wrap the sample in a one-row frame with df3's column names, so that
# scikit-learn versions that validate feature names do not warn at predict time.
new_sample = pd.DataFrame(new.reshape(1, -1), columns=df3.columns)

print('Predicted Class:', clf.predict(new_sample))
print('Probability:', clf.predict_proba(new_sample))

Predicted Class: [1]
Probability: [[ 0.47855378  0.52144622]]


In [9]:
# ----------- CROSS VALIDATION ----------- #
# 10-fold cross validation of `clf` (the estimator bound by the preceding
# cells) on the binary feature matrix. Per-fold accuracies are collected in
# sc_mean_train / sc_mean_test and averaged in the next cell.
sc_mean_test=[]
sc_mean_train=[]

x = df3
y = df2[target]

cv = KFold(n_splits=10) # K-fold Cross Validation method (no shuffling)
# Split the matrix that is actually indexed below (x, not the raw df), so the
# generated row positions always match the data being sliced.
for train_index, test_index in cv.split(x.values):
    X_train, X_test = x.values[train_index,:], x.values[test_index,:]
    # The splitter yields row positions, so slice the target positionally;
    # plain y[...] is label-based and only works with the default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf.fit(X_train, y_train)
    predicted_train = clf.predict(X_train)
    sc_mean_train.append(accuracy_score(y_train, predicted_train))

    predicted_test = clf.predict(X_test)
    sc_mean_test.append(accuracy_score(y_test, predicted_test))

In [10]:
# Convert the per-fold accuracy lists into NumPy arrays.
sc_mean_train = np.array(sc_mean_train)
sc_mean_test = np.array(sc_mean_test)

# Cell output: (mean training accuracy, mean test accuracy) over the folds.
np.mean(sc_mean_train), np.mean(sc_mean_test)

(0.88141025641025639, 0.55000000000000004)

In [11]:
# ----------- Leave-one-out ----------- #
# Leave-one-out validation of `clf` (the estimator bound by the preceding
# cells) on the binary feature matrix. Per-split accuracies are collected in
# sc_mean_train / sc_mean_test and averaged in the next cell.
sc_mean_test=[]
sc_mean_train=[]

x = df3
y = df2[target]

cv = LeaveOneOut() # Leave-one-out: every sample is the test set exactly once
# Split the matrix that is actually indexed below (x, not the raw df), so the
# generated row positions always match the data being sliced.
for train_index, test_index in cv.split(x.values):
    X_train, X_test = x.values[train_index,:], x.values[test_index,:]
    # The splitter yields row positions, so slice the target positionally;
    # plain y[...] is label-based and only works with the default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf.fit(X_train, y_train)
    predicted_train = clf.predict(X_train)
    sc_mean_train.append(accuracy_score(y_train, predicted_train))

    predicted_test = clf.predict(X_test)
    sc_mean_test.append(accuracy_score(y_test, predicted_test))

In [12]:
# Convert the per-split accuracy lists into NumPy arrays.
sc_mean_train = np.array(sc_mean_train)
sc_mean_test = np.array(sc_mean_test)

# Cell output: (mean training accuracy, mean test accuracy) over the splits.
np.mean(sc_mean_train), np.mean(sc_mean_test)

(0.88461538461538469, 0.6428571428571429)

In [13]:
# ----------- Hold out ----------- #
# Single stratified 70/30 train/test split of the binary feature matrix.
# Despite the "mean" in their names, sc_mean_train / sc_mean_test each hold one
# accuracy value here; the names are kept because the next cell prints them.
# (They are no longer pre-initialised to empty lists: those values were
# overwritten by the scalars below before ever being read.)
x = df3
y = df2[target]

# random_state fixes the split; stratify=y keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42, stratify=y
)

clf.fit(X_train, y_train)
predicted_train = clf.predict(X_train)
sc_mean_train = accuracy_score(y_train, predicted_train)

predicted_test = clf.predict(X_test)
sc_mean_test = accuracy_score(y_test, predicted_test)


In [14]:
# Hold-out accuracy: training split first, then test split.
print(*(sc_mean_train, sc_mean_test))

0.888888888889 0.6
