# Naive Bayes Classifier Building in Scikit-learn

## Defining Dataset

In [1]:
# Assigning features and label variables

# First Feature
weather = ["Sunny", "Sunny", "Overcast", "Rainy", "Rainy", "Rainy", "Overcast",
           "Sunny", "Sunny", "Rainy", "Sunny", "Overcast", "Overcast", "Rainy"]

# Second Feature
temp = ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool",
        "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"]

# Label or Terget Variable
play = ["No", "No", "Yes", "Yes", "Yes", "No", "Yes",
        "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]

## Encoding Features

Pertama, kita perlu mengubah string menjadi angka, misalnya: 'Overcast', 'Rainy', 'Sunny' sebagai 0, 1, 2. Ini dikenal sebagai label encoding. Scikit-learn menyediakan pustaka LabelEncoder untuk mengenkode label dengan nilai.

In [2]:
# Import labelEncoder
from sklearn import preprocessing

# Creating labelEncoder
le = preprocessing.LabelEncoder()

# Converting string labels into numbers.
weather_encoded = le.fit_transform(weather)

print(weather_encoded)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]


In [3]:
# Converting string labels into numbers
temp_encoded = le.fit_transform(temp)
label = le.fit_transform(play)

print("Temp:", temp_encoded)
print("Play:", label)

Temp: [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
Play: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [4]:
# Combining weather and temp into single listof tuples

features = list(zip(weather_encoded, temp_encoded))

features

[(2, 1),
 (2, 1),
 (0, 1),
 (1, 2),
 (1, 0),
 (1, 0),
 (0, 0),
 (2, 2),
 (2, 0),
 (1, 2),
 (2, 2),
 (0, 2),
 (0, 1),
 (1, 2)]

In [5]:
# Import Gaussian Naive Bayes Model
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(features, label)

# Predict Output
predicted = model.predict([[0, 2]]) # --> 0: Overcast, 2: Mild
print("Predicted Value:",predicted)

Predicted Value: [1]


## Naive Bayes with Multiple Labels

### Loading Data

In [6]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load Dataset
wine = datasets.load_wine()

In [7]:
# Print the names of the 13 Features
print("Features: ", wine.feature_names)

# Print the label type of wine(class_0, class_1, class_2)
print("Labels: ", wine.target_names)

Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Labels:  ['class_0' 'class_1' 'class_2']


In [8]:
# Print data(feature)shape
wine.data.shape

(178, 13)

In [9]:
# Print the wine data features (top 5 records)
print(wine.data[:5])

[[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
  2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
  2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
  3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
 [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
  2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
 [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
  3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]


In [10]:
# Print the wine labels(class_0, class_1, class_2)
print(wine.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [11]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.3, random_state=109)  # 70% training and 30% test

In [12]:
# Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Classifier
gnb = GaussianNB()

# Train the model using the training sets
gnb.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = gnb.predict(X_test)

In [13]:
#Import scikit learn metrics module for accuracy calculation
from sklearn import metrics

# Model accuracy, how often is the classifier correct?
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.9074074074074074


# Decision Tree Classifier Building in Scikit-learn

In [14]:
# Load libraries
import pandas as pd

# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Import train_test_split function
from sklearn.model_selection import train_test_split

# import sikit-learn metrics module for accuracy calculation
from sklearn import metrics

In [15]:
col_names = ["pregnant", "glucose", "bp", "skin",
             "insulin", "bmi", "pedigree", "age", "label"]

# load dataset
pima = pd.read_csv(
    "https://raw.githubusercontent.com/ardhiraka/PFDS_sources/master/diabetes.csv", header=None, names=col_names)

In [16]:
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


## Feature Selection

In [17]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pregnant  769 non-null    object
 1   glucose   769 non-null    object
 2   bp        769 non-null    object
 3   skin      769 non-null    object
 4   insulin   769 non-null    object
 5   bmi       769 non-null    object
 6   pedigree  769 non-null    object
 7   age       769 non-null    object
 8   label     769 non-null    object
dtypes: object(9)
memory usage: 54.2+ KB


In [18]:
numer = ["pregnant", "insulin", "bmi", "age", "glucose", "bp", "pedigree", "label"]

for col in numer: # Coerce for missing values
    pima[col] = pd.to_numeric(pima[col], errors="coerce")

In [19]:
pima.dropna(inplace=True)

In [20]:
# Split dataset in features and target variable
feature_cols = ["pregnant", "insulin", "bmi", "age", "glucose", "bp", "pedigree"]

# Features
X = pima[feature_cols]
# Target Varible
y = pima.label

In [21]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # --> 70% training and 30% test

In [22]:
# Create Decision Tree Classifier Object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

In [23]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7056277056277056


##  Visualizing Decision Tree

In [24]:
import sklearn.tree as tree
import pydotplus
from six import StringIO
from IPython.display import Image

dot_data = StringIO()
tree.export_graphviz(
    clf,
    out_file=dot_data,
    # the target names
    class_names=["0", "1"],
    # the features names
    feature_names=feature_cols,
    filled=True,
    rounded=True,
    special_characters=True
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

ModuleNotFoundError: No module named 'pydotplus'

<b>Catatan</b>

module sklearn.externals.six was removed in version 0.23. if you want to use this module, you have to downgrade to version 0.22 or lower.

dari

from sklearn.externals.six import StringIO

menajdi

from six import StringIO


In [None]:
# Create Decision Tree Classifier Object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier work?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
import sklearn.tree as tree
import pydotplus
from six import StringIO
from IPython.display import Image

dot_data = StringIO()
tree.export_graphviz(
    clf,
    out_file=dot_data,
    # the target names
    class_names=["0", "1"],
    # the features names
    feature_names=feature_cols,
    filled=True,
    rounded=True,
    special_characters=True
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

### Building a Classifier using Scikit-learn

In [None]:
# Import scikit-learn dataset library
from sklearn import datasets

# load dataset
iris = datasets.load_iris()

In [None]:
# Print the label species (setosa, versicolor, virginica)
print(iris.target_names)

# Print the names of the four features
print(iris.feature_names)

In [None]:
# Print the iris data (top 5 records)
print(iris.data[:5])

# Print the iris labels (0: Setosa, 1: Versicolor, 2: Virginica)
print(iris.target)

In [None]:
# Creating DataFrame of given iris dataset.
import pandas as pd

data = pd.DataFrame({
    "sepal length":iris.data[:,0],
    "sepal width":iris.data[:,1],
    "petal length":iris.data[:,2],
    "petal width":iris.data[:,3],
    "species":iris.target
})

data.head()

In [None]:
# Memisahkan Kolom menjadi Variabel dependen dan independen (features dan labels)

# Import train_test_split function
from sklearn.model_selection import train_test_split

# Features
X = data[[
    "sepal length",
    "sepal width",
    "petal length",
    "petal width",
]]
# labels
y = data["species"]

# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
# Melakukan train dari data testing

# Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create a Gaussian Classifier
cld = RandomForestClassifier(n_estimators=100)

# Train the model using the training sets y_pred = clf.predict(X_test)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [None]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Kita juga dapat membuat prediksi untuk satu item, misalnya:

<ul>
    <li>sepal length = 3</li>
    <li>sepal width = 5</li>
    <li>petal length = 4</li>
    <li>petal width = 2</li>
</ul>

In [None]:
clf.predict([[
    3,
    5,
    4,
    2
]])

##  Finding Important Features in Scikit-learn

In [None]:
# Import Module
from sklearn.ensemble import RandomForestClassifier

# Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

# Train the model using the training sets y_pred = clf.predict(X_test)
clf.fit(X_train, y_train)

In [None]:
RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=None, 
    max_features='auto', 
    max_leaf_nodes=None, 
    min_impurity_decrease=0.0, 
    min_impurity_split=None,                   
    min_samples_leaf=1, 
    min_samples_split=2,            
    min_weight_fraction_leaf=0.0, 
    n_estimators=100, 
    n_jobs=1,            
    oob_score=False, random_state=None, 
    verbose=0,            
    warm_start=False
)

In [None]:
import pandas as pd

feature_imp = pd.Series(clf.feature_importances_, index=iris.feature_names).sort_values(ascending=False)

feature_imp

In [None]:
# Visualisasi

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Creating a bar plot
sns.barplot(x = feature_imp, y = feature_imp.index)

# Add labels to your graph
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")

plt.show()

##  Generating the Model on Selected Features

In [None]:
# Import train_test_split function

# Import sklearn.cross_validation sudah berubah menjadi sklearn.model_selection
from sklearn.model_selection import train_test_split

# Split dataset into features and labels
X = data[[
    "petal length",
    "petal width",
    "sepal length"
]]
y = data["species"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.70, random_state=5)

### Keterangan
Setalah dipisah selanjutnya dilakukan training set features yang dipilih, dan melakukan prediksi pada test set featur yang dipilih serta membandingkan nilai aktual dan nilai prediksi

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Creatue a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

# Train the model using the training sets y_pred = clf.predict(X_test)
clf.fit(X_train, y_train)

# Prediction on test set
y_pred = clf.predict(X_test)

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

### Loading Data

In [None]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load dataset
cancer = datasets.load_breast_cancer()

### Exploring Data

In [None]:
# print the names of the 13 features
print("Features: ", cancer.feature_names)

# Print the label type of cancer("milignant" "benign")
print("Labels: ", cancer.target_names)

In [None]:
# print data(feature)shape
cancer.data.shape

In [None]:
# print the cancer data features (top 5 records)
print(cancer.data[:5])

In [None]:
# print the cancer labels (0:malignant, 1:beningn)
print(cancer.target)

### Splitting Data

In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.3, random_state=109)  # -> 70% training and 30% test

### Generating Model

In [None]:
# Import svm model
from sklearn import svm

# create a sum Classifier
clf = svm.SVC(kernel="linear") # --> Linear Kernel

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

### Evaluating the Model

In [None]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

print("Kita  mendapat tingkat klasifikasi", round(metrics.accuracy_score(y_test, y_pred) * 100, 2) ,"% dianggap sebagai akurasi yang sangat baik.")

In [None]:
# Model Precision: What percentage of positive tuples are labeled as such?
print("Precision:", metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", metrics.recall_score(y_test, y_pred))

##  Exercise: Multiple Algorithm on Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
train = pd.read_csv('https://raw.githubusercontent.com/ardhiraka/PFDS_sources/master/Final_Dataset/train.csv')

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
print("Train Data:")
print(train.isnull().sum(), "\n")

In [None]:
train.fillna(train.mean(), inplace=True)
train.isnull().sum()

In [None]:
train.Gender.fillna(train.Gender.mode()[0],inplace=True)

train.Married.fillna(train.Married.mode()[0],inplace=True)

train.Dependents.fillna(train.Dependents.mode()[0],inplace=True) 

train.Self_Employed.fillna(train.Self_Employed.mode()[0],inplace=True)  

train.isnull().sum() 

In [None]:
train.Loan_Amount_Term=np.log(train.Loan_Amount_Term)

In [None]:
X = train.drop("Loan_Status", 1)
y = train.Loan_Status

In [None]:
X = pd.get_dummies(X)
train = pd.get_dummies(train)

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2)

In [None]:
#(a) LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(x_train, y_train)

In [None]:
pred_cv = model.predict(x_cv)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(accuracy_score(y_cv, pred_cv))
matrix = confusion_matrix(y_cv, pred_cv)
print(matrix)

In [None]:
#(b)DECISION TREE ALGORITHM
from sklearn import tree
dt=tree.DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [None]:
pred_cv1 = dt.predict(x_cv)

In [None]:
print(accuracy_score(y_cv, pred_cv1))
matrix1 = confusion_matrix(y_cv, pred_cv1)
print(matrix1)

In [None]:
#(c)RANDOM FOREST ALGORITHM

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

In [None]:
pred_cv2 = rf.predict(x_cv)

In [None]:
print(accuracy_score(y_cv, pred_cv2))
matrix2 = confusion_matrix(y_cv, pred_cv2)
print(matrix2)

In [None]:
#(d)SUPPORT VECTOR MACHINE (SVM) ALGORITHM

from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(x_train, y_train)

In [None]:
pred_cv3 = svm_model.predict(x_cv)

In [None]:
print(accuracy_score(y_cv, pred_cv3))
matrix3 = confusion_matrix(y_cv, pred_cv3)
print(matrix3)

In [None]:
#(e)NAIVE BAYES ALGORITHM

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)

pred_cv4 = nb.predict(x_cv)

print(accuracy_score(y_cv, pred_cv4))
matrix4 = confusion_matrix(y_cv, pred_cv4)
print(matrix4)

In [None]:
#(f)K-NEAREST NEIGHBOR(kNN) ALGORITHM

from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier()
kNN.fit(x_train, y_train)

pred_cv5 = kNN.predict(x_cv)

print(accuracy_score(y_cv, pred_cv5))
matrix5 = confusion_matrix(y_cv, pred_cv5)
print(matrix5)

In [None]:
print("Logistic Regression:", accuracy_score(y_cv, pred_cv))
print("Decision Tree:", accuracy_score(y_cv, pred_cv1))
print("Random Forest:", accuracy_score(y_cv, pred_cv2))
print("SVM:", accuracy_score(y_cv, pred_cv3))
print("Naive Bayes:", accuracy_score(y_cv, pred_cv4))
print("KNN:", accuracy_score(y_cv, pred_cv5))

In [None]:
#Write test results in csv file

predictions = pd.DataFrame(pred_cv2, columns=["predictions"]).to_csv("H8_NB_Credit_Predictions.csv")