In [1]:
import numpy as np
import pandas as pd

### Naive Bayes Classifier

#### Binary

In [2]:
data = pd.read_csv('sampleweather.csv')
weather = data['Weather'].tolist()
temp = data['Temperature'].tolist()
play = data['Play'].tolist()
display(data)

Unnamed: 0,Weather,Temperature,Play
0,Sunny,Hot,No
1,Sunny,Hot,No
2,Overcast,Hot,Yes
3,Rainy,Mild,Yes
4,Rainy,Cool,Yes
5,Rainy,Cool,Yes
6,Overcast,Cool,No
7,Sunny,Mild,Yes
8,Sunny,Cool,No
9,Rainy,Mild,Yes


In [3]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
weather_encoded = le.fit_transform(weather)
temp_encoded = le.fit_transform(temp)
label = le.fit_transform(play)

print('Weather :', weather_encoded)
print('Temperature :', temp_encoded)
print('Play :', label)

Weather : [2 2 0 1 1 1 0 2 2 1 2 0 0 1]
Temperature : [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
Play : [0 0 1 1 1 1 0 1 0 1 1 1 1 0]


In [4]:
features = list(zip(weather_encoded, temp_encoded))

In [5]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(features, label)

# Predict Overcast & Mild
predicted = model.predict([[0, 2]])
print('Predicted :', predicted)

Predicted : [1]


#### Multiple

In [6]:
from sklearn import datasets

wine = datasets.load_wine()

In [7]:
wine.data[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [8]:
print('Features :\n', wine.feature_names)
print()
print('Labels :\n', wine.target_names)

Features :
 ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']

Labels :
 ['class_0' 'class_1' 'class_2']


In [9]:
wine.data.shape

(178, 13)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(wine.data, 
                                                    wine.target, 
                                                    test_size=.3,
                                                    random_state=109)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [11]:
pd.DataFrame({'Predicted': y_pred,
              'Actual': y_test})

Unnamed: 0,Predicted,Actual
0,0,0
1,0,0
2,1,1
3,2,2
4,0,0
5,1,1
6,0,0
7,0,1
8,1,1
9,0,0


In [12]:
from sklearn import metrics

print('Accuracy = {:.2f}%'.format(metrics.accuracy_score(y_test, y_pred)*100))

Accuracy = 90.74%


### Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
pima = pd.read_csv('diabetes.csv', header=None, names=['pregnant', 'glucose',
                                                       'bp', 'skin', 'insulin',
                                                       'bmi', 'pedigree', 
                                                       'age', 'label'])
print(pima.info())
display(pima.head())
pima.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pregnant  769 non-null    object
 1   glucose   769 non-null    object
 2   bp        769 non-null    object
 3   skin      769 non-null    object
 4   insulin   769 non-null    object
 5   bmi       769 non-null    object
 6   pedigree  769 non-null    object
 7   age       769 non-null    object
 8   label     769 non-null    object
dtypes: object(9)
memory usage: 54.2+ KB
None


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


In [15]:
for col in pima.columns:
    pima[col] = pd.to_numeric(pima[col], errors='coerce')
print(pima.info())
display(pima.head())
pima.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 769 entries, 0 to 768
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pregnant  768 non-null    float64
 1   glucose   768 non-null    float64
 2   bp        768 non-null    float64
 3   skin      768 non-null    float64
 4   insulin   768 non-null    float64
 5   bmi       768 non-null    float64
 6   pedigree  768 non-null    float64
 7   age       768 non-null    float64
 8   label     768 non-null    float64
dtypes: float64(9)
memory usage: 60.1 KB
None


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,,,,,,,,,
1,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
2,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
3,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
4,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0


In [16]:
features = pima.columns.drop(['label']).tolist()
X = pima[features]
y = pima['label'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=.3, 
                                                    random_state=1)
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('Accuracy : {:.2f}%'.format(metrics.accuracy_score(y_test, y_pred)*100))

Accuracy : 70.13%


In [17]:
import sklearn

In [18]:
sklearn.__version__

'0.24.1'

In [19]:
import sklearn.tree as tree
import pydotplus
from six import StringIO
from IPython.display import Image

In [20]:
dot_data = StringIO()
tree.export_graphviz(clf, 
                     out_file=dot_data,
                     class_names=['0', '1'], 
                     feature_names=features,
                     filled=True,
                     rounded=True,
                     special_characters=True)

grap = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(grap.create_png())

InvocationException: GraphViz's executables not found

### Random Forest

In [24]:
iris = datasets.load_iris()
print('X :\n', iris.feature_names)
print('y :\n', iris.target_names)

X :
 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
y :
 ['setosa' 'versicolor' 'virginica']


In [25]:
D = pd.DataFrame({'sepal length': iris.data[:,0],
                  'sepal width': iris.data[:,1],
                  'petal length': iris.data[:,2],
                  'petal width': iris.data[:,3],
                  'species': iris.target})
D

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [27]:
X = D[D.columns.drop('species')]
y = D['species']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=.3)

In [33]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy = {:.2f}%'.format(metrics.accuracy_score(y_test, y_pred)*100))

Accuracy = 97.78%


In [35]:
pd.Series(clf.feature_importances_, index=iris.feature_names).sort_values(ascending=False)

petal length (cm)    0.471772
petal width (cm)     0.420384
sepal length (cm)    0.083292
sepal width (cm)     0.024552
dtype: float64

### SVM

In [36]:
cancer = datasets.load_breast_cancer()
print('Features :\n', cancer.feature_names)
print('Labels :\n', cancer.target_names)

Features :
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Labels :
 ['malignant' 'benign']


In [38]:
cancer.data.shape

(569, 30)

In [39]:
cancer.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [40]:
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=.3,
                                                    random_state=109)

In [45]:
from sklearn import svm

clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print('Accuracy = {:.2f}%'.format(metrics.accuracy_score(y_test, y_pred)*100))
print('Precision = {:.2f}%'.format(metrics.precision_score(y_test, y_pred)*100))
print('Recall = {:.2f}%'.format(metrics.recall_score(y_test, y_pred)*100))

Accuracy = 96.49%
Precision = 98.11%
Recall = 96.30%


### Multiple Algorithm

In [46]:
train = pd.read_csv('Final_Dataset/train.csv')
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [47]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [48]:
train.fillna(train.mean(), inplace=True)
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [49]:
train['Gender'].fillna(train['Gender'].mode()[0], inplace=True)
train['Married'].fillna(train['Married'].mode()[0], inplace=True)
train['Dependents'].fillna(train['Dependents'].mode()[0], inplace=True)
train['Self_Employed'].fillna(train['Self_Employed'].mode()[0], inplace=True)

In [50]:
train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [52]:
train['Loan_Amount_Term'] = np.log(train['Loan_Amount_Term'])

In [55]:
X = train.drop('Loan_Status', 1)
y = train['Loan_Status']

In [58]:
X.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,5.886104,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,5.886104,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,5.886104,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,5.886104,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,5.886104,1.0,Urban


In [60]:
X = pd.get_dummies(X)
train = pd.get_dummies(train)

In [61]:
X.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID_LP001002,Loan_ID_LP001003,Loan_ID_LP001005,Loan_ID_LP001006,Loan_ID_LP001008,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,146.412162,5.886104,1.0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
1,4583,1508.0,128.0,5.886104,1.0,0,1,0,0,0,...,1,0,0,1,0,1,0,1,0,0
2,3000,0.0,66.0,5.886104,1.0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
3,2583,2358.0,120.0,5.886104,1.0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
4,6000,0.0,141.0,5.886104,1.0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,1


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [85]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Logistic Regression
from sklearn.linear_model import LogisticRegression

model_reg = LogisticRegression(max_iter=150)
model_reg.fit(X_train, y_train)

y_pred_reg = model_reg.predict(X_test)

print('Logistic Regression')
acc_reg = accuracy_score(y_test, y_pred_reg)*100
print('Accuracy: {:.2f}%'.format(acc_reg))
matrix = confusion_matrix(y_test, y_pred_reg)
print(matrix)

Logistic Regression
Accuracy: 81.30%
[[15 21]
 [ 2 85]]


In [96]:
# Decision Tree

model_dt = tree.DecisionTreeClassifier()
model_dt.fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)
print('Decision Tree')
acc_dt = accuracy_score(y_test, y_pred_dt)*100
print('Accuracy: {:.2f}%'.format(acc_dt))
matrix = confusion_matrix(y_test, y_pred_dt)
print(matrix)

Decision Tree
Accuracy: 80.49%
[[19 17]
 [ 7 80]]


In [87]:
# Random Forest

model_for = RandomForestClassifier()
model_for.fit(X_train, y_train)

y_pred_for = model_for.predict(X_test)
print('Random Forest')
acc_for = accuracy_score(y_test, y_pred_for)*100
print('Accuracy: {:.2f}%'.format(acc_for))
matrix = confusion_matrix(y_test, y_pred_for)
print(matrix)

Random Forest
Accuracy: 82.93%
[[16 20]
 [ 1 86]]


In [88]:
# SVM
model_svm = svm.SVC()
model_svm.fit(X_train, y_train)

y_pred_svm = model_svm.predict(X_test)
print('SVM')
acc_svm = accuracy_score(y_test, y_pred_svm)*100
print('Accuracy: {:.2f}%'.format(acc_svm))
matrix = confusion_matrix(y_test, y_pred_svm)
print(matrix)

SVM
Accuracy: 69.92%
[[ 0 36]
 [ 1 86]]


In [90]:
# Naive Bayes
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)

y_pred_nb = model_nb.predict(X_test)
print('Naive Bayes')
acc_nb = accuracy_score(y_test, y_pred_nb)*100
print('Accuracy: {:.2f}%'.format(acc_nb))
matrix = confusion_matrix(y_test, y_pred_nb)
print(matrix)

Naive Bayes
Accuracy: 77.24%
[[19 17]
 [11 76]]


In [91]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

y_pred_knn = model_knn.predict(X_test)
print('KNN')
acc_knn = accuracy_score(y_test, y_pred_knn)*100
print('Accuracy: {:.2f}%'.format(acc_knn))
matrix = confusion_matrix(y_test, y_pred_knn)
print(matrix)

KNN
Accuracy: 60.98%
[[ 8 28]
 [20 67]]


In [97]:
summary_model = pd.DataFrame({'Algorithm': ['Logistic Regression', 
                                            'Decision Tree', 
                                            'Random Forest', 
                                            'SVM', 
                                            'Naive Bayes',
                                            'KNN'],
                              'Accuracy': [acc_reg,
                                           acc_dt,
                                           acc_for,
                                           acc_svm,
                                           acc_nb, 
                                           acc_knn]})
summary_model

Unnamed: 0,Algorithm,Accuracy
0,Logistic Regression,81.300813
1,Decision Tree,80.487805
2,Random Forest,82.926829
3,SVM,69.918699
4,Naive Bayes,77.235772
5,KNN,60.97561


In [None]:
from sklearn.model_selection import 

In [None]:
post = 'saya sedang marah'
post = 'marah'
'marah' --> positif 8
'marah' --> negative 12
'marah' --> neutral 0