In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import random
from pprint import pprint
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [0]:
def load_csv(data, columns=None, header=None):
  """Read a CSV source into a DataFrame, optionally renaming its columns.

  Args:
    data: Path, URL or file-like object handed straight to ``pd.read_csv``.
    columns: Optional sequence of column names (list, tuple, NumPy array,
      ``pd.Index`` ...) assigned to the loaded frame. ``None`` keeps the
      names produced by ``pd.read_csv``.
    header: Forwarded to ``pd.read_csv``. The default ``None`` reads the
      file without treating any row as a header.

  Returns:
    The loaded ``pd.DataFrame``.
  """
  frame = pd.read_csv(data, header=header)
  # Identity check on purpose: `columns != None` is evaluated element-wise for
  # NumPy arrays and pandas Index objects, and the resulting array cannot be
  # used as an `if` condition ("truth value ... is ambiguous").
  if columns is not None:
    frame.columns = columns
  return frame

In [0]:
def train_test_split(X, y=None, split_ratio=0.3, random_state=123, stype="separate"):
  """Randomly partition rows into a train part and a test part.

  Every row is independently sent to the test part when a uniform draw in
  [0, 1) falls below ``split_ratio``, so the test share is only approximately
  ``split_ratio``.

  Side effect: NumPy's global random generator is re-seeded with
  ``random_state`` on every successful call.

  Args:
    X: Object exposing ``shape[0]`` and accepting a boolean mask in ``X[mask]``
      (e.g. a pandas DataFrame or a NumPy array).
    y: Targets aligned with ``X`` that accept the same boolean mask. Required
      when ``stype == "separate"``, ignored when ``stype == "whole"``.
    split_ratio: Per-row probability of landing in the test part.
    random_state: Seed passed to ``np.random.seed``.
    stype: ``"separate"`` to split ``X`` and ``y`` together, ``"whole"`` to
      split ``X`` only.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` for ``stype == "separate"``,
    ``(X_train, X_test)`` for ``stype == "whole"``.

  Raises:
    ValueError: If ``stype`` is not one of the two supported values, or if
      ``stype == "separate"`` and ``y`` is None.
  """
  # Fail loudly instead of silently returning None for an unknown mode.
  if stype not in ("separate", "whole"):
    raise ValueError(
        "stype must be 'separate' or 'whole', got {!r}".format(stype))
  # Without this check the default call train_test_split(X) dies with an
  # opaque "'NoneType' object is not subscriptable".
  if stype == "separate" and y is None:
    raise ValueError("y is required when stype='separate'")

  np.random.seed(random_state)
  # True marks a row that goes to the test part.
  in_test = np.random.rand(X.shape[0]) < split_ratio

  if stype == "whole":
    return X[~in_test], X[in_test]
  return X[~in_test], X[in_test], y[~in_test], y[in_test]

In [5]:
# Banknote-authentication data from the UCI repository. It is loaded with
# load_csv's default header=None, so the column names are supplied here.
columns = ["variance", "skewness", "curtosis", "entropy", "class"]
data = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00267/data_banknote_authentication.txt"
)
banknote = load_csv(data, columns=columns)
banknote.head()

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [6]:
# Features are every column except the last one; the last column is the target.
X, y = banknote.iloc[:, :-1], banknote.iloc[:, -1]
print(X.head(), y.head(), sep="\n")

   variance  skewness  curtosis  entropy
0   3.62160    8.6661   -2.8073 -0.44699
1   4.54590    8.1674   -2.4586 -1.46210
2   3.86600   -2.6383    1.9242  0.10645
3   3.45660    9.5228   -4.0112 -3.59440
4   0.32924   -4.4552    4.5718 -0.98880
0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64


In [7]:
# Split features and target together, then report the shape of each part.
X_train, X_test, y_train, y_test = train_test_split(X, y=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

# Split the full frame (features and target in one DataFrame) the same way.
data_train, data_test = train_test_split(banknote, stype="whole")
print(data_train.shape, data_test.shape, sep="\n")

(952, 4)
(420, 4)
(952,)
(420,)
(952, 5)
(420, 5)


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn import ensemble

In [9]:
# Shallow random forest using the entropy split criterion; fit() returns the
# estimator itself, so the fitted model is bound directly.
clf_entropy = RandomForestClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
).fit(X_train, y_train)
clf_entropy

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [10]:
# Accuracy of the entropy-criterion forest on the test rows, as a percentage.
y_pred = clf_entropy.predict(X_test)
print("Accuracy is ", 100 * accuracy_score(y_test, y_pred))

Accuracy is  95.47619047619048


In [11]:
# Same forest configuration as the entropy model, but with the Gini split
# criterion; fit() returns the estimator itself.
clf_gini = RandomForestClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
).fit(X_train, y_train)
clf_gini

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [12]:
# Accuracy of the Gini-criterion forest on the test rows, as a percentage.
y_pred = clf_gini.predict(X_test)
print("Accuracy is ", 100 * accuracy_score(y_test, y_pred))

Accuracy is  95.0
