In [23]:
#Importing the modules we need.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')

# Exploring the Dataset

In [24]:
#Reading the csv file and renaming the columns. 
bills = pd.read_csv("bill_authentication.csv")
bills.rename(columns={'Variance': 'Variance','Skewness':'Skewness',
                      'Curtosis':'Curtosis','Entropy':'Entropy',
                      'Class':'Class'}, inplace=True)
#Returning the top 5 rows of the dataframe.
bills.head()


Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [25]:
#Printing how many rows and columns the dataset has.
bills.shape

(1372, 5)

In [26]:
#Printing basic information about dataset. 
bills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
Variance    1372 non-null float64
Skewness    1372 non-null float64
Curtosis    1372 non-null float64
Entropy     1372 non-null float64
Class       1372 non-null int64
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [27]:
#Assigning X and Y values from dataset.
X = bills.iloc[:,:4].values
Y = bills.iloc[:,-1].values

In [28]:
#Dropping the specified label with drop function.
X = bills.drop('Class', axis=1)
Y = bills['Class']

In [29]:
#Splitting the data into test and train with test size %30.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# Scikit-learn Decision Tree Classifier Demonstration

In [30]:
#Using decision tree classifier to fit the train data.
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
#Calculating the accuracy of decision tree classifier and printing the parameters of decision tree classifier.
Y_pred = tree_classifier.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
tree_classifier.get_params()

Accuracy: 0.9830097087378641


{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

# Cross Validation DecisionTree

In [32]:
#Performing 10-fold cross validation.  
params={"criterion":["gini","entropy"], "max_depth": range(1,10), 
        "min_samples_split":range(2,10),"min_samples_leaf":range(1,5)}
grid=GridSearchCV(tree_classifier, param_grid=params, cv=10, verbose=1, n_jobs=-1)
grid.fit(X_train,Y_train)
depth = []
depth = []
for i in range(3,20):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    scores = cross_val_score(estimator=clf, X=X_train, y=Y_train, cv=10, n_jobs=4)
    depth.append((i,scores.mean()))
print(max(depth))

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed:   23.5s finished


(19, 0.9843734174353409)


In [33]:
#Finding the best parameter to give the best result using Grid Search.
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [34]:
#Printing the accuracy calculated with best parameter. 
grid.best_score_

0.9895833333333334

# Scikit-learn Random Forest Classifier Demonstration

In [35]:
#Creating a rfc object and fitting with train data.  
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [36]:
#Calculating the accuracy score for random forest classifier.
Y_rfcpred = rfc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, Y_rfcpred))

Accuracy: 0.9975728155339806


# Cross Validation for Random Forest

In [37]:
#Calculating accuracy score for 10 cross validation folds. 
from sklearn.model_selection import cross_val_score
acc = cross_val_score(estimator=rfc, X=X_train, y=Y_train, cv=10)
print("scores for each fold")
for val in acc:
    print(val)


scores for each fold
0.9896907216494846
0.979381443298969
0.9896907216494846
1.0
1.0
1.0
0.9894736842105263
1.0
0.9894736842105263
1.0


In [38]:
#Calculating the mean and standard deviation 
print(acc.mean())
print(acc.std())

0.9937710255018992
0.0068650763715342


# GRID SEARCH FOR RANDOM FOREST CLASSIFIER

In [39]:
#Defining the parameters for random forest classifier. 
params = {
    'n_estimators': [100, 300, 500],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [40]:
#Finding the best accuracy score and the best hyperparameter that gives the best result.
clf = GridSearchCV(RandomForestClassifier(), params, cv=10, n_jobs=-1)
clf.fit(X_train, Y_train)

print(clf.score(X_train, Y_train))
print(clf.best_params_)
print(clf.best_score_)


1.0
{'bootstrap': False, 'criterion': 'entropy', 'n_estimators': 100}
0.99375


# Comparing the Performances

In our example, random forest classifier has given a higher accuracy. 
It's a common fact that random forest algorithm generally gives better accuracies. 
Since the dataset we used was a generated one especially for classification and due to inappropriate data randomization the accuracies were too high. 
And they easily overfit which is a bad thing. 