In [2]:
import numpy as np
import pandas as pd
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Load the dataset from a CSV file located in the notebook's working directory.
data = pd.read_csv("bill_authentication.csv")

In [4]:
# Preview the first rows to check the column layout (the label column comes last).
data.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [5]:
def split_data_into_train_and_test(data, test_size=0.2, random_state=0):
    """Split a DataFrame into train/test feature matrices and label vectors.

    Every column except the last one is used as a feature; the last column is
    used as the label.

    NOTE(review): if the CSV was saved together with its index (e.g. an
    "Unnamed: 0" column), that column would be treated as a feature here --
    confirm against the actual file.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the target.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 0
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Negative slicing picks "all but the last" / "the last" column directly,
    # without computing the column count first.
    features = data.iloc[:, :-1].values
    labels = data.iloc[:, -1].values

    features_train, features_test, labels_train, labels_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return features_train, features_test, labels_train, labels_test
    
    
def scale_X_values(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    
def get_classifier(X_train, y_train, n_estimators=20, random_state=0):
    classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    classifier.fit(X_train, y_train)
    return classifier


def test_classifier(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:", conf_matrix, "\n", sep="\n")
    
    class_report = classification_report(y_test, y_pred)
    print("Classification_report:", class_report,"\n", sep="\n")
    
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy_score:", accuracy, sep="\n")
    
    return accuracy, class_report, conf_matrix


def proceed_testing_random_forest_classifier(data, n_estimators=20, test_size=0.2, random_state=0):
    X_train, X_test, y_train, y_test = split_data_into_train_and_test(data, test_size, random_state)
    scale_X_values(X_train, X_test)
    classifier = get_classifier(X_train, y_train, n_estimators=n_estimators, random_state=random_state)
    test_classifier(classifier, X_test, y_test)

In [6]:
# Baseline run with all defaults (n_estimators=20, test_size=0.2, random_state=0).
proceed_testing_random_forest_classifier(data)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


In [7]:
# Start of the n_estimators sweep; the second positional argument is n_estimators (here: 1 tree).
proceed_testing_random_forest_classifier(data, 1)

Confusion matrix:
[[153   4]
 [  7 111]]


Classification_report:
             precision    recall  f1-score   support

          0       0.96      0.97      0.97       157
          1       0.97      0.94      0.95       118

avg / total       0.96      0.96      0.96       275



Accuracy_score:
0.96


In [8]:
# n_estimators=2
proceed_testing_random_forest_classifier(data, 2)

Confusion matrix:
[[156   1]
 [  8 110]]


Classification_report:
             precision    recall  f1-score   support

          0       0.95      0.99      0.97       157
          1       0.99      0.93      0.96       118

avg / total       0.97      0.97      0.97       275



Accuracy_score:
0.9672727272727273


In [9]:
# n_estimators=3
proceed_testing_random_forest_classifier(data, 3)

Confusion matrix:
[[154   3]
 [  2 116]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.98      0.98       157
          1       0.97      0.98      0.98       118

avg / total       0.98      0.98      0.98       275



Accuracy_score:
0.9818181818181818


In [10]:
# n_estimators=4
proceed_testing_random_forest_classifier(data, 4)

Confusion matrix:
[[155   2]
 [  3 115]]


Classification_report:
             precision    recall  f1-score   support

          0       0.98      0.99      0.98       157
          1       0.98      0.97      0.98       118

avg / total       0.98      0.98      0.98       275



Accuracy_score:
0.9818181818181818


In [11]:
# n_estimators=5
proceed_testing_random_forest_classifier(data, 5)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


In [12]:
# n_estimators=6
proceed_testing_random_forest_classifier(data, 6)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


In [13]:
# n_estimators=20 -- identical to the function default, so this repeats the baseline run.
proceed_testing_random_forest_classifier(data, 20)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


In [14]:
# n_estimators=40
proceed_testing_random_forest_classifier(data, 40)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


In [15]:
# n_estimators=200 -- largest forest in the sweep.
proceed_testing_random_forest_classifier(data, 200)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


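Wnioski: Lasy losowe dość dobrze sprawdzają się na testowanym zbiorze danych. Zwiększanie liczby drzew w lesie poprawia skuteczność klasyfikatora, ale tylko do pewnego momentu: w powyższych eksperymentach dokładność rośnie z 0,96 (1 drzewo) do ok. 0,989 (5 drzew), a dalsze zwiększanie liczby drzew (aż do 200) już jej nie zmienia.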

In [16]:
# Start of the test_size sweep (n_estimators stays at its default of 20): hold out 10% of the rows.
proceed_testing_random_forest_classifier(data, test_size=0.1)

Confusion matrix:
[[76  1]
 [ 0 61]]


Classification_report:
             precision    recall  f1-score   support

          0       1.00      0.99      0.99        77
          1       0.98      1.00      0.99        61

avg / total       0.99      0.99      0.99       138



Accuracy_score:
0.9927536231884058


In [17]:
# test_size=0.2 -- identical to the function default, so this repeats the baseline run.
proceed_testing_random_forest_classifier(data, test_size=0.2)

Confusion matrix:
[[155   2]
 [  1 117]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275



Accuracy_score:
0.9890909090909091


In [18]:
# test_size=0.3 -- hold out 30% of the rows.
proceed_testing_random_forest_classifier(data, test_size=0.3)

Confusion matrix:
[[229   3]
 [  2 178]]


Classification_report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       232
          1       0.98      0.99      0.99       180

avg / total       0.99      0.99      0.99       412



Accuracy_score:
0.9878640776699029
