In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read the CSV file 

In [2]:
# Load the banknote-authentication CSV from its hosted location into a pandas DataFrame.
DATA_URL = 'https://raw.githubusercontent.com/diksha-cl/Data-files/master/data_banknote_authentication.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# List the column labels of the loaded DataFrame (the available features plus the target).
df.columns

Index(['variance', 'skew', 'kurtosis', 'entropy', 'authentic'], dtype='object')

In [4]:
# Flag every column holding at least one missing value, then list those columns.
# An empty Index means the dataset has no missing values.
has_missing = df.isna().any()
df.columns[has_missing]

Index([], dtype='object')

In [5]:
# Count how many rows fall into each class of the 'authentic' target column.
df['authentic'].value_counts()

0    762
1    610
Name: authentic, dtype: int64

In [6]:
# Peek at five reproducibly sampled rows (fixed seed), ordered by their 'variance' value.
sampled_rows = df.sample(n=5, random_state=55)
sampled_rows.sort_values(by='variance')

Unnamed: 0,variance,skew,kurtosis,entropy,authentic
1180,-2.2183,-1.254,2.9986,0.36378,1
769,-0.89409,3.1991,-1.8219,-2.9452,1
1353,0.11592,3.2219,-3.4302,-2.8457,1
239,2.3952,9.5083,-3.1783,-3.0086,0
722,4.8451,8.1116,-2.9512,-1.4724,0


## Create the Dataframe of features (X) and the target (Y) variables

In [7]:
# Target (y): the 'authentic' label column.
y = df['authentic']

# Features (X): every remaining column once the 'authentic' label is dropped.
X = df.drop(columns=['authentic'])

## Split Test Train

**> Train-Test split -** We split our data into two parts: the train set and the test set (a 70-30 train-test split is a common choice, but the exact ratio is up to you). We then build our function f(x) (a.k.a. the model) using the train set and see how well it does on the test set.

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

## Create an Instance of the classifier and train it.

In [9]:
# Let's create an instance of the RandomForestClassifier model and then train it with the training set.
from sklearn.ensemble import RandomForestClassifier

# n_estimators is pinned to 10 (the value shown in the recorded fit output) because
# scikit-learn changed its default from 10 to 100 in version 0.22; leaving it
# implicit would make the reported scores depend on the installed version.
# random_state fixes the forest's randomness, and min_samples_leaf=10 requires
# at least 10 training samples in every leaf.
Classifier = RandomForestClassifier(n_estimators=10, random_state=0, min_samples_leaf=10)
Classifier.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Get the Predictions

In [10]:
# Predict the class of every sample in the held-out test set.
y_test_hat = Classifier.predict(X_test)

# Put the actual and predicted labels side by side. y_test supplies the row
# index; y_test_hat is an array in the same row order, so the two line up
# positionally without building and joining a separate frame.
Results = pd.DataFrame({'Actual': y_test, 'Predictions': y_test_hat})
Results.head(5)

Unnamed: 0,Actual,Predictions
1240,1,1
703,0,0
821,1,1
1081,1,1
37,0,0


### 1. The accuracy scores

Accuracy is simply the number of samples classified correctly divided by the total number of samples.

In [11]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test set: the share of test labels predicted correctly.
test_accuracy = accuracy_score(y_test, y_test_hat)
print(test_accuracy)

0.9830097087378641


In [12]:
# Score the model on the data it was trained on, for comparison with the test accuracy.
y_train_hat = Classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_hat)
print(train_accuracy)

0.9875


### 2. The confusion matrix

In [13]:
# NOTE(review): recall_score and precision_score are imported here but are not
# called in the cells shown; the metrics are computed by hand from the matrix.
from sklearn.metrics import confusion_matrix, recall_score, precision_score
 
# Cross-tabulate the actual test labels (first argument) against the predicted
# labels (second argument): rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_test_hat)
print(cm)

[[229   6]
 [  1 176]]


In [14]:
# Unpack the 2x2 confusion matrix row by row into named counts:
# row 0 holds the actual-0 samples (TN, FP), row 1 the actual-1 samples (FN, TP).
(TN, FP), (FN, TP) = cm

# Recall: share of actual positives that were predicted positive.
recall = TP / (TP + FN)
print("recall:", recall)

# Precision: share of predicted positives that are actually positive.
precision = TP / (FP + TP)
print("precision:", precision)

# Specificity: share of actual negatives that were predicted negative.
specificity = TN / (FP + TN)
print("specificity:", specificity)

recall: 0.9943502824858758
precision: 0.967032967032967
specificity: 0.9744680851063829


### Feature importance

In [15]:
# Pair each feature name with the importance score the fitted forest assigned to it,
# then rank the features from most to least important.
importance_by_feature = pd.DataFrame(
    {'importance': Classifier.feature_importances_},
    index=X_train.columns,
)
feature_importances = importance_by_feature.sort_values(by='importance', ascending=False)

feature_importances

Unnamed: 0,importance
variance,0.623634
skew,0.202934
kurtosis,0.133198
entropy,0.040235
