In [2]:
import numpy as np
import pandas as pd
import sklearn

In [20]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA

In [21]:
cancer_data = load_breast_cancer()
scaler = StandardScaler().fit(cancer_data.data)
scaled_data = scaler.transform(cancer_data.data)

In [22]:
# print(cancer_data.DESCR)

In [26]:
pca = PCA(n_components=5)
pca.fit(scaled_data)
reduced_data = pca.transform(scaled_data)
df = pd.DataFrame(reduced_data, columns=['component1', 'component2', 'component3', 'component4', 'component5'])
df['target'] = cancer_data.target
print(df.head())
x_train, x_test = train_test_split(df.drop('target', axis=1), stratify = df['target'], 
                                                    shuffle = True,
                                                    random_state = 144)

   component1  component2  component3  component4  component5  target
0    9.192837    1.948583   -1.123166    3.633731   -1.195110       0
1    2.387802   -3.768172   -0.529293    1.118264    0.621775       0
2    5.733896   -1.075174   -0.551748    0.912083   -0.177086       0
3    7.122953   10.275589   -3.232790    0.152547   -2.960878       0
4    3.935302   -1.948072    1.389767    2.940639    0.546747       0


In [27]:
random_forest = RandomForestClassifier(oob_score=True)
random_forest.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [28]:
print('Test Set Prediction: {}'. format(random_forest.predict(x_test)))

Test Set Prediction: [0 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 0 0
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 0 1 1 1 1 0 0
 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 1 0 0 1 0]


In [29]:
print('Test Set Accuracy: {:.2f}'.format(random_forest.score(x_test, y_test)))

Test Set Accuracy: 0.96


In [30]:
confusion = confusion_matrix(y_test, random_forest.predict(x_test))
report = classification_report(y_test, random_forest.predict(x_test), target_names = ['malignant', 'benign'])
print('Confusion matrix: \n{}'.format(confusion))
print(report)

Confusion matrix: 
[[50  3]
 [ 3 87]]
              precision    recall  f1-score   support

   malignant       0.94      0.94      0.94        53
      benign       0.97      0.97      0.97        90

    accuracy                           0.96       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143

