In [57]:
# imports
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import torch

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Set numeric display precision for pandas and torch output.
# The fully qualified option name is required: pandas resolves option names
# by pattern matching, and the bare "precision" pattern can match more than
# one option (e.g. "styler.format.precision"), which raises OptionError.
pd.set_option("display.precision", 2)
torch.set_printoptions(precision=2, sci_mode=False)
%precision 2

# enable reload of changed files
%load_ext autoreload
%autoreload 2

# plot inline
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Load the breast cancer dataset bundled with scikit-learn; the returned
# object exposes the feature matrix as `.data` and the labels as `.target`.
bc = load_breast_cancer()

In [23]:
# Feature matrix and label vector taken from the loaded dataset
X, y = bc.data, bc.target

In [24]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [55]:
# Grid-search the number of neighbours (1..19) for k-NN with 5-fold CV,
# scored by accuracy, and fit the search on the training split.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors': range(1, 20)},
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
).fit(X_train, y_train)

In [56]:
# Three best-ranked k-NN candidates; the first four columns of the results
# table are skipped so the view starts at the parameter column.
(
    pd.DataFrame(knn.cv_results_)
    .sort_values(by='rank_test_score')
    .iloc[:, 4:]
    .head(3)
)

Unnamed: 0,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,5,{'n_neighbors': 5},0.92,0.92,0.93,0.89,0.92,0.92,0.01,1
5,6,{'n_neighbors': 6},0.94,0.92,0.93,0.88,0.92,0.92,0.02,2
7,8,{'n_neighbors': 8},0.94,0.92,0.96,0.88,0.88,0.92,0.03,3


In [60]:
# Compare split criteria for a 1000-tree random forest with 5-fold CV,
# scored by accuracy. random_state is fixed so the forests (and therefore the
# CV scores and the refit model) are reproducible across runs, consistent
# with the seeded train/test split.
rfc = GridSearchCV(
    RandomForestClassifier(n_estimators=1000, random_state=42),
    {'criterion': ['gini', 'entropy']},
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
).fit(X_train, y_train)

In [62]:
# Random-forest candidates ordered by CV rank; the first four columns of the
# results table are skipped so the view starts at the parameter column.
(
    pd.DataFrame(rfc.cv_results_)
    .sort_values(by='rank_test_score')
    .iloc[:, 4:]
)

Unnamed: 0,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,gini,{'criterion': 'gini'},0.97,0.95,0.96,0.95,0.95,0.96,0.01,1
1,entropy,{'criterion': 'entropy'},0.97,0.95,0.96,0.93,0.95,0.95,0.01,2


In [64]:
# Use the estimator that GridSearchCV already refit on the full training
# split with the winning parameters (refit=True is the default).
# Rebuilding a forest from best_params_ alone would drop n_estimators=1000,
# because it is not part of the searched grid, and would silently train a
# default-sized forest that differs from the model that was cross-validated.
final = rfc.best_estimator_

In [65]:
# Score the final model on the held-out test split
final.score(X=X_test, y=y_test)

0.9574468085106383

In [66]:
import pickle

In [67]:
# Persist the fitted model to model.p. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open("model.p", "wb") as model_file:
    pickle.dump(final, model_file)

In [68]:
# Manual step: open VS Code and load the pickled model there (presumably model.p saved above)

In [71]:
# Preview the first three test rows (all feature columns)
X_test[:3, :]

array([[1.25e+01, 1.86e+01, 8.11e+01, 4.82e+02, 9.97e-02, 1.06e-01,
        8.00e-02, 3.82e-02, 1.93e-01, 6.37e-02, 3.96e-01, 1.04e+00,
        2.50e+00, 3.03e+01, 6.95e-03, 1.91e-02, 2.70e-02, 1.04e-02,
        1.78e-02, 3.59e-03, 1.50e+01, 2.46e+01, 9.60e+01, 6.78e+02,
        1.43e-01, 2.38e-01, 2.67e-01, 1.02e-01, 3.01e-01, 8.75e-02],
       [1.89e+01, 2.13e+01, 1.24e+02, 1.13e+03, 9.01e-02, 1.03e-01,
        1.08e-01, 7.95e-02, 1.58e-01, 5.46e-02, 7.89e-01, 7.97e-01,
        5.49e+00, 9.60e+01, 4.44e-03, 1.65e-02, 2.27e-02, 1.37e-02,
        1.39e-02, 1.70e-03, 2.49e+01, 2.66e+01, 1.66e+02, 1.87e+03,
        1.19e-01, 2.34e-01, 2.69e-01, 1.79e-01, 2.55e-01, 6.59e-02],
       [1.55e+01, 1.95e+01, 1.02e+02, 7.49e+02, 1.09e-01, 1.22e-01,
        1.47e-01, 8.09e-02, 1.93e-01, 5.80e-02, 4.74e-01, 7.86e-01,
        3.09e+00, 4.83e+01, 6.24e-03, 1.48e-02, 2.81e-02, 1.09e-02,
        1.40e-02, 2.46e-03, 1.93e+01, 2.60e+01, 1.25e+02, 1.16e+03,
        1.55e-01, 2.39e-01, 3.79e-01, 1.51e-01

In [72]:
# Write the first three test rows to data.csv as comma-separated values
np.savetxt(fname="data.csv", X=X_test[:3], delimiter=",")

In [73]:
# Labels for the same three test rows written to data.csv
y_test[0:3]

array([1, 0, 0])