In [38]:
#processing
import pandas as pd
import numpy as np

#dataset
from sklearn import datasets

#model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV

#pre-processing
from sklearn.preprocessing import StandardScaler

#visualization 
import seaborn as sns
from numpy import mean
from numpy import std

#model
from xgboost import XGBClassifier

#scoring
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [39]:
data = datasets.load_breast_cancer()

print(data)

df = pd.DataFrame(data=data.data, columns = data.feature_names)
df['target'] = data.target

df["target"]


#df.describe()

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64

In [40]:
#clean
df.isnull().sum()


#nomalize only the continuous variables
df_standardized = pd.DataFrame(StandardScaler().fit_transform(df.drop('target', axis=1)))

df_standardized['target'] = data.target
df_standardized.columns = df.columns

print(df_standardized.describe())


df_standardized['target'].value_counts()


#split into training and testing
x_train, x_test, y_train, y_test = train_test_split(df_standardized.iloc[:, :-1], df_standardized.iloc[:, -1], test_size=.2)



#stratified cv
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

#create basline model to test against
model = XGBClassifier()

scores = cross_val_score(model, x_train, y_train, cv = cv, n_jobs=-1, scoring='accuracy')


#print the basline score to improve upon
print(mean(scores))


        mean radius  mean texture  mean perimeter     mean area  \
count  5.690000e+02  5.690000e+02    5.690000e+02  5.690000e+02   
mean  -3.153111e-15 -6.568462e-15   -6.993039e-16 -8.553985e-16   
std    1.000880e+00  1.000880e+00    1.000880e+00  1.000880e+00   
min   -2.029648e+00 -2.229249e+00   -1.984504e+00 -1.454443e+00   
25%   -6.893853e-01 -7.259631e-01   -6.919555e-01 -6.671955e-01   
50%   -2.150816e-01 -1.046362e-01   -2.359800e-01 -2.951869e-01   
75%    4.693926e-01  5.841756e-01    4.996769e-01  3.635073e-01   
max    3.971288e+00  4.651889e+00    3.976130e+00  5.250529e+00   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count     5.690000e+02      5.690000e+02    5.690000e+02         5.690000e+02   
mean      6.081447e-15     -1.136369e-15   -2.997017e-16         1.023981e-15   
std       1.000880e+00      1.000880e+00    1.000880e+00         1.000880e+00   
min      -3.112085e+00     -1.610136e+00   -1.114873e+00        -1.26182

In [41]:
#create lists of hyperparameters to fit and test
#max_depth
#colsample_bytree
#n_estimators
#reg_lambda

param_grid = {'max_depth' :[5, 10, 15], "colsample_bytree":[.3, .5, .8], "n_estimators":[60, 80, 100, 120], "reg_lambda":[0, .5, 1]}


grid_model = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, verbose=0, cv=cv)
results = grid_model.fit(x_train, y_train)

print(f"best results is {results.best_score_}")
print(f"best paramaters are {results.best_params_}")

best results is 0.9714975845410627
best paramaters are {'colsample_bytree': 0.3, 'max_depth': 15, 'n_estimators': 60, 'reg_lambda': 0}


In [44]:
#the accuracy of our model is improved

test_results = grid_model.predict(x_test)

print(confusion_matrix(y_test, test_results))
print("accuracy score: ", accuracy_score(y_test, test_results))


[[40  3]
 [ 2 69]]
accuracy score:  0.956140350877193
