In [1]:
from sklearn.datasets import load_breast_cancer

import numpy as np
import pandas as pd

In [2]:
cancer = load_breast_cancer()
data, y, labels = cancer.data, cancer.target, cancer.feature_names
df = pd.DataFrame(data, columns=labels)
target = 'TARGET'
df[target] = y

df.sample(8)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,TARGET
116,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,...,17.07,63.34,270.0,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722,1
546,10.32,16.35,65.31,324.9,0.09434,0.04994,0.01012,0.005495,0.1885,0.06201,...,21.77,71.12,384.9,0.1285,0.08842,0.04384,0.02381,0.2681,0.07399,1
344,11.71,15.45,75.03,420.3,0.115,0.07281,0.04006,0.0325,0.2009,0.06506,...,18.16,84.16,516.4,0.146,0.1115,0.1087,0.07864,0.2765,0.07806,1
274,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,...,34.69,135.1,1320.0,0.1315,0.1806,0.208,0.1136,0.2504,0.07948,0
146,11.8,16.58,78.99,432.0,0.1091,0.17,0.1659,0.07415,0.2678,0.07371,...,26.38,91.93,591.7,0.1385,0.4092,0.4504,0.1865,0.5774,0.103,0
538,7.729,25.49,47.98,178.8,0.08098,0.04878,0.0,0.0,0.187,0.07285,...,30.92,57.17,248.0,0.1256,0.0834,0.0,0.0,0.3058,0.09938,1
14,13.73,22.61,93.6,578.3,0.1131,0.2293,0.2128,0.08025,0.2069,0.07682,...,32.01,108.8,697.7,0.1651,0.7725,0.6943,0.2208,0.3596,0.1431,0
284,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,0.05935,...,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127,1


In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

In [4]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'TARGET'],
      dtype='object')

In [5]:
len(df.columns)

31

In [6]:
used_cols = [c for c in df.columns.tolist() if c not in [target]]

# worst_cols = [c for c in df.columns.tolist() if 'worst' in c]

used_cols = ['mean symmetry'] #+ worst_cols

X, y = df[used_cols], df[target]

nb = GaussianNB()
scores = cross_val_score(nb, X, y, 
                         n_jobs=-1, 
                         scoring='roc_auc', 
                         cv=5)

print(scores.mean(), "+/-", scores.std())

0.6997172310347874 +/- 0.03404217036816094


In [7]:
nb = GaussianNB()
cols = []
means = []
stds = []
start_cols = ['worst perimeter', 'worst texture', 'mean smoothness']

for col in df.columns:
    if col in start_cols:
        continue
    
    col_list = start_cols + [ col ]
    #col_list.append(col)
    print(col_list)
    
    scores = cross_val_score(nb, df[col_list], df[target], 
                         n_jobs=-1, 
                         scoring='roc_auc', 
                         cv=5)
    cols.append(col)
    means.append(scores.mean())
    stds.append(scores.std())
    
    print(col_list, ":", scores.mean(), "+/-", scores.std())

['worst perimeter', 'worst texture', 'mean smoothness', 'mean radius']
['worst perimeter', 'worst texture', 'mean smoothness', 'mean radius'] : 0.9877597627106306 +/- 0.005980769834332763
['worst perimeter', 'worst texture', 'mean smoothness', 'mean texture']
['worst perimeter', 'worst texture', 'mean smoothness', 'mean texture'] : 0.9884283738607355 +/- 0.010147697557762929
['worst perimeter', 'worst texture', 'mean smoothness', 'mean perimeter']
['worst perimeter', 'worst texture', 'mean smoothness', 'mean perimeter'] : 0.987754823514732 +/- 0.005909796112831647
['worst perimeter', 'worst texture', 'mean smoothness', 'mean area']
['worst perimeter', 'worst texture', 'mean smoothness', 'mean area'] : 0.9849897836632197 +/- 0.006737799352376105
['worst perimeter', 'worst texture', 'mean smoothness', 'mean compactness']
['worst perimeter', 'worst texture', 'mean smoothness', 'mean compactness'] : 0.9841129464123242 +/- 0.008823500922518934
['worst perimeter', 'worst texture', 'mean smoo

In [8]:
sorted(means)

[0.9824152927903338,
 0.9832624948658358,
 0.9838817140569514,
 0.9841129464123242,
 0.9849897836632197,
 0.985093896713615,
 0.9867808920707709,
 0.9868183259765312,
 0.9870067952937263,
 0.98726350350163,
 0.9875063039737132,
 0.987754823514732,
 0.9877597627106306,
 0.98817205558935,
 0.9884283738607355,
 0.9887283650221743,
 0.9888028428971763,
 0.9889546581816481,
 0.9891286998476648,
 0.989280125195618,
 0.9897767743411372,
 0.9899113024399628,
 0.9910514768195737,
 0.9911835353204499,
 0.9915460463036618,
 0.991702410847514,
 0.9920998861385366,
 1.0]

In [9]:
nb = GaussianNB()
used_cols = ['worst perimeter', 'worst texture', 'mean smoothness']
scores = cross_val_score(nb, df[used_cols], df[target], 
                         n_jobs=-1, 
                         scoring='roc_auc', 
                         cv=5)
print(used_cols, ":", scores.mean(), "+/-", scores.std())

['worst perimeter', 'worst texture', 'mean smoothness'] : 0.9923607536693027 +/- 0.005942530301031657


In [12]:
df['log worst perimeter'] = np.log(df['worst perimeter'])
df['log worst texture']   = np.log(df['worst texture'])
df['log mean smoothness']   = np.log(df['mean smoothness'])

In [13]:
nb = GaussianNB()
used_cols = ['log worst perimeter', 'log worst texture', 'log mean smoothness']
scores = cross_val_score(nb, df[used_cols], df[target], 
                         n_jobs=-1, 
                         scoring='roc_auc', 
                         cv=5)
print(used_cols, ":", scores.mean(), "+/-", scores.std())

['log worst perimeter', 'log worst texture', 'log mean smoothness'] : 0.9936502737354358 +/- 0.005204586561430623


In [15]:
df['log2 worst perimeter'] = np.log(1+df['log worst perimeter'])
df['log2 worst texture']   = np.log(1+df['log worst texture'])
df['log2 mean smoothness'] = np.log(1+df['log mean smoothness'])

In [16]:
nb = GaussianNB()
used_cols = ['log worst perimeter', 'log worst texture', 'log mean smoothness']
scores = cross_val_score(nb, df[used_cols], df[target], 
                         n_jobs=-1, 
                         scoring='roc_auc', 
                         cv=5)
print(used_cols, ":", scores.mean(), "+/-", scores.std())

['log worst perimeter', 'log worst texture', 'log mean smoothness'] : 0.9936502737354358 +/- 0.005204586561430623


In [None]:
# ALLa
# 0.9872499857023277 +/- 0.008220915233521506

#
#

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
for col in df.columns:
    print(col)
    SIZE=(20,8)
    #plt.title(col)
    plt.figure(figsize=SIZE)
    df[col].hist(bins=20)
    plt.show()

In [None]:
SIZE=(20,8)
plt.figure(figsize=SIZE)
df['worst radius'].hist(bins=20)
plt.show()