In [122]:
import pandas as pd

fileURL = './breastcancer.csv'
breast_cancer_db = pd.read_csv(fileURL)
breast_cancer_db = breast_cancer_db.ix[:,1:32]

In [2]:
breast_cancer_db.loc[(breast_cancer_db['diagnosis'] == 'B'),'diagnosis'] = 0
breast_cancer_db.loc[(breast_cancer_db['diagnosis'] == 'M'),'diagnosis'] = 1
breast_cancer_db['diagnosis'] = pd.to_numeric(breast_cancer_db['diagnosis'])

In [3]:
def get_X_y(feature_cols, target):
    X = breast_cancer_db[feature_cols]
    y = breast_cancer_db[target]
    return X, y

In [4]:
feature_cols = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
               'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',
               'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se',
               'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
               'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
               'smoothness_worst', 'smoothness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst',
               'fractal_dimension_worst']

target = 'diagnosis'
X, y = get_X_y(feature_cols, target)

In [5]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()

In [6]:
from sklearn.linear_model import LogisticRegression
lrg_clf = LogisticRegression(C=10, fit_intercept=False, tol=0.0001, class_weight='balanced')

In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=10, weights='distance')

In [8]:
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier(criterion = "entropy", min_samples_split=3, 
                                     min_samples_leaf=3)

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(criterion = "gini", n_estimators=15)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 

pca = PCA(n_components=10)

def build_pipeline(clf):
    pipe = Pipeline(steps=[('pca', pca), ('clf', clf)])
    return pipe

In [11]:
from sklearn.cross_validation import cross_val_score

def model_and_validate(clf):
    pipe = build_pipeline(clf)
    scores = cross_val_score(pipe, X, y, cv=10, scoring='accuracy') 
    return scores.mean()



In [14]:
print (model_and_validate(gnb_clf))

0.906972171809


In [15]:
print (model_and_validate(lrg_clf))

0.949083916688


In [16]:
print (model_and_validate(dt_clf))

0.917435830957


In [17]:
print (model_and_validate(knn_clf))

0.931689136635


In [18]:
print (model_and_validate(rf_clf))

0.945513568404


In [20]:
from sklearn.cluster import KMeans
km_cls = KMeans(n_clusters=1)

In [21]:
print (model_and_validate(km_cls))

0.627662907268


In [22]:
from mlxtend.classifier import StackingClassifier

stk_clf = StackingClassifier(classifiers=[rf_clf, lrg_clf], meta_classifier=lrg_clf)

In [21]:
print model_and_validate(stk_clf)

0.947360859044


In [119]:
from sklearn.model_selection import train_test_split
def model_and_Evaluation(clf):
    pipe = build_pipeline(clf)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,stratify=y)
    pipe.fit(X_train,y_train)
    predict_proba = pipe.predict_proba(X_test)
    #print(predict_proba[:,1][3])
    for i in range(len(y_test)):
        prob = predict_proba[:,1][i]
        print(prob,":", y_test.tolist()[i])


In [120]:
model_and_Evaluation(stk_clf)

0.5 : 0
0.5 : 0
0.990790672643 : 1
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.5 : 0
0.990790672643 : 1
0.990790672643 : 1
0.990790672643 : 1
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.990790672643 : 1
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.990790672643 : 1
0.990790672643 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.990790672643 : 1
0.990790672643 : 1
0.990790672643 : 1
0.990790672643 : 1


In [121]:
model_and_Evaluation(lrg_clf)

0.892562696583 : 1
0.000806190822355 : 0
0.561518772536 : 1
2.48813339284e-05 : 0
0.999773067084 : 1
0.00104253381236 : 0
0.0814333366591 : 0
0.000259580969004 : 0
0.434735859248 : 0
0.596071618598 : 1
0.112230395106 : 0
0.000455849151933 : 0
1.22064376504e-05 : 0
0.0122622505173 : 0
0.999736842929 : 1
0.0274271606844 : 0
0.0152856970535 : 0
0.806492247854 : 1
0.00543017007004 : 0
0.996399410153 : 1
0.000340062654711 : 0
0.999999786379 : 1
0.000228941127939 : 0
0.999990897911 : 1
0.000680961865466 : 0
0.0129509463001 : 0
9.8550213885e-05 : 0
0.0057700696561 : 0
0.000870171861202 : 0
0.999991175001 : 1
0.000156668926373 : 0
0.999576422757 : 1
0.00794572419906 : 0
0.999999819732 : 1
8.20819756203e-05 : 0
0.140290662053 : 0
0.00461252679307 : 0
0.000853976110334 : 0
0.142690599361 : 0
0.999663253545 : 1
0.00047771546759 : 0
0.999817650688 : 1
0.999999998199 : 1
0.00187230024829 : 0
0.0427325204665 : 0
0.471791248858 : 1
6.63452531008e-06 : 0
0.781057941772 : 1
0.924385552694 : 1
0.0164580