In [56]:
%pylab inline
import pandas as pd
import matplotlib.pyplot as plt
fileURL = './breastcancer.csv'
breast_cancer_db = pd.read_csv(fileURL)
breast_cancer_db = breast_cancer_db.ix[:,1:32]

Populating the interactive namespace from numpy and matplotlib


In [2]:
breast_cancer_db.loc[(breast_cancer_db['diagnosis'] == 'B'),'diagnosis'] = 0
breast_cancer_db.loc[(breast_cancer_db['diagnosis'] == 'M'),'diagnosis'] = 1
breast_cancer_db['diagnosis'] = pd.to_numeric(breast_cancer_db['diagnosis'])

In [3]:
def get_X_y(feature_cols, target):
    X = breast_cancer_db[feature_cols]
    y = breast_cancer_db[target]
    return X, y

In [4]:
feature_cols = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
               'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',
               'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se',
               'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
               'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
               'smoothness_worst', 'smoothness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst',
               'fractal_dimension_worst']

target = 'diagnosis'
X, y = get_X_y(feature_cols, target)

In [5]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()

In [6]:
from sklearn.linear_model import LogisticRegression
lrg_clf = LogisticRegression(C=10, fit_intercept=False, tol=0.0001, class_weight='balanced')

In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=10, weights='distance')

In [8]:
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier(criterion = "entropy", min_samples_split=3, 
                                     min_samples_leaf=3)

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(criterion = "gini", n_estimators=15)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 

pca = PCA(n_components=10)

def build_pipeline(clf):
    pipe = Pipeline(steps=[('pca', pca), ('clf', clf)])
    return pipe

In [11]:
from sklearn.cross_validation import cross_val_score

def model_and_validate(clf):
    pipe = build_pipeline(clf)
    scores = cross_val_score(pipe, X, y, cv=10, scoring='accuracy') 
    return scores.mean()



In [12]:
print (model_and_validate(gnb_clf))

0.906972171809


In [13]:
print (model_and_validate(lrg_clf))

0.947359778757


In [14]:
print (model_and_validate(dt_clf))

0.917497407311


In [15]:
print (model_and_validate(knn_clf))

0.931689136635


In [16]:
print (model_and_validate(rf_clf))

0.940279578256


In [17]:
from sklearn.cluster import KMeans
km_cls = KMeans(n_clusters=1)

In [18]:
print (model_and_validate(km_cls))

0.627662907268


In [19]:
from mlxtend.classifier import StackingClassifier

stk_clf = StackingClassifier(classifiers=[rf_clf, lrg_clf], meta_classifier=lrg_clf)

In [22]:
print (model_and_validate(stk_clf))

0.943757021865


In [121]:
from sklearn.model_selection import train_test_split
import queue
from matplotlib.pyplot import hist
import collections

def model_and_Evaluation(clf):
    pipe = build_pipeline(clf)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,stratify=y)
    pipe.fit(X_train,y_train)
    predict_proba = pipe.predict_proba(X_test)
    d = {"prob":predict_proba[:,1], "result":y_test}
    df = pd.DataFrame(data=d)
    
    q = queue.PriorityQueue()
    for i in range(len(y_test)):
        prob = predict_proba[:,1][i]
        q.put((-prob,y_test.tolist()[i]))
        
    for i in range(30):
        tuple = q.get()
        print(-tuple[0],":", tuple[1])
        
    dic = {}
    for j in range(len(y_test)):
        prob = predict_proba[:,1][j]
        y_true = y_test.tolist()[j]
        if (prob,y_true) in dic:
            dic[(prob,y_true)] = dic.get((prob,y_true))+1
        else:
            dic[(prob,y_true)] = 1
    od = collections.OrderedDict(sorted(dic.items()))
    print (od)

In [127]:
model_and_Evaluation(stk_clf)

0.999474853941 : 1
0.999474853941 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.996912884707 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
OrderedDict([((0.14506016054644927, 0), 3), ((0.5, 0), 33), ((0.5, 1), 1), ((0.99691288470673645, 1), 18), ((0.99947485394100299, 1), 2)])


In [123]:
model_and_Evaluation(lrg_clf)

0.999999999996 : 1
0.999999999899 : 1
0.999999999685 : 1
0.999999998101 : 1
0.999999997654 : 1
0.999999996335 : 1
0.999999587994 : 1
0.999999277943 : 1
0.999987434589 : 1
0.99997932443 : 1
0.99997682086 : 1
0.999853086669 : 1
0.999698372464 : 1
0.999014160423 : 1
0.998234605766 : 1
0.997495309365 : 1
0.991296348797 : 1
0.982832016001 : 1
0.979447244884 : 1
0.947691922118 : 1
0.75171594027 : 1
0.396538533106 : 0
0.390260479431 : 0
0.245382948274 : 0
0.229469075807 : 0
0.206271316032 : 0
0.0776595318816 : 0
0.0375018618118 : 0
0.0239896039107 : 0
0.0207898899353 : 0
OrderedDict([((5.6576570588124265e-10, 0), 1), ((1.7780001743410068e-08, 0), 1), ((3.608453839135318e-07, 0), 1), ((2.0193678796099532e-06, 0), 1), ((4.1202207789396117e-06, 0), 1), ((5.1973783032385196e-06, 0), 1), ((1.7157756961646842e-05, 0), 1), ((2.1548275501319733e-05, 0), 1), ((2.3824550996425482e-05, 0), 1), ((0.00013293731144920306, 0), 1), ((0.00014585591928839784, 0), 1), ((0.00016162572766687295, 0), 1), ((0.00018