In [1]:
%pylab inline
import pandas as pd
import matplotlib.pyplot as plt
fileURL = './breastcancer.csv'
breast_cancer_db = pd.read_csv(fileURL)
breast_cancer_db = breast_cancer_db.ix[:,1:32]

Populating the interactive namespace from numpy and matplotlib


In [2]:
breast_cancer_db.loc[(breast_cancer_db['diagnosis'] == 'B'),'diagnosis'] = 0
breast_cancer_db.loc[(breast_cancer_db['diagnosis'] == 'M'),'diagnosis'] = 1
breast_cancer_db['diagnosis'] = pd.to_numeric(breast_cancer_db['diagnosis'])

In [3]:
def get_X_y(feature_cols, target):
    X = breast_cancer_db[feature_cols]
    y = breast_cancer_db[target]
    return X, y

In [4]:
feature_cols = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
               'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',
               'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se',
               'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
               'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
               'smoothness_worst', 'smoothness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst',
               'fractal_dimension_worst']

target = 'diagnosis'
X, y = get_X_y(feature_cols, target)

In [5]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()

In [6]:
from sklearn.linear_model import LogisticRegression
lrg_clf = LogisticRegression(C=10, fit_intercept=False, tol=0.0001, class_weight='balanced')

In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=10, weights='distance')

In [8]:
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier(criterion = "entropy", min_samples_split=3, 
                                     min_samples_leaf=3)

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(criterion = "gini", n_estimators=15)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 

pca = PCA(n_components=10)

def build_pipeline(clf):
    pipe = Pipeline(steps=[('pca', pca), ('clf', clf)])
    return pipe

In [11]:
from sklearn.cross_validation import cross_val_score

def model_and_validate(clf):
    pipe = build_pipeline(clf)
    scores = cross_val_score(pipe, X, y, cv=10, scoring='accuracy') 
    return scores.mean()



In [12]:
print (model_and_validate(gnb_clf))

0.906972171809


In [13]:
print (model_and_validate(lrg_clf))

0.949083916688


In [14]:
print (model_and_validate(dt_clf))

0.913956226774


In [15]:
print (model_and_validate(knn_clf))

0.931689136635


In [16]:
print (model_and_validate(rf_clf))

0.938586768646


In [17]:
from sklearn.cluster import KMeans
km_cls = KMeans(n_clusters=1)

In [18]:
print (model_and_validate(km_cls))

0.627662907268


In [19]:
from mlxtend.classifier import StackingClassifier

stk_clf = StackingClassifier(classifiers=[rf_clf, lrg_clf], meta_classifier=lrg_clf)

In [20]:
print (model_and_validate(stk_clf))

0.940372482931


In [33]:
from sklearn.model_selection import train_test_split
import queue
from matplotlib.pyplot import hist
import collections

def model_and_Evaluation(clf,target_class):
    pipe = build_pipeline(clf)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,stratify=y)
    pipe.fit(X_train,y_train)
    predict_proba = pipe.predict_proba(X_test)
    d = {"prob":predict_proba[:,1], "result":y_test}
    df = pd.DataFrame(data=d)
    
    q = queue.PriorityQueue()
    for i in range(len(y_test)):
        prob = predict_proba[:,target_class][i]
        q.put((-prob,y_test.tolist()[i]))
        
    for i in range(53):
        tuple = q.get()
        print(-tuple[0],":", tuple[1])
        
    dic = {}
    for j in range(len(y_test)):
        prob = predict_proba[:,1][j]
        y_true = y_test.tolist()[j]
        if (prob,y_true) in dic:
            dic[(prob,y_true)] = dic.get((prob,y_true))+1
        else:
            dic[(prob,y_true)] = 1
    od = collections.OrderedDict(sorted(dic.items()))
    print (od)

In [35]:
model_and_Evaluation(stk_clf,1)

0.997205083938 : 0
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.997205083938 : 1
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
OrderedDict([((0.28082646479949591, 1), 2), ((0.5, 0), 35), ((0.99720508393767737, 0), 1), ((0.99720508393767737, 1), 19)])


In [36]:
model_and_Evaluation(lrg_clf,1)

1.0 : 1
1.0 : 1
1.0 : 1
0.999999999821 : 1
0.999999999566 : 1
0.999999999262 : 1
0.999999907231 : 1
0.999999713454 : 1
0.999997441076 : 1
0.999984421404 : 1
0.999983025842 : 1
0.999974369419 : 1
0.999887332121 : 1
0.999768837157 : 1
0.999259973775 : 1
0.999010947439 : 1
0.994703566372 : 1
0.982954168862 : 1
0.862184850592 : 0
0.65480711218 : 1
0.489478590283 : 1
0.445404323528 : 0
0.232458639748 : 0
0.177365838289 : 1
0.124392080678 : 0
0.0283957836058 : 0
0.0219738957676 : 0
0.0212330644841 : 0
0.020586228169 : 0
0.0195253848595 : 0
0.0177533286816 : 0
0.0175437730777 : 0
0.0115329413593 : 0
0.00835890445609 : 0
0.00426371809633 : 0
0.00387167585472 : 0
0.00319913293522 : 0
0.00318366225567 : 0
0.00288218895272 : 0
0.00190119421099 : 0
0.00173703891666 : 0
0.00153424550474 : 0
0.00146265927832 : 0
0.0011317627118 : 0
0.000823179710492 : 0
0.000692260642851 : 0
0.000581712266003 : 0
0.000270395067192 : 0
0.000210229255237 : 0
0.000150076294675 : 0
7.16141538047e-05 : 0
6.68278436493e-0

In [39]:
model_and_Evaluation(stk_clf,0)

0.887792449759 : 0
0.887792449759 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.5 : 0
0.00317346057267 : 0
0.00317346057267 : 0
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
0.00317346057267 : 1
OrderedDict([((0.11220755024072962, 0), 2), ((0.5, 0), 32), ((0.99682653942732735, 0), 2), ((0.99682653942732735, 1), 21)])


In [40]:
model_and_Evaluation(lrg_clf,0)

0.999999970362 : 0
0.999999970151 : 0
0.999999042255 : 0
0.99999862174 : 0
0.999998453622 : 0
0.999996295633 : 0
0.99999491957 : 0
0.999992893035 : 0
0.999992484946 : 0
0.999979718113 : 0
0.999971540347 : 0
0.999969607179 : 0
0.999958348118 : 0
0.99994791176 : 0
0.999925411346 : 0
0.999897756564 : 0
0.999731839796 : 0
0.999568433655 : 0
0.999304182964 : 0
0.998990605705 : 0
0.998855227176 : 0
0.998243201945 : 0
0.997547759383 : 0
0.996656862449 : 0
0.985233981927 : 0
0.978541870647 : 0
0.97726475415 : 0
0.976503427886 : 0
0.9551238577 : 0
0.954367362282 : 0
0.949659460557 : 0
0.946507924661 : 0
0.942053138012 : 0
0.891633364427 : 0
0.874410839325 : 0
0.826965200621 : 1
0.580737792739 : 0
0.374155094608 : 1
0.25964287931 : 1
0.159271925924 : 1
0.0198352904451 : 1
0.0195236032904 : 1
0.00104540614441 : 1
0.00072245641937 : 1
0.000248092924553 : 1
0.000101458562198 : 1
4.79967250245e-05 : 1
2.49550867178e-05 : 1
1.72380540584e-05 : 1
9.70596071115e-06 : 1
9.53732684272e-06 : 1
8.042680706