In [2]:
import pandas as pd
import matplotlib.pyplot as plt
# Location of the dataset CSV, relative to the notebook's working directory.
fileURL = './breastcancer.csv'
breast_cancer_db = pd.read_csv(fileURL)
# Keep columns 1..11 by position (the diagnosis column followed by the
# ten *_mean feature columns). `.ix` has been removed from pandas, so the
# purely positional `.iloc` is used instead. `.copy()` makes this an
# independent frame so later in-place edits do not write into a slice of
# `breast_cancer_db`.
breast_cancer_mean_db = breast_cancer_db.iloc[:, 1:12].copy()

In [3]:
# Preview the first three rows of the selected columns.
breast_cancer_mean_db.head(3)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999


In [4]:
# Encode the diagnosis labels as integers in place: 'B' -> 0, 'M' -> 1.
for diagnosis_label, diagnosis_code in (('B', 0), ('M', 1)):
    label_mask = breast_cancer_mean_db['diagnosis'] == diagnosis_label
    breast_cancer_mean_db.loc[label_mask, 'diagnosis'] = diagnosis_code

# Convert the encoded column to a numeric dtype.
breast_cancer_mean_db['diagnosis'] = pd.to_numeric(
    breast_cancer_mean_db['diagnosis']
)

# Replace the space in this column name with an underscore.
breast_cancer_mean_db.rename(
    columns={'concave points_mean': 'concave_points_mean'},
    inplace=True,
)

In [5]:
def get_X_y(feature_cols, target):
    """Select feature and target columns from ``breast_cancer_mean_db``.

    Args:
        feature_cols: list of column names used as model inputs.
        target: name of the column used as the prediction target.

    Returns:
        A ``(X, y)`` pair: the frame restricted to ``feature_cols`` and the
        ``target`` column.
    """
    return breast_cancer_mean_db[feature_cols], breast_cancer_mean_db[target]

In [6]:
# Columns fed to every classifier below.
feature_cols = [
    'texture_mean',
    'perimeter_mean',
    'smoothness_mean',
    'compactness_mean',
    'concave_points_mean',
]
# Column holding the integer-encoded label.
target = 'diagnosis'
X, y = get_X_y(feature_cols, target)

In [7]:
# Preview the first three feature rows.
X.head(3)

Unnamed: 0,texture_mean,perimeter_mean,smoothness_mean,compactness_mean,concave_points_mean
0,10.38,122.8,0.1184,0.2776,0.1471
1,17.77,132.9,0.08474,0.07864,0.07017
2,21.25,130.0,0.1096,0.1599,0.1279


In [8]:
# Preview the first three target values.
y.head(3)

0    1
1    1
2    1
Name: diagnosis, dtype: int64

In [9]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes classifier, constructed with its default settings.
gnb_clf = GaussianNB()

In [10]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with class weights set to 'balanced'.
lrg_clf = LogisticRegression(
    C=10,
    fit_intercept=True,
    tol=0.0001,
    class_weight='balanced',
)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier: 10 neighbours, uniform weights.
knn_clf = KNeighborsClassifier(
    n_neighbors=10,
    weights='uniform',
)

In [12]:
from sklearn import tree
# Entropy-based decision tree. `random_state` is fixed because scikit-learn
# permutes candidate features at each split, so an unseeded tree can differ
# between runs and the reported scores would not be reproducible.
dt_clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=4,
                                     min_samples_leaf=2, random_state=42)

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Small random forest (5 trees, Gini criterion). `random_state` is fixed so
# the bootstrap samples and feature subsets, and therefore the reported
# scores, are the same on every Restart & Run All.
rf_clf = RandomForestClassifier(criterion="gini", n_estimators=5, random_state=42)

In [14]:
# `sklearn.cross_validation` has been removed from scikit-learn;
# `cross_val_score` lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

def model_and_validate(clf, cv=10, scoring='accuracy'):
    """Return the mean cross-validated score of ``clf`` on the global ``X`` and ``y``.

    Args:
        clf: estimator to evaluate; it is passed straight to
            ``cross_val_score``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 10, the value this notebook originally hard-coded).
        scoring: scoring name forwarded to ``cross_val_score``
            (default ``'accuracy'``).

    Returns:
        The mean of the per-fold scores.
    """
    scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
    return scores.mean()



In [17]:
# Mean cross-validated accuracy of the Gaussian naive Bayes model.
print(model_and_validate(gnb_clf))

0.921070996457


In [19]:
# Mean cross-validated accuracy of the logistic regression model.
print(model_and_validate(lrg_clf))

0.912234249417


In [21]:
# Mean cross-validated accuracy of the decision tree model.
print(model_and_validate(dt_clf))

0.903586552588


In [23]:
# Mean cross-validated accuracy of the k-nearest-neighbours model.
print(model_and_validate(knn_clf))

0.891178376977


In [24]:
# Mean cross-validated accuracy of the random forest model.
print(model_and_validate(rf_clf))

0.921070996457


In [26]:
from mlxtend.classifier import StackingClassifier

# Stack the random forest and the logistic regression, using the logistic
# regression as the meta-classifier on top of their outputs.
# NOTE(review): `lrg_clf` is passed both as a base classifier and as the
# meta-classifier; this is only safe if mlxtend clones the estimators before
# fitting -- confirm against the installed mlxtend version.
# NOTE(review): the top-20 probabilities this model prints further down are
# all identical, presumably because the meta-classifier is trained on the
# base models' predicted labels rather than their probabilities -- check
# mlxtend's `use_probas` option if a finer ranking is needed.
stk_clf = StackingClassifier(classifiers=[rf_clf, lrg_clf], meta_classifier=lrg_clf)

In [27]:
# Mean cross-validated accuracy of the stacked model.
print(model_and_validate(stk_clf))

0.92258015729


In [69]:
import queue
from sklearn.model_selection import train_test_split
def model_and_Evaluation(clf, top_n=20, random_state=None):
    """Fit ``clf`` on a stratified 90/10 split and print its highest-scored test rows.

    Uses the notebook-level ``X`` and ``y``. After fitting on the training
    part, the held-out rows are ranked by the value in column 1 of
    ``clf.predict_proba`` (highest first) and printed as
    ``<probability> : <true label>``.

    Args:
        clf: estimator exposing ``fit`` and ``predict_proba``.
        top_n: maximum number of rows to print (default 20, the count the
            notebook originally hard-coded). If the held-out set is smaller,
            all of its rows are printed.
        random_state: forwarded to ``train_test_split``. The default ``None``
            keeps the original unseeded split; pass an int for a
            reproducible one.

    Returns:
        None. The ranking is printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, stratify=y, random_state=random_state)
    clf.fit(X_train, y_train)

    # Column 1 is taken as the positive-class probability; this assumes
    # label 1 is the second entry of clf.classes_.
    positive_proba = clf.predict_proba(X_test)[:, 1]

    # Sort on (-probability, label): highest probability first, ties broken
    # by the smaller label. This is the order the previous PriorityQueue
    # produced, but a list slice cannot block when fewer than `top_n` rows
    # exist, whereas `PriorityQueue.get()` on an empty queue waits forever.
    ranked = sorted(zip(-positive_proba, y_test.tolist()))
    for neg_prob, label in ranked[:max(top_n, 0)]:
        print(-neg_prob, ":", label)

In [70]:
# Print the highest-scored held-out rows (probability : true label) for the stacked model.
model_and_Evaluation(stk_clf)

0.995305919395 : 0
0.995305919395 : 0
0.995305919395 : 0
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1
0.995305919395 : 1


In [71]:
# Print the highest-scored held-out rows (probability : true label) for the random forest.
model_and_Evaluation(rf_clf)

1.0 : 0
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
1.0 : 1
0.8 : 0
0.8 : 1
0.8 : 1
0.8 : 1
0.6 : 0
0.6 : 1
0.6 : 1
