In [1]:
import pandas as pd

# Location of the raw data set, relative to the notebook.
fileURL = './breastcancer.csv'
breast_cancer_db = pd.read_csv(fileURL)
# Keep columns 1..11 by position (the label column plus the "*_mean" features
# referenced later in this notebook).  `.ix` is deprecated/removed in pandas;
# `.iloc` is the explicit positional equivalent.  `.copy()` makes this an
# independent frame so the in-place label encoding done next does not act on
# a view of `breast_cancer_db` (avoids SettingWithCopyWarning).
breast_cancer_mean_db = breast_cancer_db.iloc[:, 1:12].copy()

In [2]:
# Encode the class label numerically: 'B' -> 0 and 'M' -> 1.  Any other value
# is left as-is, so the numeric conversion below will complain about it.
breast_cancer_mean_db['diagnosis'] = pd.to_numeric(
    breast_cancer_mean_db['diagnosis'].replace({'B': 0, 'M': 1})
)

In [3]:
def get_X_y(feature_cols, target, data=None):
    """Split a data frame into a feature matrix and a target vector.

    Parameters
    ----------
    feature_cols : list of str
        Column labels to select as model inputs.
    target : str
        Column label of the value to predict.
    data : pandas.DataFrame, optional
        Frame to select from.  When omitted, the notebook-level
        ``breast_cancer_mean_db`` frame is used, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    (X, y)
        ``X`` is ``data[feature_cols]`` and ``y`` is ``data[target]``.
    """
    if data is None:
        # Fall back to the frame prepared earlier in the notebook.
        data = breast_cancer_mean_db
    X = data[feature_cols]
    y = data[target]
    return X, y

In [4]:
# The ten "*_mean" columns used as model inputs.
feature_cols = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]
# Column holding the (numerically encoded) class label.
target = 'diagnosis'

# Feature matrix / label vector shared by every grid search below.
X, y = get_X_y(target=target, feature_cols=feature_cols)

In [5]:
try:
    # Current home of GridSearchCV.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Legacy module, only present in old scikit-learn releases.
    from sklearn.grid_search import GridSearchCV


def tune_clf(clf, tuned_parameters, features=None, labels=None, cv=10,
             scoring='accuracy'):
    """Grid-search an estimator with cross-validation and report the winner.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn estimator to tune.
    tuned_parameters : dict or list of dict
        Parameter grid(s) handed to ``GridSearchCV``.
    features, labels : optional
        Training inputs and targets.  When omitted, the notebook-level ``X``
        and ``y`` are used, so existing two-argument calls behave as before.
    cv : int, optional
        Number of cross-validation folds (default 10).
    scoring : str, optional
        Scoring metric name (default ``'accuracy'``).

    Returns
    -------
    (best_params, best_score)
        ``best_params_`` and ``best_score_`` of the fitted search.
    """
    if features is None:
        features = X
    if labels is None:
        labels = y
    clf_gs = GridSearchCV(clf, tuned_parameters, cv=cv, scoring=scoring)
    clf_gs.fit(features, labels)
    return clf_gs.best_params_, clf_gs.best_score_

In [6]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression estimator created with the library's default settings.
lrg = LogisticRegression()

In [7]:
# Show the estimator's current hyper-parameters (bare expression so the
# notebook displays the returned dict).
lrg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [8]:
# Search space for logistic regression: a single grid, every combination tried.
lrg_tuned_parameters = [
    {
        'fit_intercept': [True, False],
        # NOTE(review): 1.68 looks like a hand-picked class ratio and it
        # up-weights class 0 -- confirm this is the intended direction.
        'class_weight': [None, "balanced", {0: 1.68, 1: 1}],
        'tol': [1e-3, 1e-4, 1e-5],
        'C': [0.1, 1, 10],
    },
]

In [9]:
# Grid-search the logistic-regression estimator; keep the best parameter set
# and the score it achieved.
lrg_best_parames, lrg_best_accuracy = tune_clf(lrg, lrg_tuned_parameters)

In [10]:
# Report the logistic-regression search result.  print(...) with a single
# argument produces the same output on Python 2 and Python 3, whereas the
# bare print statement is a SyntaxError on Python 3.
print('best_params for logistic regression is: ')
print(lrg_best_parames)
print('Accuray after tuned for logistic regression is: ')
print(lrg_best_accuracy)

best_params for logistic regression is: 
{'C': 10, 'fit_intercept': True, 'tol': 0.0001, 'class_weight': 'balanced'}
Accuray after tuned for logistic regression is: 
0.920913884007


In [11]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours estimator created with the library's default settings.
knn = KNeighborsClassifier()

In [12]:
# Show the estimator's current hyper-parameters (bare expression so the
# notebook displays the returned dict).
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [13]:
# Search space for k-nearest neighbours, given as two separate grids.
knn_tuned_parameters = [
    # Grid 1: neighbourhood size crossed with the vote weighting scheme.
    {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
    },
    # Grid 2: index structure crossed with its leaf size.  Because this is a
    # separate dict, these values are not combined with the ones in grid 1.
    {
        'algorithm': ["ball_tree", "kd_tree"],
        'leaf_size': [10, 30, 50],
    },
]

In [14]:
# Grid-search the k-nearest-neighbours estimator; keep the best parameter set
# and the score it achieved.
knn_best_parames, knn_best_accuracy = tune_clf(knn, knn_tuned_parameters)

In [15]:
# Report the k-nearest-neighbours search result.  print(...) with a single
# argument produces the same output on Python 2 and Python 3, whereas the
# bare print statement is a SyntaxError on Python 3.
print('Best_params for K Nearest Neighbors is: ')
print(knn_best_parames)
print('Accuray after tuned for Nearest Neighbors is: ')
print(knn_best_accuracy)

Best_params for K Nearest Neighbors is: 
{'n_neighbors': 10, 'weights': 'uniform'}
Accuray after tuned for Nearest Neighbors is: 
0.891036906854


In [16]:
from sklearn import tree
# Decision-tree estimator created with the library's default settings.
# NOTE(review): random_state is left unset (the parameter dump below shows
# None), so results may differ between runs -- consider fixing a seed.
dt = tree.DecisionTreeClassifier()

In [17]:
# Show the estimator's current hyper-parameters (bare expression so the
# notebook displays the returned dict).
dt.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [18]:
# Search space for the decision tree: a single grid, every combination tried.
dt_tuned_parameters = [
    {
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3],
    },
]

In [19]:
# Grid-search the decision-tree estimator; keep the best parameter set and
# the score it achieved.
dt_best_parames, dt_best_accuracy = tune_clf(dt, dt_tuned_parameters)

In [20]:
# Report the decision-tree search result.  print(...) with a single argument
# produces the same output on Python 2 and Python 3, whereas the bare print
# statement is a SyntaxError on Python 3.
print('Best_params for Decision Tree is: ')
print(dt_best_parames)
print('Accuray after tuned for Decision Tree is: ')
print(dt_best_accuracy)

Best_params for Decision Tree is: 
{'min_samples_split': 4, 'criterion': 'entropy', 'min_samples_leaf': 2}
Accuray after tuned for Decision Tree is: 
0.927943760984


In [21]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest estimator created with the library's default settings.
# NOTE(review): random_state is left unset (the parameter dump below shows
# None), so results may differ between runs -- consider fixing a seed.
rf = RandomForestClassifier()

In [22]:
# Show the estimator's current hyper-parameters (bare expression so the
# notebook displays the returned dict).
rf.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [23]:
# Search space for the random forest: split criterion crossed with tree count.
rf_tuned_parameters = [
    {
        'criterion': ['gini', 'entropy'],
        'n_estimators': [5, 10, 15],
    },
]

In [24]:
# Grid-search the random-forest estimator; keep the best parameter set and
# the score it achieved.
rf_best_parames, rf_best_accuracy = tune_clf(rf, rf_tuned_parameters)

In [25]:
# Report the random-forest search result.  print(...) with a single argument
# produces the same output on Python 2 and Python 3, whereas the bare print
# statement is a SyntaxError on Python 3.
print('Best_params for Random Forrest Tree is: ')
print(rf_best_parames)
print('Accuray after tuned for Random Forrest Tree is: ')
print(rf_best_accuracy)

Best_params for Random Forrest Tree is: 
{'n_estimators': 5, 'criterion': 'gini'}
Accuray after tuned for Random Forrest Tree is: 
0.940246045694
