In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Breast_cancer_data.csv')

# Preview the first 20 rows; the bare trailing expression is the cell output.
data.head(n=20)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0
5,12.45,15.7,82.57,477.1,0.1278,0
6,18.25,19.98,119.6,1040.0,0.09463,0
7,13.71,20.83,90.2,577.9,0.1189,0
8,13.0,21.82,87.5,519.8,0.1273,0
9,12.46,24.04,83.97,475.9,0.1186,0


In [3]:
# Every column except the last is a feature; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [17]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Candidate estimators and the hyper-parameter grid to search for each one.
# Each entry maps a display name to {"model": estimator, "params": grid};
# the grid-search cell reads exactly these two keys.
#
# The random forest and decision tree are seeded (random_state=0, matching
# the seed used for the train/test split) so that re-running the notebook
# yields the same cross-validation scores and the same selected parameters.
model_params = {
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {
            "C": [1, 10, 20],
            "kernel": ["rbf", "linear", "sigmoid"]
        }
    },
    "random_forest": {
        # Bootstrap sampling and feature sub-sampling are random; seed them.
        "model": RandomForestClassifier(random_state=0),
        "params": {
            "n_estimators": [1, 5, 10, 20],
            "criterion": ['gini', 'entropy']
        }
    },
    "logistic_regression": {
        "model": LogisticRegression(solver='liblinear', multi_class='auto'),
        "params": {
            "C": [1, 5, 10, 20]
        }
    },
    "gaussian_nb": {
        "model": GaussianNB(priors=None),
        "params": {
            "var_smoothing": [1e-9, 1e-10, 1e-5, 1e-20, 1e-15]
        }
    },
    "decision_tree": {
        # splitter='random' (and tie-breaking between equally good splits)
        # depends on the estimator's random state; seed it.
        "model": DecisionTreeClassifier(random_state=0),
        "params": {
            "criterion": ['gini', 'entropy'],
            "splitter": ['best', 'random']
        }
    }
}

In [29]:
from sklearn.model_selection import GridSearchCV

# Run a 5-fold cross-validated grid search for every candidate in
# model_params and record the best mean CV score and the parameters that
# achieved it. Only the training split is used here.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(X_train, y_train)

    scores.append(
        dict(
            model=model_name,
            best_score=clf.best_score_,
            best_params=clf.best_params_,
        )
    )


In [30]:
# One row per candidate model, built in a single call from the list of
# result records collected by the grid-search loop.
results_df = pd.DataFrame(
    data=scores,
    columns=['model', 'best_score', 'best_params'],
)
results_df

Unnamed: 0,model,best_score,best_params
0,svm,0.91784,"{'C': 20, 'kernel': 'linear'}"
1,random_forest,0.91784,"{'criterion': 'entropy', 'n_estimators': 10}"
2,logistic_regression,0.91784,{'C': 10}
3,gaussian_nb,0.901408,{'var_smoothing': 1e-20}
4,decision_tree,0.892019,"{'criterion': 'gini', 'splitter': 'random'}"


In [31]:
# Fit the final model on the full training split: a linear-kernel SVC with
# C=20, the SVM setting reported as best in the results table above.
classifier = svm.SVC(kernel='linear', C=20)

# fit() is left as the trailing expression so the cell shows the estimator.
classifier.fit(X=X_train, y=y_train)

SVC(C=20, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [32]:
# Predict labels for the held-out test split with the fitted classifier.
y_pred = classifier.predict(X=X_test)

In [33]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [34]:
# Display the test-set accuracy stored in `score`.
score

0.9090909090909091

In [35]:
import pickle
# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is closed even if pickle.dump raises part-way through.
# NOTE: only unpickle this file from a trusted source; pickle.load can
# execute arbitrary code.
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(classifier, pickle_out)