# Cross validation

Useful links
* http://scikit-learn.org/stable/modules/cross_validation.html
* http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [11]:
import pandas as pd
import sklearn

# scikit-learn 0.18 moved the cross-validation helpers into `model_selection`
# and 0.20 removed `sklearn.cross_validation`. Keep the historical name bound
# so the cells below run on both old and current releases.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

In [12]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset bundled with scikit-learn; the cells below
# read its `data` (feature matrix) and `target` (class labels) attributes.
dataset = load_breast_cancer()

In [13]:
# Human-readable copy of the class labels. Categories are renamed positionally,
# so the smallest label code becomes 'malignant' and the next one 'benign'
# (assumes `dataset.target` holds exactly two codes, 0 and 1 -- TODO confirm).
# The model cells below keep using the numeric `dataset.target`.
#
# `rename_categories` returns a new Series; the `inplace=True` form was
# deprecated in pandas 1.3 and removed in 2.0, and assigning the result keeps
# this cell idempotent on re-run.
target = (
    pd.Series(dataset.target, dtype='category')
    .cat.rename_categories(['malignant', 'benign'])
)

In [14]:
# Short labels for the first ten columns of `dataset.data`, in column order.
column_names = (
    'radius texture perimeter area '
    'smoothness compactness concavity concave_points '
    'symmetry fractal_dimension'
).split()

# Keep only those ten leading columns and attach the labels.
df = pd.DataFrame(dataset.data[:, :10], columns=column_names)

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def get_metrics(clf, data, target, name, cv=5):
    """Cross-validate ``clf`` and report its mean accuracy, precision and recall.

    One ``cross_validation.cross_val_score`` call is made per metric (three
    separate cross-validation runs in total) and the per-fold scores of each
    run are averaged.

    Parameters
    ----------
    clf
        Estimator forwarded unchanged to ``cross_val_score``.
    data
        Feature matrix forwarded unchanged to ``cross_val_score``.
    target
        Labels forwarded unchanged to ``cross_val_score``.
    name : str
        Label stored under the ``'classifier'`` key of the result.
    cv
        Cross-validation setting forwarded as ``cv=``; defaults to 5, the
        value that used to be hard-coded.

    Returns
    -------
    dict
        Keys ``'classifier'``, ``'accuracy'``, ``'precision'`` and
        ``'recall'`` (in that order); the three metric values are the means
        of the per-fold scores.
    """
    summary = {'classifier': name}
    # The metric names double as scikit-learn `scoring` strings and result keys.
    for metric in ('accuracy', 'precision', 'recall'):
        fold_scores = cross_validation.cross_val_score(
            clf, data, target, cv=cv, scoring=metric)
        summary[metric] = fold_scores.mean()
    return summary

In [16]:
from sklearn import linear_model
# C is the inverse of the regularization strength: smaller values regularize
# more strongly, so C=1e5 leaves the model almost unregularized.
logreg = linear_model.LogisticRegression(C=1e5)
result1 = get_metrics(
    clf=logreg,
    data=df.values,
    target=dataset.target,
    name='logistic regression',
)

In [17]:
from sklearn.svm import SVC
# Support vector classifier with a radial-basis-function kernel.
# NOTE(review): the features are passed unscaled; RBF kernels are usually
# sensitive to feature scale, so standardizing first may change this result
# noticeably -- confirm whether the unscaled run is intentional.
clf = SVC(kernel='rbf')
result2 = get_metrics(
    clf=clf,
    data=df.values,
    target=dataset.target,
    name='support vector (radial basis)',
)

In [24]:
from sklearn import tree
# Decision tree limited to depth 10. `random_state` is fixed because tree
# construction is randomized, and without a seed the reported metrics change
# on every Restart & Run All.
clf = tree.DecisionTreeClassifier(max_depth=10, random_state=0)
result3 = get_metrics(clf, df.values, dataset.target, 'decision tree')

In [25]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees. `random_state` is fixed so the bootstrap samples
# and feature subsets, and therefore the reported metrics, are reproducible
# across kernel restarts.
clf = RandomForestClassifier(n_estimators=50, random_state=0)
result4 = get_metrics(clf, df.values, dataset.target, 'random forest')

In [26]:
# One row per classifier; `columns` fixes the left-to-right display order.
pd.DataFrame(
    data=[result1, result2, result3, result4],
    columns=['classifier', 'accuracy', 'precision', 'recall'],
)

Unnamed: 0,classifier,accuracy,precision,recall
0,logistic regression,0.92631,0.939899,0.943975
1,support vector (radial basis),0.717045,0.704226,0.946674
2,decision tree,0.917491,0.936985,0.941158
3,random forest,0.947457,0.958679,0.966393
