In [1]:
import numpy as np

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
data = load_breast_cancer()

In [4]:
X = data.data

In [5]:
X.shape

(569, 30)

In [9]:
type(data)

sklearn.utils.Bunch

In [10]:
y = data.target

In [13]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [14]:
from sklearn import linear_model

In [15]:
clf = linear_model.LogisticRegression()

In [16]:
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
n_samples = X.shape[0]
n_train = n_samples //2
n_test = n_samples - n_train

In [18]:
train_index = range(0, n_train)
test_index = range(n_train, n_samples)

In [19]:
train_index

range(0, 284)

In [20]:
test_index

range(284, 569)

In [21]:
X_train = X[train_index]
X_test = X[test_index]

y_train = y[train_index]
y_test = y[test_index]

In [22]:
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [23]:
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
clf.score(X_train, y_train)

0.9647887323943662

In [25]:
clf.score(X_test, y_test)

0.9473684210526315

In [26]:
clf.predict(X_test)

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

In [28]:
wrong = 0
for i,j in zip(clf.predict(X_test), y_test):
    if i == j:
        print(i,j)
    else:
        print(i,j, "wrong")
        wrong +=1

1 1
1 1
1 1
1 1
1 1
1 1
0 1 wrong
1 1
1 1
1 1
1 1
1 1
1 1
1 0 wrong
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
0 1 wrong
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 0 wrong
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 1 wrong
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
0 1 wrong
0 0
1 1
0 0
1 1
1 1
1 1
0 1 wrong
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 1 wrong
0 1 wrong
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1 wrong
1 1
1 1
0 0
1 1
0 1 wrong
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 1 wrong
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1


In [29]:
print("{0} / {1} = {2}".format(wrong, n_test, 1-wrong/n_test))

15 / 285 = 0.9473684210526316


## iris

In [30]:
from sklearn.datasets import load_iris

In [31]:
data = load_iris()

In [32]:
X = data.data

In [33]:
dir(data)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [34]:
X.shape

(150, 4)

In [35]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [36]:
data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [37]:
y = data.target

In [38]:
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [39]:
print(data.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [40]:
n_samples = X.shape[0]
n_train = n_samples // 2
n_test = n_samples - n_train

In [41]:
n_samples

150

In [42]:
n_train

75

In [43]:
train_index = range(0, n_train)
test_index = range(n_train, n_samples)

In [44]:
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [45]:
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
clf.score(X_train, y_train)

1.0

In [47]:
clf.score(X_test, y_test)

0.3333333333333333

In [48]:
print(clf.predict(X_test),y_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2]


In [49]:
wrong = 0
for i,j in zip(clf.predict(X_test), y_test):
    if i == j:
        print(i,j)
    else:
        print(i,j, "wrong")
        wrong +=1

1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong
1 2 wrong


In [50]:
print("{0} / {1} = {2}".format(wrong, n_test, 1-wrong/n_test))

50 / 75 = 0.33333333333333337


In [51]:
y_train, y_test

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [52]:
from sklearn.model_selection import ShuffleSplit

In [53]:
ss = ShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5, random_state=0)

In [54]:
ss

ShuffleSplit(n_splits=1, random_state=0, test_size=0.5, train_size=0.5)

In [55]:
train_index, test_index = next(ss.split(X))

In [56]:
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [57]:
y_train, y_test

(array([0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0,
        0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2, 1, 2, 1, 0,
        2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 1, 0, 1, 2, 2, 0, 1, 1,
        1, 1, 0, 0, 0, 2, 1, 2, 0]),
 array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
        0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
        0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,
        1, 2, 0, 0, 2, 1, 0, 0, 1]))

In [58]:
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
clf.score(X_train, y_train)

0.92

In [60]:
clf.score(X_test, y_test)

0.84

In [61]:
wrong = 0
for i,j in zip (clf.predict(X_test), y_test):
    if i == j:
        print (i,j)
    else:
        print(i,j, "wrong")
        wrong +=1

2 2
1 1
0 0
2 2
0 0
2 2
0 0
1 1
1 1
1 1
2 2
1 1
1 1
1 1
2 1 wrong
0 0
2 1 wrong
2 1 wrong
0 0
0 0
2 2
2 1 wrong
0 0
0 0
2 2
0 0
0 0
1 1
1 1
0 0
2 2
2 1 wrong
0 0
2 2
2 2
2 1 wrong
0 0
2 1 wrong
2 1 wrong
1 1
2 2
0 0
2 2
0 0
0 0
1 1
2 2
2 2
2 2
2 2
1 1
2 2
2 1 wrong
1 1
2 2
2 2
2 2
2 2
1 1
2 2
2 1 wrong
0 0
2 2
2 1 wrong
1 1
1 1
2 1 wrong
2 2
0 0
0 0
2 2
1 1
0 0
0 0
1 1
