In [13]:
import pandas
from sklearn.datasets import load_breast_cancer

# Import the dataset into a dataframe

raw = load_breast_cancer(as_frame=True)

# Show dataset description

print(raw.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [14]:
# Show the list of feature names for the dataset
list(raw.feature_names)


['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension',
 'radius error',
 'texture error',
 'perimeter error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst radius',
 'worst texture',
 'worst perimeter',
 'worst area',
 'worst smoothness',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry',
 'worst fractal dimension']

In [15]:
# Show the list of target names
list(raw.target_names)


['malignant', 'benign']

In [16]:
# Assign the data section of the dataframe to 'X'

X = raw.data
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [17]:
# Assign the target section of the dataframe to 'y'

y = raw.target
y

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64

In [18]:
# Split X, y into X_train, X_test, y_train, y_test with 7:3 ratio

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3 )
print(len(X))
print(len(X_train),X_train.head())
print(X_test.head(),len(X_test))
print(y_train.head())
print(y_test.head())

569
398      mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
520        9.295         13.90           59.96      257.8          0.13710   
401       11.930         10.91           76.14      442.7          0.08872   
477       13.900         16.62           88.97      599.4          0.06828   
412        9.397         21.68           59.75      268.8          0.07969   
396       13.510         18.89           88.10      558.1          0.10590   

     mean compactness  mean concavity  mean concave points  mean symmetry  \
520           0.12250         0.03332             0.024210         0.2197   
401           0.05242         0.02606             0.017960         0.1601   
477           0.05319         0.02224             0.013390         0.1813   
412           0.06053         0.03735             0.005128         0.1274   
396           0.11470         0.08580             0.053810         0.1806   

     mean fractal dimension  ...  worst radius  worst textur

In [19]:
# Build a logistic regression model of solver='liblinear' with X_train, y_train

from sklearn.linear_model import LogisticRegression

regression=LogisticRegression(solver='liblinear')

regression.fit(X_train,y_train)

In [20]:
# Predict y_pred from X_test

y_predict=regression.predict(X_test)

print(regression.score(X_test,y_test))

0.9649122807017544


In [21]:
# Show confustion matrix

from sklearn import metrics
conf = (metrics.confusion_matrix(y_test, y_predict))

conf

array([[ 58,   5],
       [  1, 107]])

In [22]:
# Show classification report

report = metrics.classification_report(y_test, y_predict)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.92      0.95        63
           1       0.96      0.99      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.97      0.96      0.96       171

