In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import util

In [2]:
from sklearn.linear_model import LogisticRegression as sklogreg

In [3]:
# Read the breast cancer train and test splits from CSV into pandas dataframes.
TRAIN_CSV = "../data/breast_cancer_train_1_70.csv"
TEST_CSV = "../data/breast_cancer_test_1_30.csv"
cancer_train_full = pd.read_csv(TRAIN_CSV)
cancer_test_full = pd.read_csv(TEST_CSV)

In [4]:
# Peek at the first 5 rows of the training data to see what it looks like.
cancer_train_full.head(n=5)

Unnamed: 0,label,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [5]:
# Peek at the first 5 rows of the test data as well.
cancer_test_full.head(n=5)

Unnamed: 0,label,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
2,1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
3,1,13.73,22.61,93.6,578.3,0.1131,0.2293,0.2128,0.08025,0.2069,...,15.03,32.01,108.8,697.7,0.1651,0.7725,0.6943,0.2208,0.3596,0.1431
4,0,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,...,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259


In [6]:
# Check the shape of the training dataframe so you know the dimensions of your data: (number of rows, number of columns).
cancer_train_full.shape

(398, 31)

In [7]:
# Do the same for the test dataframe; it should have the same number of columns as the training dataframe.
cancer_test_full.shape

(171, 31)

In [8]:
# This dataset is clean already, but it's good to check for missing values. isna() flags every NaN cell, and sum() then counts
# the flagged cells in each column, so a result of all zeros means no column has missing data.
cancer_train_full.isna().sum()

label                      0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave_points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave_points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave_points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [9]:
# Check the datatype of each column. pandas assigns one dtype per column, so a column that contains mixed or unparseable values
# would typically show up as "object" rather than a numeric type. In this case, our data is very clean.
cancer_train_full.dtypes

label                        int64
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave_points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave_points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave_points_worst

In [10]:
# For a slightly more comprehensive rundown (column names, non-null counts, and dtypes in one table), you can try the following:
cancer_train_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   label                    398 non-null    int64  
 1   radius_mean              398 non-null    float64
 2   texture_mean             398 non-null    float64
 3   perimeter_mean           398 non-null    float64
 4   area_mean                398 non-null    float64
 5   smoothness_mean          398 non-null    float64
 6   compactness_mean         398 non-null    float64
 7   concavity_mean           398 non-null    float64
 8   concave_points_mean      398 non-null    float64
 9   symmetry_mean            398 non-null    float64
 10  fractal_dimension_mean   398 non-null    float64
 11  radius_se                398 non-null    float64
 12  texture_se               398 non-null    float64
 13  perimeter_se             398 non-null    float64
 14  area_se                  3

In [11]:
# Compute the pairwise Pearson correlation between all columns. Features that are almost perfectly correlated with each other
# (e.g. radius_mean and perimeter_mean) carry redundant information, and features only weakly correlated with the label may
# be less useful.
cancer_train_full.corr(method="pearson")

Unnamed: 0,label,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
label,1.0,0.720207,0.388297,0.732996,0.698995,0.383231,0.611058,0.697711,0.786951,0.334147,...,0.774056,0.420368,0.777035,0.735671,0.413729,0.61028,0.666863,0.794076,0.405576,0.351227
radius_mean,0.720207,1.0,0.279287,0.997931,0.98686,0.160444,0.525007,0.685375,0.824809,0.146947,...,0.973539,0.238019,0.967513,0.951159,0.088154,0.441532,0.543863,0.748766,0.136224,0.022412
texture_mean,0.388297,0.279287,1.0,0.285063,0.281239,-0.042951,0.202032,0.263457,0.25713,0.079763,...,0.303869,0.90898,0.312446,0.300707,0.048392,0.254074,0.269238,0.252577,0.090835,0.108593
perimeter_mean,0.732996,0.997931,0.285063,1.0,0.986307,0.197404,0.573806,0.722369,0.852248,0.180566,...,0.974125,0.244776,0.973223,0.952827,0.119618,0.482317,0.577995,0.773571,0.161983,0.065524
area_mean,0.698995,0.98686,0.281239,0.986307,1.0,0.169443,0.517484,0.695058,0.82397,0.151027,...,0.964292,0.230581,0.960074,0.967287,0.092546,0.415782,0.528272,0.724552,0.115722,0.01544
smoothness_mean,0.383231,0.160444,-0.042951,0.197404,0.169443,1.0,0.671573,0.522144,0.548928,0.567403,...,0.210283,0.025187,0.23567,0.206649,0.821082,0.48958,0.43733,0.499625,0.427543,0.547761
compactness_mean,0.611058,0.525007,0.202032,0.573806,0.517484,0.671573,1.0,0.876257,0.843221,0.594328,...,0.563115,0.211894,0.61471,0.543187,0.568304,0.869401,0.804777,0.811248,0.520645,0.694633
concavity_mean,0.697711,0.685375,0.263457,0.722369,0.695058,0.522144,0.876257,1.0,0.925422,0.482853,...,0.707541,0.259461,0.742948,0.702546,0.430275,0.758235,0.885557,0.855428,0.404839,0.514814
concave_points_mean,0.786951,0.824809,0.25713,0.852248,0.82397,0.548928,0.843221,0.925422,1.0,0.452217,...,0.842954,0.255247,0.864996,0.827937,0.444266,0.698552,0.771156,0.911742,0.376515,0.40423
symmetry_mean,0.334147,0.146947,0.079763,0.180566,0.151027,0.567403,0.594328,0.482853,0.452217,1.0,...,0.193528,0.105016,0.222596,0.189262,0.442707,0.478812,0.421701,0.408368,0.732373,0.45406


In [12]:
# Split off the label column to make the y train vector. Selecting it by name (rather than by position) returns a pandas Series
# and keeps working even if the column order of the CSV ever changes.
y_train = cancer_train_full["label"]
# Everything except the label column becomes the feature matrix (still a dataframe at this point).
x_train = cancer_train_full.drop(columns="label")

# Repeat for the test sets.
y_test = cancer_test_full["label"]
x_test = cancer_test_full.drop(columns="label")

In [13]:
# Sanity-check the first 5 training labels.
y_train.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [14]:
# Sanity-check the first 5 rows of the training feature matrix; the label column should no longer be present.
x_train.head(n=5)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [15]:
# Convert the data to numpy arrays. np.asarray accepts both pandas objects and arrays that have already been converted, so this
# cell can safely be re-run (calling .to_numpy() a second time would fail, because numpy arrays have no such method).
y_train = np.asarray(y_train)
x_train = np.asarray(x_train)
y_test = np.asarray(y_test)
x_test = np.asarray(x_test)

In [16]:
# Confirm that the training labels are now a plain numpy array of 0s and 1s.
y_train

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,

In [17]:
# Fit a sklearn logistic regression model with its default regularization settings to the data. I also had to raise max_iter
# well above its default, because otherwise the solver did not converge on this data.
# fit_intercept=True is already the default; it is spelled out here only for clarity.
sklearn_logreg = sklogreg(fit_intercept=True, max_iter=5000)
sklearn_logreg.fit(x_train, y_train)

LogisticRegression(max_iter=5000)

In [18]:
# Score the fitted model on the held-out test set and print the accuracy. With the data included by default you should get
# roughly 0.95321.
test_accuracy = sklearn_logreg.score(x_test, y_test)
print('Accuracy with sk-learn Logistic Regression: {0}'.format(test_accuracy))

Accuracy with sk-learn Logistic Regression: 0.9532163742690059


In [19]:
# You can try using the Logistic Regression model you made yourself to replicate these results. But given that this data is much more complex than
# the 2d data we initially used, you'll probably have to add techniques such as regularization to ensure that it converges. In the end, we've now
# got a model that, given many different CONTINUOUS variables (like radius, texture, etc.), can accurately predict whether a tumor is cancerous.

In [20]:
# For data with discrete variables, you can still use logistic regression, but algorithms such as Naive Bayes and Decision Trees are often
# better suited to it.