# Read in data
1. Read the breast cancer cells data in `data.csv`
2. Turn all non-numerical values into numerical values
3. Check if there are any empty values

In [1]:
# Import Pandas
import pandas as pd

In [2]:
# Read and preview data
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Turn non-numerical values to numerical values
data['diagnosis'] = data['diagnosis'].map({'B': 0, 'M': 1}) # B (Benign)    = 0
data.head()                                                 # M (Malignant) = 1

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
data.isnull().sum() # Check for null values

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

# Prepare data
1. Feed features to `X`
2. Feed targets to `y`

In [5]:
# Setup features and target variables as X and y
X = data[[ # Features
        'radius_mean', 'texture_mean', 'perimeter_mean',
        'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'
]]
y = data['diagnosis'] # Target

In [6]:
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: diagnosis, dtype: int64

# Model Pre-Testing
1. Split data into training and testing sets
2. Create a list of models to test

In [8]:
# Import classification models
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.neighbors import KNeighborsClassifier # K Neigherst Neighbors
from sklearn.svm import SVC # Support Vector Machine
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.ensemble import GradientBoostingClassifier # Gradient Boosting
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes

from sklearn.model_selection import train_test_split # Import training and testing module
from sklearn.metrics import recall_score # Import evaluation metric

In [9]:
# Create function that returns model accuracy
def model_accuracy(checking_model, train_X, test_X, train_y, test_y):
        model = checking_model # Uses requested model
        model.fit(train_X, train_y) # Fits requested model with training data
        y_pred = model.predict(test_X) # Uses new testing data for the model to predict from
        return recall_score(test_y, y_pred) # Returns accuracy between prediction vs. actual testing data

In [10]:
# Split data into training and testing sets
# 80% training data - 20% testing data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Create a list of models to test
models_checked = [
    LogisticRegression(max_iter=3000),
    KNeighborsClassifier(1),
    KNeighborsClassifier(3),
    KNeighborsClassifier(5),
    KNeighborsClassifier(7),
    KNeighborsClassifier(9),
    KNeighborsClassifier(11),
    KNeighborsClassifier(13),
    KNeighborsClassifier(15),
    KNeighborsClassifier(19),
    KNeighborsClassifier(23),
    KNeighborsClassifier(27),
    KNeighborsClassifier(31),
    KNeighborsClassifier(37),
    KNeighborsClassifier(41),
    KNeighborsClassifier(47),
    SVC(kernel='rbf'),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=10),
    RandomForestClassifier(n_estimators=30),
    RandomForestClassifier(n_estimators=50),
    RandomForestClassifier(n_estimators=75),
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=150),
    RandomForestClassifier(n_estimators=200),
    RandomForestClassifier(n_estimators=250),
    RandomForestClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=400),
    RandomForestClassifier(n_estimators=500),
    RandomForestClassifier(n_estimators=600),
    RandomForestClassifier(n_estimators=750),
    RandomForestClassifier(n_estimators=900),
    RandomForestClassifier(n_estimators=1000),
    GradientBoostingClassifier(n_estimators=10),
    GradientBoostingClassifier(n_estimators=30),
    GradientBoostingClassifier(n_estimators=50),
    GradientBoostingClassifier(n_estimators=75),
    GradientBoostingClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=150),
    GradientBoostingClassifier(n_estimators=200),
    GradientBoostingClassifier(n_estimators=250),
    GradientBoostingClassifier(n_estimators=300),
    GradientBoostingClassifier(n_estimators=400),
    GradientBoostingClassifier(n_estimators=500),
    GradientBoostingClassifier(n_estimators=600),
    GradientBoostingClassifier(n_estimators=700),
    GradientBoostingClassifier(n_estimators=750),
    GradientBoostingClassifier(n_estimators=850),
    GaussianNB()
]

# Model Testing
1. Test all models
2. Pick most accurate model

In [12]:
best_model = None # Sets a variable for the best model
best_accuracy = 0 # Sets a variable for the best accuracy
results = {} # Sets an empty dictionary that stores results

for model in models_checked: # Loops through all models in list shown previously
    accuracy = model_accuracy(model, train_X, test_X, train_y, test_y) # Calculates accuracy using model accuracy function created above
    results[model] = accuracy # Stores the accuracy in the results dictionary

    if accuracy > best_accuracy: # If the current model's accuracy is higher than the current best accuracy, update the best model and accuracy
        best_accuracy = accuracy
        best_model = model
    print(f"{model} complete!")

# Prints the best model
print(f"\n\nBest Model\n{best_model}") # Model name
print(f"{best_accuracy * 100}%") # Accuracy of model
model = best_model # Sets the model variable to the best model

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=3000) complete!
KNeighborsClassifier(n_neighbors=1) complete!
KNeighborsClassifier(n_neighbors=3) complete!
KNeighborsClassifier() complete!
KNeighborsClassifier(n_neighbors=7) complete!
KNeighborsClassifier(n_neighbors=9) complete!
KNeighborsClassifier(n_neighbors=11) complete!
KNeighborsClassifier(n_neighbors=13) complete!
KNeighborsClassifier(n_neighbors=15) complete!
KNeighborsClassifier(n_neighbors=19) complete!
KNeighborsClassifier(n_neighbors=23) complete!
KNeighborsClassifier(n_neighbors=27) complete!
KNeighborsClassifier(n_neighbors=31) complete!
KNeighborsClassifier(n_neighbors=37) complete!
KNeighborsClassifier(n_neighbors=41) complete!
KNeighborsClassifier(n_neighbors=47) complete!
SVC() complete!
DecisionTreeClassifier(max_depth=5) complete!
RandomForestClassifier(n_estimators=10) complete!
RandomForestClassifier(n_estimators=30) complete!
RandomForestClassifier(n_estimators=50) complete!
RandomForestClassifier(n_estimators=75) complete!
RandomF

In [13]:
model