## Imported Libraries

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Load data labels

In [19]:
labels = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"]

In [20]:
data = pd.read_csv('breast-cancer-wisconsin.data', names = labels)
data

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


## Remove missing values

In [21]:
def get_index_remove(df):
    
    index_remove = []

    for i in range(len(df.values)):
        for j in df.values[i]:
            if j == '?':
                index_remove.append(i)
                
    return index_remove

In [22]:
my_missing_values_index = get_index_remove(data)

In [23]:
data  = data.drop(my_missing_values_index,axis = 0, )

## Create X & y vectors

In [24]:
X = data.iloc[:,1:-1].values
y = data.iloc[:,-1].values

## Transform to binary format

In [26]:
#2 for benign, 4 for malignant

def binary_tag(vector):
        for i in range(len(vector)):
            if vector[i] == 2:
                vector[i] = 0
            else:
                vector[i] = 1
                
        return vector

In [27]:
y = binary_tag(y)

## Split train & test

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

## Scale independent Variables

In [40]:
sc = StandardScaler()

In [41]:
X_train_scaled = sc.fit_transform(X_train)

In [42]:
X_test_scaled = sc.transform(X_test)

## Train Logistic Regression Model

In [45]:
classifier = LogisticRegression(random_state = 23)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=23, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Predict

In [46]:
y_pred = classifier.predict(X_test)

## Evaluate Model

In [50]:
acc = accuracy_score(y_test, y_pred)
print('The accuracy is: ', acc * 100)

The accuracy is:  97.6608187134503
