# Breast Cancer Logistic Regression
<p style="color:green;">The Data is from UCI machine learning dataset</p>

> [UCI Repository](https://archive.ics.uci.edu/ml/index.php)


## Importing the libraries

In [154]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [155]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# data (as pandas dataframes)
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

# metadata
print(breast_cancer_wisconsin_original.metadata)

# variable information
print(breast_cancer_wisconsin_original.variables)

{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed fro

In [156]:
(
	type(X),
	type(y),
	type(breast_cancer_wisconsin_original.metadata),
	type(breast_cancer_wisconsin_original.variables),
	type(breast_cancer_wisconsin_original),
)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 ucimlrepo.dotdict.dotdict,
 pandas.core.frame.DataFrame,
 ucimlrepo.dotdict.dotdict)

In [157]:
X.shape, y.shape

((699, 9), (699, 1))

In [158]:
X

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1
695,2,1,1,1,2,1.0,1,1,1
696,5,10,10,3,7,3.0,8,10,2
697,4,8,6,4,3,4.0,10,6,1


In [159]:
# clean up the data removing the missing values
def clean_data(X, y):
	# Another way to clean the data is to concatenate the features and target, then drop rows with missing values
	cleaning_dataset = pd.concat([X, y], axis=1).dropna().reset_index(drop=True)
	X_cleaned = cleaning_dataset.iloc[:, 1:-1]
	y_cleaned = cleaning_dataset.iloc[:, -1]
	return X_cleaned, y_cleaned

In [160]:
X_new, y_new = clean_data(X, y)

In [161]:
from sklearn.impute import SimpleImputer


def clean_using_imputer(X):
	# impute missing values
	imputer = SimpleImputer(strategy='mean')
	X_imputed = imputer.fit_transform(X)
	# return pd.DataFrame(X_imputed)
	# Convert back to DataFrame and preserve column headers
	return pd.DataFrame(X_imputed, columns=X.columns, index=X.index)


X_imputed = clean_using_imputer(X)
X_imputed

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
694,3.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0
695,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
696,5.0,10.0,10.0,3.0,7.0,3.0,8.0,10.0,2.0
697,4.0,8.0,6.0,4.0,3.0,4.0,10.0,6.0,1.0


## Splitting the dataset into the Training set and Test set

In [162]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
	X_new, y_new, test_size=0.2, random_state=0
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
	X_imputed, y, test_size=0.2, random_state=0
)

## Training the Logistic Regression model on the Training set

In [163]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

classifier2 = LogisticRegression(random_state=0)
classifier2.fit(X_train2, y_train2.values.ravel())

## Predicting the Test set results

In [164]:
y_pred = classifier.predict(X_test)
y_pred2 = classifier2.predict(X_test2)
comparing_results = pd.DataFrame(
	{
		'Actual': y_test,
		'Predicted': y_pred,
	}
)
# print(
# 	np.concatenate(
# 		(y_pred.reshape(len(y_pred), 1), y_test.to_numpy().reshape(len(y_test), 1)), 1
# 	)
# )
y_test.shape, y_pred.shape

((137,), (137,))

## Making the Confusion Matrix

In [165]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
cm2 = confusion_matrix(y_test2, y_pred2)
print('Confusion Matrix using cleaned data:')
print(cm)
print('Confusion Matrix using imputed data:')
print(cm2)
print(f'Accuracy Score of cm: {accuracy_score(y_test, y_pred)}')
print(f'Accuracy Score of cm2: {accuracy_score(y_test2, y_pred2)}')

Confusion Matrix using cleaned data:
[[83  4]
 [ 3 47]]
Confusion Matrix using imputed data:
[[82  3]
 [ 1 54]]
Accuracy Score of cm: 0.948905109489051
Accuracy Score of cm2: 0.9714285714285714


## Computing Accuracy of K-fold

In [173]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(classifier, X_train, y_train.to_numpy().ravel(), cv=10)
accuracies2 = cross_val_score(classifier2, X_train2, y_train2.values.ravel(), cv=10)

print(f'Accuracy using cleaned data: {accuracies.mean():.2f}')
print(f'Accuracy using imputed data: {accuracies2.mean():.2f}')
print()
print(f'Standard Deviation using cleaned data: {accuracies.std():.2f}')
print(f'Standard Deviation using imputed data: {accuracies2.std():.2f}')

Accuracy using cleaned data: 0.97
Accuracy using imputed data: 0.97

Standard Deviation using cleaned data: 0.02
Standard Deviation using imputed data: 0.03
