In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from ucimlrepo import fetch_ucirepo

# Pull the Breast Cancer Wisconsin (Diagnostic) dataset, UCI repository id 17.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature table and target table (both provided as pandas dataframes).
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

# Show the dataset metadata first, then the per-variable description.
print(
    breast_cancer_wisconsin_diagnostic.metadata,
    breast_cancer_wisconsin_diagnostic.variables,
    sep="\n",
)

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

In [33]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2124,
)

In [34]:
# Univariate feature selection: keep the 10 columns with the highest ANOVA F-score.

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

# Score functions imported above:
#   chi2                -> chi-square test
#   f_classif           -> ANOVA F-test (the one used here)
#   mutual_info_classif -> mutual information

selector = SelectKBest(score_func=f_classif, k=10)

# Fit on the training split only. The labels are flattened to a 1-D array
# because passing the single-column DataFrame makes scikit-learn emit a
# DataConversionWarning (column_or_1d) on every fit.
X_new = selector.fit_transform(X_train, y_train.to_numpy().ravel())

# Map the selected positions back to column labels using the frame the
# selector was actually fitted on, so the names stay correct even if this
# cell is re-run after X_train has been narrowed.
selected_indices = selector.get_support(indices=True)
selected_features = X_train.columns[selected_indices]

print("Selected Features:")
print(selected_features)

Selected Features:
Index(['radius1', 'perimeter1', 'area1', 'concavity1', 'concave_points1',
       'radius3', 'perimeter3', 'area3', 'concavity3', 'concave_points3'],
      dtype='object')


  y = column_or_1d(y, warn=True)


In [35]:
# Echo the Index of column labels chosen by the feature selector.
selected_features

Index(['radius1', 'perimeter1', 'area1', 'concavity1', 'concave_points1',
       'radius3', 'perimeter3', 'area3', 'concavity3', 'concave_points3'],
      dtype='object')

In [38]:
# Inspect the training labels: a single 'Diagnosis' column holding 'B' / 'M'.
y_train

Unnamed: 0,Diagnosis
327,B
403,B
292,B
523,B
387,B
...,...
490,B
103,B
27,M
56,M


In [43]:
# Binary-encode the training labels: benign ('B') -> 0, everything else (malignant) -> 1.
y_train_encoded = np.where(y_train['Diagnosis'] != 'B', 1, 0)
y_train_encoded

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,

In [64]:
# Binary-encode the test labels: benign ('B') -> 0, everything else (malignant) -> 1.
y_test_encoded = np.where(y_test['Diagnosis'] != 'B', 1, 0)

In [47]:
# Keep only the columns chosen by the feature selector (all rows retained).
X_train = X_train.loc[:, selected_features]

In [71]:
# Scale the data with z-score normalization.
from sklearn import preprocessing

# Fit the scaler on the training features only, so the test split does not
# contribute to the mean/variance used for scaling.
scaler = preprocessing.StandardScaler().fit(X_train)

print(f'The means of the data set are as followed: \n {scaler.mean_}')

X_train_scaled = scaler.transform(X_train)

# X_test is only narrowed to the selected columns in a later cell, so select
# them explicitly here. Otherwise a top-to-bottom run feeds all original
# columns to a scaler fitted on the selected ones and transform() raises.
# Selecting again is harmless if X_test has already been narrowed.
X_test_scaled = scaler.transform(X_test[selected_features])

The means of the data set are as followed: 
 [1.43261549e+01 9.33353286e+01 6.74126995e+02 9.23702012e-02
 5.07912136e-02 1.65466244e+01 1.09198192e+02 9.11515023e+02
 2.82070141e-01 1.17680991e-01]


In [58]:
from sklearn.linear_model import LogisticRegression

# Train a default logistic regression on the standardized training features.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression().fit(X_train_scaled, y_train_encoded)
lr_model

In [84]:
# Pipeline variant: standardization and logistic regression bundled into one
# estimator that is trained on the unscaled (already column-filtered) features.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# fit() returns the fitted pipeline, so it can be assigned directly.
pipe = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_train, y_train_encoded)
pipe

In [67]:
# Restrict the test features to the same selected columns used for training.
X_test = X_test.loc[:, selected_features]

In [75]:
# Mean accuracy of the fitted pipeline on the held-out test rows. The pipeline
# reuses the scaler it fitted on the training data, so the test set never
# influences the scaling statistics.
pipe.score(X_test, y_test_encoded)

0.9790209790209791

In [82]:
from sklearn.linear_model import LogisticRegression

# Train a default logistic regression on the manually standardized training
# features (fit() returns the estimator, so it is chained onto construction).
model = LogisticRegression().fit(X_train_scaled, y_train_encoded)

# Predicted class labels (0 = benign, 1 = malignant) for the scaled test rows.
y_pred = model.predict(X_test_scaled)

In [83]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute every metric up front from the encoded test labels and predictions.
accuracy = accuracy_score(y_test_encoded, y_pred)
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
class_report = classification_report(y_test_encoded, y_pred)

# Report them: the heading and its value go on separate lines for the matrix
# and the per-class report.
print("Accuracy:", accuracy)
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")

Accuracy: 0.9790209790209791
Confusion Matrix:
[[94  2]
 [ 1 46]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        96
           1       0.96      0.98      0.97        47

    accuracy                           0.98       143
   macro avg       0.97      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143

