# Step 1: Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Step 2: Load dataset

In [2]:
# Read the CSV into `data`. If the file is absent, print a hint about where it
# is expected and re-raise so the notebook stops at this cell.
dataset_path = 'Breast cancer dataset.csv'
try:
    data = pd.read_csv(dataset_path)
except FileNotFoundError:
    print("Error: Dataset file not found. Please ensure 'Breast cancer dataset.csv' is in the correct directory.")
    raise

# Step 3: Preprocess dataset
# Identify numeric columns (excluding diagnosis for now)

In [3]:
# Column labels of every numeric-dtype column currently in `data`.
# This is computed before 'diagnosis' is encoded, so later cells that reuse
# `numeric_cols` operate on the columns that were numeric at this point.
numeric_cols = data.select_dtypes(include=np.number).columns
print("Numeric columns:", numeric_cols)

Numeric columns: Index(['id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')


# Check for missing values

In [4]:
# Per-column count of missing entries among the numeric columns.
print("\nChecking for missing values:")
missing_per_column = data[numeric_cols].isna().sum()
print(missing_per_column)


Checking for missing values:
id                         0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


# Impute missing values with median for numeric columns

In [5]:
# Fill every missing numeric entry with the median of its own column.
numeric_view = data[numeric_cols]
column_medians = numeric_view.median()
data[numeric_cols] = numeric_view.fillna(column_medians)

# Check for infinite values

In [6]:
# Report how many +/-inf entries each numeric column holds.
print("\nChecking for infinite values:")
inf_per_column = np.isinf(data[numeric_cols]).sum()
print(inf_per_column)

# Treat infinities as missing, then impute them with the column median
# (the median is computed after the infinities have been blanked out).
data[numeric_cols] = data[numeric_cols].replace([np.inf, -np.inf], np.nan)
numeric_view = data[numeric_cols]
data[numeric_cols] = numeric_view.fillna(numeric_view.median())


Checking for infinite values:
id                         0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


# Verify no NaNs remain

In [7]:
# Per-column NaN count after the two imputation passes.
print("\nChecking for NaNs after imputation:")
remaining_nans = data[numeric_cols].isna().sum()
print(remaining_nans)


Checking for NaNs after imputation:
id                         0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


# Encode diagnosis

In [8]:
if 'diagnosis' not in data.columns:
    print("Error: 'diagnosis' column not found.")
    raise KeyError("'diagnosis' column missing")

# Encode malignant as 1 and benign as 0. The already-encoded values 1 and 0 are
# mapped to themselves so that re-running this cell is a no-op instead of
# turning every label into NaN.
diagnosis_codes = {'M': 1, 'B': 0, 1: 1, 0: 0}
encoded_diagnosis = data['diagnosis'].map(diagnosis_codes)

# Series.map yields NaN for any value that is not a key of the dict. Fail
# loudly here: the later NaN guard only inspects the features, not the target.
unmapped_rows = encoded_diagnosis.isna()
if unmapped_rows.any():
    unexpected_labels = data.loc[unmapped_rows, 'diagnosis'].unique().tolist()
    raise ValueError(f"Unexpected values in 'diagnosis' column: {unexpected_labels}")

data['diagnosis'] = encoded_diagnosis.astype('int64')


# Drop id column if present

In [9]:
# The 'id' column is removed from the frame when it exists; otherwise `data`
# is left untouched.
has_id_column = 'id' in data.columns
if has_id_column:
    data = data.drop(columns=['id'])

# Drop zero-variance features

In [10]:
# Collect the numeric columns still present in `data` whose variance is
# exactly zero. `numeric_cols` may name columns that were dropped earlier, so
# those are skipped.
zero_variance_cols = []
for column_name in numeric_cols:
    if column_name not in data.columns:
        continue
    if data[column_name].var() == 0:
        zero_variance_cols.append(column_name)

print("\nZero variance columns:", zero_variance_cols)
data = data.drop(columns=zero_variance_cols)


Zero variance columns: []


# Separate features and target

In [11]:
# X: every column except 'diagnosis', as a NumPy array.
# y: the 'diagnosis' column, as a NumPy array.
feature_frame = data.drop(columns=['diagnosis'])
X = feature_frame.values
y = data['diagnosis'].values

# Verify no NaNs in X

In [12]:
# Abort if any feature value is NaN after preprocessing.
features_contain_nan = np.isnan(X).any()
if features_contain_nan:
    print("Error: NaN values found in features after preprocessing.")
    raise ValueError("NaN values in X")

# Standardize features

In [13]:
# Split data BEFORE fitting the scaler. Fitting StandardScaler on all rows lets
# the test rows influence the mean/std used to scale the training rows (data
# leakage). The split arguments are unchanged, so the same rows land in each
# partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only.
scaler = StandardScaler()
try:
    X_train = scaler.fit_transform(X_train_raw)
except ValueError as e:
    print("Error in standardization:", e)
    raise

# Apply the training statistics to the held-out rows.
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the same (training-only) statistics, kept under its
# original name for any later cell that refers to `X_scaled`.
X_scaled = scaler.transform(X)

# Step 4 & 6: Define and implement logistic regression from scratch

In [15]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is first limited to the range [-500, 500] so that the
    exponential cannot overflow.

    Args:
        z: scalar or array-like of real values.

    Returns:
        Value(s) of the logistic function, with the same shape as ``z``.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def compute_loss(X, y, w, b):
    """Mean binary cross-entropy of the logistic model ``sigmoid(X·w + b)``.

    Args:
        X: feature matrix, one row per sample.
        y: labels (array-like); converted with ``np.asarray`` so plain Python
            sequences are accepted as well as arrays.
        w: weight vector.
        b: bias term.

    Returns:
        The average of ``-[y*log(p) + (1-y)*log(1-p)]`` over all samples,
        where ``p`` is the predicted probability.
    """
    y = np.asarray(y)
    y_hat = sigmoid(np.dot(X, w) + b)
    # Small offset so log() never receives exactly 0 when predictions saturate.
    epsilon = 1e-15
    loss = -np.mean(y * np.log(y_hat + epsilon) + (1 - y) * np.log(1 - y_hat + epsilon))
    return loss

def gradient_descent(X, y, w, b, learning_rate, num_iterations):
    """Fit logistic-regression parameters with batch gradient descent.

    The caller's ``w`` and ``b`` are not modified: the updates are applied to
    a private float copy of ``w`` and to a rebound ``b``.

    Args:
        X: feature matrix, one row per sample (must support ``.T``).
        y: labels, one per sample.
        w: initial weight vector.
        b: initial bias.
        learning_rate: step size applied to each gradient.
        num_iterations: number of full-batch update steps.

    Returns:
        Tuple ``(w, b, loss_history)`` with the final weights, the final bias
        and the loss recorded after every update step.
    """
    m = len(y)
    # Private float copy: the in-place update below would otherwise overwrite
    # the caller's initial weights, and would fail for an integer-dtype array.
    # Keeping the update in place on the copy still raises if a shape mismatch
    # makes the gradient a different shape than the weights.
    w = np.array(w, dtype=float)
    loss_history = []

    for i in range(num_iterations):
        z = np.dot(X, w) + b
        y_hat = sigmoid(z)
        error = y_hat - y
        dw = np.dot(X.T, error) / m
        db = np.mean(error)
        w -= learning_rate * dw
        # Rebind rather than update in place so an array-valued bias passed by
        # the caller is left untouched.
        b = b - learning_rate * db
        # Loss is evaluated with the parameters produced by this step.
        loss = compute_loss(X, y, w, b)
        loss_history.append(loss)
        if i % 100 == 0:
            print(f"Iteration {i}, Loss: {loss:.4f}")

    return w, b, loss_history

def predict(X, w, b, threshold=0.5):
    """Classify samples with the logistic model ``sigmoid(X·w + b)``.

    Args:
        X: feature matrix, one row per sample.
        w: weight vector.
        b: bias term.
        threshold: probability at or above which a sample is labelled 1.

    Returns:
        Integer array of 0/1 labels, one per sample.
    """
    probabilities = sigmoid(np.dot(X, w) + b)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)


# Train custom model