In [1]:
import pandas as pd
df = pd.read_csv('../data/breast-cancer-data.csv')

In [2]:
df.head()

Unnamed: 0,age,menopause,tumer-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiate,class
0,40-49',premeno',15-19',0-2',yes',3',right',left_up',no',recurrence-events'
1,50-59',ge40',15-19',0-2',no',1',right',central',no',no-recurrence-events'
2,50-59',ge40',35-39',0-2',no',2',left',left_low',no',recurrence-events'
3,40-49',premeno',35-39',0-2',yes',3',right',left_low',yes',no-recurrence-events'
4,40-49',premeno',30-34',3-5',yes',2',left',right_up',no',recurrence-events'


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumer-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiate    286 non-null    object
 9   class        286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB


In [4]:
# Removal of apostrophes
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.rstrip("'")

# Checking if I removed apostrophes
df.head()

Unnamed: 0,age,menopause,tumer-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiate,class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events


In [5]:
# Filling in missing values in a way that we input the most frequent entries instead of missing ones
for column in ['node-caps', 'breast-quad']:
    most_frequent_value = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent_value)

    df.info( )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumer-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    286 non-null    object
 5   deg-malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiate    286 non-null    object
 9   class        286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumer-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-

In [6]:
##ENCODING
X = df.drop('class', axis=1)
y = df['class']

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

# To see how function labeled our classes
print(dict(zip(le.classes_, le.transform(le.classes_))))

{'no-recurrence-events': np.int64(0), 'recurrence-events': np.int64(1)}


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# We define which columns are ordinal and in which order
ordinal_features = ['age', 'tumer-size', 'inv-nodes', 'deg-malig']
age_cats = ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
tumer_size_cats = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54']
inv_nodes_cats = ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26']
deg_malig_cats = ['1', '2', '3']
ordinal_categories = [age_cats, tumer_size_cats, inv_nodes_cats, deg_malig_cats]

# We define whick columns are nominal
nominal_features = ['menopause', 'node-caps', 'breast', 'breast-quad', 'irradiate']



In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [10]:
# --- LOGISTIC REGRESSION STEP 1: Define the Preprocessor ---
# This object will apply different transformations to different columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# We define the preprocessor object
# It uses OrdinalEncoder for ordinal features and OneHotEncoder for nominal features
preprocessor = ColumnTransformer(transformers=[
    ('ord', OrdinalEncoder(categories=ordinal_categories), ordinal_features),
    ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_features)
])

In [11]:
# --- LOGISTIC REGRESSION STEP 2: Build the Full Pipeline ---
# This pipeline combines the preprocessing step with the classification model.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

model_pipeline = Pipeline(steps=[
    # Preprocess the data using defined preprocessor
    ('preprocessor', preprocessor),

    # Apply the classification model
    ('classifier', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])

In [15]:
# --- LOGISTIC REGRESSION STEP 3: Train the Model ---
# We fit the entire pipeline to our training data (X_train, y_train).
model_pipeline.fit(X_train, y_train)

In [16]:
# --- LOGISTIC TRESSION STEP 4: Evaluate the Model ---
# We use the trained model to make predictions on the unseen test data.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

print('--- Logistic Regression Performance ---')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('\nClassification Report:\n', classification_report(y_test, y_pred, target_names=le.classes_))

--- Logistic Regression Performance ---
Accuracy: 0.5862068965517241

Confusion Matrix:
 [[27 14]
 [10  7]]

Classification Report:
                       precision    recall  f1-score   support

no-recurrence-events       0.73      0.66      0.69        41
   recurrence-events       0.33      0.41      0.37        17

            accuracy                           0.59        58
           macro avg       0.53      0.54      0.53        58
        weighted avg       0.61      0.59      0.60        58



In [17]:
# --- FINAL OPTIMIZATION: Logistic Regression with Scaling and Tuning ---
# Import necessary classes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# 1. Create the final, improved pipeline that includes a scaler
# The scaler helps the model converge faster and often improves performance.
lr_final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000))
])

# 2. Define the parameter grid for 'C'
# We are searching for the best regularization strength.
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10, 100]
}

# 3. Set up and run GridSearchCV on the final pipeline
grid_search_lr = GridSearchCV(estimator=lr_final_pipeline,
                              param_grid=param_grid_lr,
                              cv=5,
                              scoring='recall',
                              n_jobs=-1)

# Fit the grid search to the training data
grid_search_lr.fit(X_train, y_train)

# 4. Print the results of the best tuned and scaled model
print("Best C parameter found: ", grid_search_lr.best_params_)

y_pred_final_lr = grid_search_lr.predict(X_test)

print('\n--- Tuned & Scaled Logistic Regression Performance ---')
print(classification_report(y_test, y_pred_final_lr, target_names=le.classes_))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, y_pred_final_lr))

Best C parameter found:  {'classifier__C': 10}

--- Tuned & Scaled Logistic Regression Performance ---
                      precision    recall  f1-score   support

no-recurrence-events       0.72      0.63      0.68        41
   recurrence-events       0.32      0.41      0.36        17

            accuracy                           0.57        58
           macro avg       0.52      0.52      0.52        58
        weighted avg       0.60      0.57      0.58        58


Confusion Matrix:
 [[26 15]
 [10  7]]


In [18]:
# --- FINAL EXPERIMENT 2: Logistic Regression with SMOTE Oversampling ---

# Import the specific Pipeline from imblearn and the SMOTE class
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Build the pipeline using SMOTE
# Note that the Logistic Regression classifier no longer needs class_weight='balanced'
# because SMOTE is handling the class imbalance.
smote_pipeline = ImbPipeline(steps=[
    # Step 1: Preprocess the data (the same as before)
    ('preprocessor', preprocessor),

    # Step 2: Apply SMOTE to the training data to create synthetic samples
    ('smote', SMOTE(random_state=42)),

    # Step 3: Apply the classification model
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Train the SMOTE pipeline
smote_pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_smote = smote_pipeline.predict(X_test)

print('--- Logistic Regression Performance with SMOTE ---')
print(classification_report(y_test, y_pred_smote, target_names=le.classes_))

--- Logistic Regression Performance with SMOTE ---
                      precision    recall  f1-score   support

no-recurrence-events       0.74      0.63      0.68        41
   recurrence-events       0.35      0.47      0.40        17

            accuracy                           0.59        58
           macro avg       0.55      0.55      0.54        58
        weighted avg       0.63      0.59      0.60        58

