In [None]:
import pandas as pd
# Read the raw breast-cancer table; the path is relative to the notebook's
# working directory.
data_path = '../data/breast-cancer-data.csv'
df = pd.read_csv(data_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Print the frame summary (column dtypes and non-null counts) to spot missing values.
df.info()

In [None]:
# Strip trailing apostrophes from every text column.
# The dtype is tested with is_string_dtype instead of `== 'object'`: pandas'
# dedicated string dtype does not compare equal to 'object', so the old check
# would silently skip every column when text is inferred as that dtype.
# The dtype (not the Series) is passed on purpose, so object columns that
# still contain missing values are matched as well.
for col in df.columns:
    if pd.api.types.is_string_dtype(df[col].dtype):
        df[col] = df[col].str.rstrip("'")

# Check that the apostrophes were removed
df.head()

In [None]:
# Replace missing entries in the listed columns with each column's most
# frequent value (its mode; mode()[0] picks the first one when there are ties).
for column in ['node-caps', 'breast-quad']:
    most_frequent_value = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent_value)

# Summarise once, after the loop: inside the loop body this printed the
# summary once per column, the first time with only one column imputed.
df.info()

In [None]:
## ENCODING
# Separate the target column from the predictors: `y` holds the 'class'
# labels and `X` holds every remaining column.
y = df['class']
X = df.drop(columns='class')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Show which integer code was assigned to each original class label.
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(class_mapping)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Ordered levels for each ordinal column, listed from lowest to highest.
age_cats = ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
tumer_size_cats = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54']
inv_nodes_cats = ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26']
deg_malig_cats = ['1', '2', '3']

# Pair every ordinal column with its level list in one place, so the column
# names and the category lists cannot drift out of order.
ordinal_levels = {
    'age': age_cats,
    'tumer-size': tumer_size_cats,
    'inv-nodes': inv_nodes_cats,
    'deg-malig': deg_malig_cats,
}
ordinal_features = list(ordinal_levels.keys())
ordinal_categories = list(ordinal_levels.values())

# Nominal columns (no inherent order among their values).
nominal_features = ['menopause', 'node-caps', 'breast', 'breast-quad', 'irradiate']



In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the encoded labels
# keeps the class proportions in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [37]:
# --- LOGISTIC REGRESSION STEP 1: Define the Preprocessor ---
# This object will apply different transformations to different columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Build one encoder per feature family, then route each family's columns to
# its encoder through a ColumnTransformer.
# - ordinal columns -> integer codes following the explicit level order
# - nominal columns -> one-hot indicators; categories unseen during fit are
#   ignored at transform time instead of raising
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
nominal_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, ordinal_features),
        ('nom', nominal_encoder, nominal_features),
    ]
)

In [39]:
# --- LOGISTIC REGRESSION STEP 2: Build the Full Pipeline ---
# This pipeline combines the preprocessing step with the classification model.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Classification model: logistic regression with a fixed seed, a raised
# iteration cap, and class weights balanced against class frequencies.
log_reg = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
)

# Full pipeline: encode the features with the preprocessor, then classify.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', log_reg),
    ]
)