In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [4]:
# Source CSV for the dataset; the file carries no header row, so the column
# labels are supplied explicitly below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_names = [
    'id',
    'clump_thickness',
    'cell_size_uniformity',
    'cell_shape_uniformity',
    'marginal_adhesion',
    'single_epithelial_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]

# header=None is what pandas already assumes when `names` is given; it is
# spelled out here so the first data row is visibly not treated as a header.
df = pd.read_csv(url, header=None, names=column_names)


In [6]:
# Treat the '?' placeholder as missing, discard every row that has a missing
# value, then remove the 'id' column.
df = (
    df
    .replace('?', pd.NA)
    .dropna()
    .drop(columns='id')
)


In [8]:
# Convert each column to a numeric dtype; a column that held '?' placeholders
# would have been read in as strings rather than numbers.
df = df.apply(pd.to_numeric, axis=0)

In [10]:
# Features are every column except the label.
X = df.drop(columns='class')

# Recode the label to 0/1: the raw data uses 2 for benign, 4 for malignant.
class_to_binary = {2: 0, 4: 1}
y = df['class'].replace(class_to_binary)


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train and evaluate models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle


In [18]:
# Candidate classifiers, keyed by the display name used when reporting results.
# The tree-based estimators are randomized, so they get a fixed seed (the same
# value used for the train/test split); without it the reported accuracies and
# the selected model can change from one run of the notebook to the next.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}


In [20]:
# Running record of the best-scoring classifier; updated by the evaluation loop.
best_model = None       # estimator object with the highest accuracy so far
best_accuracy = 0       # accuracy achieved by best_model
best_model_name = ""    # display name of best_model


In [22]:
# Fit each candidate on the training split, report its accuracy on the test
# split, and keep track of the highest scorer.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Strict '>' means that on a tie the earlier candidate is retained.
    if accuracy > best_accuracy:
        best_model_name, best_model, best_accuracy = name, model, accuracy

print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")


Logistic Regression Accuracy: 0.9562
Decision Tree Accuracy: 0.9270
Random Forest Accuracy: 0.9489
SVM Accuracy: 0.9635

Best model: SVM with accuracy: 0.9635


In [24]:
# Persist the selected model and the fitted scaler, each to its own pickle
# file (model first, then scaler).
for pickle_path, artifact in (('model.pkl', best_model), ('scaler.pkl', scaler)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)
