In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import pickle

# Train an XGBoost classifier on the Wisconsin breast-cancer data using the
# features most correlated with the diagnosis, then pickle the fitted model
# and the scaler it was trained with.

# Column names for the header-less data file, in file order.
columns = [
    "id", "clump_thickness", "cell_size_uniformity", "cell_shape_uniformity", "marginal_adhesion",
    "single_epithelial_cell_size", "bare_nuclei", "bland_chromatin", "normal_nucleoli", "mitoses", "class",
]

# --- Configuration -----------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; point them at a
# project-relative data directory before sharing the notebook.
DATA_PATH = "/Users/darwinjuan/Downloads/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data"
MODEL_PATH = "/Users/darwinjuan/Downloads/ANA680/model.pkl"
SCALER_PATH = "/Users/darwinjuan/Downloads/ANA680/scaler.pkl"
N_TOP_FEATURES = 4  # number of features kept for the model
SEED = 42           # random_state passed to the classifier

# --- Load and clean ----------------------------------------------------------
df_raw = pd.read_csv(DATA_PATH, header=None, names=columns)

# "?" marks a missing bare_nuclei value: turn it into NaN, make the column
# numeric, and drop incomplete rows. Building `df` from `df_raw` (instead of
# reassigning a frame onto itself) keeps this step idempotent on re-run.
df = df_raw.assign(
    bare_nuclei=lambda frame: pd.to_numeric(frame["bare_nuclei"].replace("?", np.nan))
).dropna()

# --- Feature selection -------------------------------------------------------
# Absolute correlation of every column (except the id) with the target.
correlations = df.drop(columns=["id"]).corr()["class"].abs().sort_values(ascending=False)

# Remove the target by label rather than by position: relying on "class"
# sorting first (its self-correlation is 1.0) breaks if a feature ties with it.
top_features = correlations.drop("class").head(N_TOP_FEATURES).index.tolist()
assert len(top_features) == N_TOP_FEATURES, "fewer candidate features than requested"

print(f"Top {N_TOP_FEATURES} correlated features: {top_features}")

# --- Prepare model inputs ----------------------------------------------------
# Column order of X follows top_features; anything that later feeds the saved
# scaler/model must supply the features in this same order.
X = df[top_features]
# The raw label 4 is mapped to 1 and every other label to 0.
y = (df["class"] == 4).astype(int)

# Standardise features; the fitted scaler is persisted alongside the model so
# the same transform can be applied at prediction time.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Train -------------------------------------------------------------------
model = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=SEED,
    eval_metric="logloss",
)
model.fit(X_scaled, y)

# --- Persist -----------------------------------------------------------------
# NOTE: pickle files must only be loaded from trusted sources.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model, model_file)

with open(SCALER_PATH, "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

# Restores the confirmation line present in the cell's recorded output.
print("Model successfully saved")
    

Top 4 correlated features: ['bare_nuclei', 'cell_shape_uniformity', 'cell_size_uniformity', 'bland_chromatin']
Model successfully saved
