# Refactored Example

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from utils import compute_feature_frequency, load_uci_dataset, train_ensemble_models, normalize_dataset

In [2]:
# Dataset configuration: edit these two values to point at a different dataset.
# Path of the local CSV copy of the dataset.
FILE_PATH = "../data/breast_cancer_clinical_records.csv"
# Repository ID of the dataset (passed to load_uci_dataset as repo_id).
REPO_ID = 15

In [3]:
# Load the dataset from the UCI Machine Learning Repository only when no local
# CSV copy exists yet, then save it to FILE_PATH so later runs read the file.
if not os.path.exists(FILE_PATH):
    clinical_records_metadata, clinical_records = load_uci_dataset(repo_id=REPO_ID)
    print(clinical_records_metadata)
    # to_csv does not create missing parent directories, so make sure the
    # target directory exists first (skipped when FILE_PATH has no directory
    # part, because os.makedirs("") raises).
    data_dir = os.path.dirname(FILE_PATH)
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    # index=False keeps the row index out of the saved file.
    clinical_records.to_csv(FILE_PATH, index=False)

In [4]:
# Read the local CSV copy of the dataset, report how many rows it holds,
# and preview the first few records.
clinical_records = pd.read_csv(FILE_PATH)
num_samples = clinical_records.shape[0]
print("Number of samples:", num_samples)
clinical_records.head()

Number of samples: 699


Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [5]:
# Normalize the dataset with normalize_dataset, using 3 bins, and preview the
# result.
# NOTE(review): the saved output of this cell shows `clinical_records` itself
# holding "Bin N" labels, which suggests normalize_dataset modifies its input
# in place. Confirm, and pass a copy if the raw values are needed later.
# NOTE(review): the saved output also lists `Class` among the binned columns.
# Verify that discretizing the target label is intended.
clinical_records_norm = normalize_dataset(dataset=clinical_records, num_bins=3)
# Display the normalized frame (the result of this step), not the input frame.
clinical_records_norm.head()

Continuous columns identified: Index(['Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses', 'Class'],
      dtype='object')


Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,Bin 2,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1
1,Bin 2,Bin 1,Bin 1,Bin 2,Bin 2,Bin 3,Bin 1,Bin 1,Bin 1,Bin 1
2,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1
3,Bin 2,Bin 3,Bin 3,Bin 1,Bin 1,Bin 1,Bin 1,Bin 2,Bin 1,Bin 1
4,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1,Bin 1
