In [1]:
# 1. IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

In [2]:
# 2. LOAD DATA
data1 = pd.read_csv("data.csv")
data1.head(10)
data1.shape

(569, 33)

In [3]:
# 2. LOAD DATA
data2 = pd.read_csv("Breast_Cancer.csv")
data2.head(10)
data2.shape

(2509, 34)

In [4]:
# 3. BASIC CLEANING
# - drop duplicates
data1 = data1.drop_duplicates()

# - fix column names
for col in ["Unnamed: 32", "id"]:
    if col in data1.columns:
        data1 = data1.drop(columns = [col])

# - encode targets
data1["diagnosis"] = (data1["diagnosis"].astype(str).str.strip().map({"M": 1, "B": 0}))

print("Dataset A – diagnosis value counts:")
print(data1["diagnosis"].value_counts(), "\n")

# - check missing values
print("Dataset A – missing values per column:")
print(data1.isnull().sum(), "\n")

# - describe()
print("Dataset A – info():")
print(data1.info(), "\n")

print("Dataset A – describe():")
print(data1.describe().T, "\n")

Dataset A – diagnosis value counts:
diagnosis
0    357
1    212
Name: count, dtype: int64 

Dataset A – missing values per column:
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0

In [5]:
# 3. BASIC CLEANING
# - fix column names
data2.columns = (data2.columns.str.strip().str.replace(" ", "_").str.replace("'", "", regex=False))

drop_cols = []

# Patient_ID (useless for prediction)
if "Patient_ID" in data2.columns:
    drop_cols.append("Patient_ID")

# Patients_Vital_Status is leakage (it directly says 'Living'/'Deceased')
if "Patients_Vital_Status" in data2.columns:
    drop_cols.append("Patients_Vital_Status")

# Drop collected columns
if drop_cols:
    data2 = data2.drop(columns=drop_cols)

# - encode targets
if "Overall_Survival_Status" in data2.columns:
    data2["Overall_Survival_Status"] = data2["Overall_Survival_Status"].astype(str).str.strip()
    
    # Map Living -> 1 (survived), Deceased -> 0 (died)
    data2["survived"] = data2["Overall_Survival_Status"].map({"Living": 1, "Deceased": 0})
    
    # Drop original column
    data2 = data2.drop(columns=["Overall_Survival_Status"])

print("\nDataset B – survived value counts:")
print(data2["survived"].value_counts(), "\n")

# - check missing values
print("Dataset B – missing values before imputing:")
print(data2.isnull().sum().sort_values(ascending=False).head(15), "\n")

# Numeric columns → fill with median
numeric_cols = data2.select_dtypes(include=[np.number]).columns
data2[numeric_cols] = data2[numeric_cols].fillna(data2[numeric_cols].median())

# Categorical columns → fill with most frequent value
categorical_cols = data2.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    data2[col] = data2[col].fillna(data2[col].mode()[0])

print("Dataset B – missing values AFTER imputing:", data2.isnull().sum().sum(), "\n")

# - One-Hot Encoding
categorical_cols = data2.select_dtypes(include=["object"]).columns
data2 = pd.get_dummies(data2, columns=categorical_cols, drop_first=True)

# - describe()
print("Dataset B – info():")
print(data2.info(), "\n")

print("Dataset B – describe():")
print(data2.describe().T)


Dataset B – survived value counts:
survived
0.0    1144
1.0     837
Name: count, dtype: int64 

Dataset B – missing values before imputing:
3-Gene_classifier_subtype       745
Tumor_Stage                     721
Primary_Tumor_Laterality        639
Cellularity                     592
Type_of_Breast_Surgery          554
Integrative_Cluster             529
Inferred_Menopausal_State       529
Chemotherapy                    529
Pam50_+_Claudin-low_subtype     529
Radio_Therapy                   529
PR_Status                       529
HER2_status_measured_by_SNP6    529
HER2_Status                     529
Hormone_Therapy                 529
Overall_Survival_(Months)       528
dtype: int64 

Dataset B – missing values AFTER imputing: 0 

Dataset B – info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 68 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                          

In [6]:
# 4. EXPLORATORY ANALYSIS
# - class distribution
# - histograms
# - correlation heatmap
# - pairplots
# - boxplots

In [7]:
# 5. DATA PREPROCESSING
# - scaling (if needed)
# - train/test split
# - categorical encoding
# - imputation if needed

In [8]:
# 6. MODELING DATASET A
# - Logistic Regression
# - RandomForest
# - SVM
# - PCA + model
# - GridSearchCV
# - ROC curve and confusion matrix

In [9]:
# 7. MODELING DATASET B
# - Logistic Regression baseline
# - RandomForest
# - GradientBoosting / XGBoost
# - Neural Network (MLP)
# - Hyperparameter tuning
# - ROC curve, confusion matrix

In [10]:
# 8. FEATURE IMPORTANCE ANALYSIS
# - RF feature importance
# - SHAP (optional)

In [11]:
# 9. RESULTS COMPARISON
# - summary tables
# - accuracy, F1, ROC-AUC vs model
# - visualisation

In [12]:
# 10. CONCLUSIONS FOR REPORT
# - strengths and weaknesses
# - comparison of datasets
# - future work recommendations