In [2]:
# 1. IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

In [4]:
# 2. LOAD DATA
data = pd.read_csv("data.csv")

# Only a cell's last expression is rendered automatically; a bare
# `data.head(10)` in the middle of the cell is evaluated and thrown away.
# Print the shape explicitly so the preview can be the final expression.
print(data.shape)
data.head(10)

(569, 33)

In [5]:
# 3. BASIC CLEANING
# - drop exact duplicate rows
data = data.drop_duplicates()

# - drop columns that are not used as features ("Unnamed: 32" and "id").
#   errors="ignore" skips any that are absent, so the cell stays re-runnable
#   after the columns have already been removed.
data = data.drop(columns=["Unnamed: 32", "id"], errors="ignore")

# - encode target: "M" -> 1, "B" -> 0
#   Only map while the column still holds the raw labels. Re-running the cell
#   on already-encoded 0/1 values would stringify them to "0"/"1", which are
#   not keys of the mapping, and silently turn the whole target into NaN.
if not pd.api.types.is_numeric_dtype(data["diagnosis"]):
    diagnosis_labels = data["diagnosis"].astype(str).str.strip()
    diagnosis_encoded = diagnosis_labels.map({"M": 1, "B": 0})

    # Fail loudly on labels outside the mapping instead of leaving NaN targets.
    unmapped_labels = diagnosis_labels[diagnosis_encoded.isna()].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unexpected diagnosis labels: {sorted(unmapped_labels)}")

    data["diagnosis"] = diagnosis_encoded

print("Dataset A – diagnosis value counts:")
print(data["diagnosis"].value_counts(), "\n")

# - check missing values
print("Dataset A – missing values per column:")
print(data.isnull().sum(), "\n")

# - info() / describe()
print("Dataset A – info():")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None").
data.info()
print()

print("Dataset A – describe():")
print(data.describe().T, "\n")

Dataset A – diagnosis value counts:
diagnosis
0    357
1    212
Name: count, dtype: int64 

Dataset A – missing values per column:
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0

In [6]:
# 4. EXPLORATORY ANALYSIS
# - class distribution
# - histograms
# - correlation heatmap
# - pairplots
# - boxplots

In [7]:
# 5. DATA PREPROCESSING
# - scaling (if needed)
# - train/test split
# - categorical encoding
# - imputation if needed

In [8]:
# 6. MODELING DATASET A
# - Logistic Regression
# - RandomForest
# - SVM
# - PCA + model
# - GridSearchCV
# - ROC curve and confusion matrix

In [9]:
# 7. MODELING DATASET B
# - Logistic Regression baseline
# - RandomForest
# - GradientBoosting / XGBoost
# - Neural Network (MLP)
# - Hyperparameter tuning
# - ROC curve, confusion matrix

In [10]:
# 8. FEATURE IMPORTANCE ANALYSIS
# - RF feature importance
# - SHAP (optional)

In [11]:
# 9. RESULTS COMPARISON
# - summary tables
# - accuracy, F1, ROC-AUC vs model
# - visualisation

In [12]:
# 10. CONCLUSIONS FOR REPORT
# - strengths and weaknesses
# - comparison of datasets
# - future work recommendations