In [None]:
!pip install matplotlib seaborn scipy

In [71]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the dataset
df = pd.read_csv('Data/data.csv')

# Display first few rows to confirm it loaded
print(df.head())

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

### Part 1: The 10-Point Data Inspection

In [15]:
# Step 1
print('Print the "Data Shape":\nNumber of Rows, Number of Features:')
print(df.shape)

Print the "Data Shape":
Number of Rows, Number of Features:
(569, 33)


**Understanding df.shape**
```
This cell prints the shape of the dataset (there are 569 rows or samples and 33 columns or features for each sample).
```
- How many rows (observations/patients)? 569
- How many columns (features)? 33
- What does each row represent in clinical terms? Each row represents a cytological sample (a sample of cells) from an individual's breast cancer.

In [16]:
# Step 2
print("List of columns:\n")
print(df.columns)

List of columns:

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')


**Understanding df.columns**
```
This cell prints a list of the name for each column. This helps us understand what analysis we are able to do with the data // The 'Unnamed: 32' is an artifact from how the file was saved and the data contains an extra column that is blank.

Each of the features is a variable that we can analyze.

The naming pattern is suffix-based:

The columns that end with _mean are showing the average (mean) of the feature value across the sample.

The columns that end with _se are showing the "standard error" of the feature or the variability/spread of that feature's measurements.

The columns that end with _worst are showing the largest/most severe value of the feature.

Some columns that may need further research are compactness, concavity, concave points, and fractal_dimension. These values come from specific nuclear geometry computations rather than direct measurements.
```

In [28]:
# Extract and display only the base feature names (without _mean, _se, _worst)
raw_features = str(df.columns)
processed_features = raw_features.strip('Index')
processed_features = processed_features.replace('\n','')
processed_features = processed_features.replace(',','\n')
processed_features = processed_features.split('\n')
def ProcessText(features):
    max_runs = (len(processed_features)-1)
    x = 0
    for i in processed_features:
        processed_features[x] = i.strip("([ ])")
        processed_features[x] = processed_features[x].replace("_se","")
        processed_features[x] = processed_features[x].replace("_mean","")
        processed_features[x] = processed_features[x].replace("_worst","")
        if x == max_runs:
            return processed_features
        else:
            x += 1
final_processed = ProcessText(processed_features)
print(final_processed)

["'id'", "'diagnosis'", "'radius'", "'texture'", "'perimeter'", "'area'", "'smoothness'", "'compactness'", "'concavity'", "'concave points'", "'symmetry'", "'fractal_dimension'", "'radius'", "'texture'", "'perimeter'", "'area'", "'smoothness'", "'compactness'", "'concavity'", "'concave points'", "'symmetry'", "'fractal_dimension'", "'radius'", "'texture'", "'perimeter'", "'area'", "'smoothness'", "'compactness'", "'concavity'", "'concave points'", "'symmetry'", "'fractal_dimension'", "'Unnamed: 32'", "dtype='object'"]


**Above cell is a list of all column names**  
*Note: the suffixes have been removed due to the instructions in breast_cancer_10point_problems. (these were Standard Error: _se, _mean, and _worst)*


In [17]:
# Step 3
print("Features:                 Data Type:\n")
print(df.dtypes)

Features:                 Data Type:

id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst

**Understanding df.dtypes**
```
This cell prints the name of each column (features) and the data type for each.
```
- All of the columns are numeric except for `diagnosis` and the final column which is unnamed (`id` is int64 while the remaining columns are float64)

- The `diagnosis` column is an "object" type

- Are there any data types that seem incorrect? The final column is unnamed and has no values in any of the rows. However, this column is still counted as float64 and all of the values are NaN

In [18]:
# Step 4
print("First Few Rows:\n")
print(df.head())

First Few Rows:

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_

**Understanding df.head()**
```
This cell prints the the first 5 rows (0-4) under the name for each column
```
- Most of the values are floating point numbers and a significant amount of them are **<1**

- Do you notice anything unusual or unexpected? Only the previously mentioned unnamed column with no values.

- What are the possible values for the `diagnosis` column? **"M"** for malignant or **"B"** for benign.

In [19]:
# Step 5
print("Last Few Rows:\n")
print(df.tail())

Last Few Rows:

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
564  926424         M        21.56         22.39          142.00     1479.0   
565  926682         M        20.13         28.25          131.20     1261.0   
566  926954         M        16.60         28.08          108.30      858.1   
567  927241         M        20.60         29.33          140.10     1265.0   
568   92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  texture_worst  perimeter

**Understanding df.tail()**
```
This cell prints the the last 5 rows (in this dataset rows 564-568) under the name for each column
```
- This data ends clearly
- The last rows **do** seem consistent with the first rows.

In [19]:
# Step 6
print("Features:                 MemoryUsage:\n")
print(df.memory_usage(deep=True))
print("\nTotal usage (bytes):")
print(df.memory_usage(deep=True).sum())

Features:                 MemoryUsage:

Index                        132
id                          4552
diagnosis                  28450
radius_mean                 4552
texture_mean                4552
perimeter_mean              4552
area_mean                   4552
smoothness_mean             4552
compactness_mean            4552
concavity_mean              4552
concave points_mean         4552
symmetry_mean               4552
fractal_dimension_mean      4552
radius_se                   4552
texture_se                  4552
perimeter_se                4552
area_se                     4552
smoothness_se               4552
compactness_se              4552
concavity_se                4552
concave points_se           4552
symmetry_se                 4552
fractal_dimension_se        4552
radius_worst                4552
texture_worst               4552
perimeter_worst             4552
area_worst                  4552
smoothness_worst            4552
compactness_worst           4552
con

**Understanding df.memory_usage()**
```
This cell prints the memory usage of each column in bytes.
```
- This dataset is only 174.246 KB
- This is a small dataset as large ones can be multiple gigabytes.

In [21]:
# Step 7
print("Features:                Null Data:\n")
print(df.isnull().sum())

Features:                Null Data:

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fr

**Understanding df.isnull.sum()**
```
This cell prints all of the features and a count of the rows that have null values within that category
```

In [22]:
# Step 8
print("Identify Duplicate Rows:\n")
print(df.duplicated())

Identify Duplicate Rows:

0      False
1      False
2      False
3      False
4      False
       ...  
564    False
565    False
566    False
567    False
568    False
Length: 569, dtype: bool


**Understanding df.duplicated()**
```
This cell prints a list of the rows and an indication of if it is a duplicate of another row. In this case there are no duplicated rows
```

In [23]:
# Step 9
print("Overview / Summary of Data:\n")
print(df.describe())

Overview / Summary of Data:

                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.

**Understanding df.describe()**
```
This cell prints an overview of the data. It summarizes each feature with a count, mean, standard deviation, minimum value, lower percentile (25%), median (50%), upper percentile (75%), and maximum value
```

In [None]:
# Step 10
print("Number of Unique Values:")
print("Features:                 Unique Values:\n")
print(df.nunique())

**Understanding df.nunique()**
```
This cell prints the number of nuique values for each feature. Some cells have more unique values than others, this is the cardinality of the value
```

### Part 2: List of all columns

In [67]:
df = df.iloc[:, :-1]

### Part 4: Create Cell Irregularity Groups

| Irregularity Category | Concavity Range | Clinical Rationale |
|-----------------------|-----------------|-------------------|
| Smooth | 0 - 0.03 | Minimal concavity, regular cell boundaries |
| Mild | 0.03 - 0.08 | Some irregularity, often seen in benign tumors |
| Moderate | 0.08 - 0.15 | Notable irregularity, warrants attention |
| Severe | 0.15 - 0.25 | High irregularity, suspicious for malignancy |
| Extreme | > 0.25 | Very irregular, strong indicator of malignancy |


In [72]:
bins = [0, 0.035, 0.085, 0.155, 0.255, np.inf]
labels = ['Smooth', 'Mild', 'Moderate', 'Severe', 'Extreme']
df['concavity_category'] = pd.cut(df['concavity_mean'], bins=bins, labels=labels, include_lowest=True)
print(df['concavity_category'])

0       Extreme
1      Moderate
2        Severe
3        Severe
4        Severe
         ...   
564      Severe
565    Moderate
566    Moderate
567     Extreme
568      Smooth
Name: concavity_category, Length: 569, dtype: category
Categories (5, object): ['Smooth' < 'Mild' < 'Moderate' < 'Severe' < 'Extreme']


In [95]:
category_counts = {}
malignancy_category_count = 0
benign_category_count = 0
for category in df['concavity_category'].cat.categories:
    count = len(df[df['concavity_category'] == category])
    category_counts[category] = count
    malignant_in_category = len(df[(df['concavity_category'] == category) & (df['diagnosis'] == 'M')])
    benign_in_category = count - malignant_in_category  # Benign = total - malignant
    malignancy_category_count += malignant_in_category
    benign_category_count += benign_in_category
    
    print(f"{category}: {count} cases ({malignant_in_category} M, {benign_in_category} B)")
print("\nTotal malignant:", malignancy_category_count)
print("Total benign:", benign_category_count)
print("Total cases:", malignancy_category_count + benign_category_count)

Smooth: 172 cases (5 M, 167 B)
Mild: 167 cases (21 M, 146 B)
Moderate: 125 cases (86 M, 39 B)
Severe: 83 cases (81 M, 2 B)
Extreme: 22 cases (19 M, 3 B)

Total malignant: 212
Total benign: 357
Total cases: 569


In [75]:
print(category_counts)

{'Smooth': 172, 'Mild': 167, 'Moderate': 125, 'Severe': 83, 'Extreme': 22}
