In [7]:
!pip install matplotlib seaborn scipy

Collecting scipy
  Using cached scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Using cached scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl (20.1 MB)
Installing collected packages: scipy
Successfully installed scipy-1.17.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

df = pd.read_csv('Data/data.csv')

**Understanding import cell**
```
This cell imports all of the modules and provies a path to read the dataset.
```

In [9]:
print("Data Shape:\nNumber of Rows, Number of Features:\n")
print(df.shape)

Data Shape:
Number of Rows:                 Number of Features:

(569, 33)


## This cell prints the shape of the dataset (there are 569 rows or samples and 33 columns or features for each sample).

In [10]:
print("List of columns:\n")
print(df.columns)

List of columns:

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')


**Understanding df.columns**
```
This cell prints a list of the name for each column. This helps us understand what analysis we are able to do with the data // The 'Unnamed: 32' is an artifact from how the file was saved and the data contains an extra column that is blank.

Each of the features is a variable that we can analyze.

The columns that end with _mean are showing the average (mean) of the feature value across the sample.

The columns that end with _se are showing the "standard error" of the feature or the variability/spread of that feature's measurements.

The columns that end with _worst are showing the largest/most severe value of the feature.
```

In [51]:
# Extract and display only the base feature names (without _mean, _se, _worst)
raw_features = str(df.columns)
processed_features = raw_features.strip('Index')
processed_features = processed_features.replace('\n','')
processed_features = processed_features.replace(',','\n')
processed_features = processed_features.split('\n')
def ProcessText(features):
    max_runs = (len(processed_features)-1)
    x = 0
    for i in processed_features:
        processed_features[x] = i.strip("([ ])")
        processed_features[x] = processed_features[x].replace("_se","")
        processed_features[x] = processed_features[x].replace("_mean","")
        processed_features[x] = processed_features[x].replace("_worst","")
        if x == max_runs:
            return processed_features
        else:
            x += 1
final_processed = ProcessText(processed_features)
print(final_processed)

["'id'", "'diagnosis'", "'radius'", "'texture'", "'perimeter'", "'area'", "'smoothness'", "'compactness'", "'concavity'", "'concave points'", "'symmetry'", "'fractal_dimension'", "'radius'", "'texture'", "'perimeter'", "'area'", "'smoothness'", "'compactness'", "'concavity'", "'concave points'", "'symmetry'", "'fractal_dimension'", "'radius'", "'texture'", "'perimeter'", "'area'", "'smoothness'", "'compactness'", "'concavity'", "'concave points'", "'symmetry'", "'fractal_dimension'", "'Unnamed: 32'", "dtype='object'"]


In [11]:
print("Features:                 Data Type:\n")
print(df.dtypes)

Features:                 Data Type:

id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst

In [12]:
print("First Few Rows:\n")
print(df.head())

First Few Rows:

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_

In [13]:
print("Last Few Rows:\n")
print(df.tail())

Last Few Rows:

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
564  926424         M        21.56         22.39          142.00     1479.0   
565  926682         M        20.13         28.25          131.20     1261.0   
566  926954         M        16.60         28.08          108.30      858.1   
567  927241         M        20.60         29.33          140.10     1265.0   
568   92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  texture_worst  perimeter

In [14]:
print("Features:                 Memory Usage:\n")
print(df.memory_usage())

Features:                 Memory Usage:

Index                       132
id                         4552
diagnosis                  4552
radius_mean                4552
texture_mean               4552
perimeter_mean             4552
area_mean                  4552
smoothness_mean            4552
compactness_mean           4552
concavity_mean             4552
concave points_mean        4552
symmetry_mean              4552
fractal_dimension_mean     4552
radius_se                  4552
texture_se                 4552
perimeter_se               4552
area_se                    4552
smoothness_se              4552
compactness_se             4552
concavity_se               4552
concave points_se          4552
symmetry_se                4552
fractal_dimension_se       4552
radius_worst               4552
texture_worst              4552
perimeter_worst            4552
area_worst                 4552
smoothness_worst           4552
compactness_worst          4552
concavity_worst            4552

In [15]:
print("Features:                Null Data:\n")
print(df.isnull().sum())

Features:                Null Data:

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fr

In [16]:
print("Identify Duplicate Rows:\n")
print(df.duplicated())

Identify Duplicate Rows:

0      False
1      False
2      False
3      False
4      False
       ...  
564    False
565    False
566    False
567    False
568    False
Length: 569, dtype: bool


In [17]:
print("Overview / Summary of Data:\n")
print(df.describe())

Overview / Summary of Data:

                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.

In [18]:
print("Number of Unique Values:")
print("Features:                 Unique Values:\n")
print(df.nunique())

Number of Unique Values:
Features:                 Unique Values:

id                         569
diagnosis                    2
radius_mean                456
texture_mean               479
perimeter_mean             522
area_mean                  539
smoothness_mean            474
compactness_mean           537
concavity_mean             537
concave points_mean        542
symmetry_mean              432
fractal_dimension_mean     499
radius_se                  540
texture_se                 519
perimeter_se               533
area_se                    528
smoothness_se              547
compactness_se             541
concavity_se               533
concave points_se          507
symmetry_se                498
fractal_dimension_se       545
radius_worst               457
texture_worst              511
perimeter_worst            514
area_worst                 544
smoothness_worst           411
compactness_worst          529
concavity_worst            539
concave points_worst       492
sym