In [1]:
# import matplotlib as plt
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sb
import warnings as datawarnings
datawarnings.filterwarnings('ignore')
        

In [2]:
# Load the cleaned dataset (presumably written by an earlier cleaning notebook — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('df_cleaned.pkl')
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


# Feature engineering

## Adding new features:
In this dataset, there are various measurements related to breast cancer diagnosis. 

For each type of measurement, there are three columns: mean value, worst value, and SE value.

For each such set of columns, we added 4 calculated columns based on the existing three columns.

The following is a detailed explanation of the 4 columns created for each measurement:


**_mean_to_worst_ratio:**

Calculation: Divides the mean of the feature by the sum of the worst-case value and a small epsilon (to avoid division by zero).
Interpretation: This ratio indicates how much the mean deviates from the worst-case value. A higher ratio suggests that the mean is closer to the worst-case, while a lower ratio implies a larger difference.

**_se_to_mean_ratio:**

Calculation: Divides the standard error of the feature by the mean (plus epsilon).
Interpretation: This ratio can be used to assess relative variability. A higher ratio indicates a larger standard error relative to the mean, suggesting more variability in the data.

**_worst_mean_diff:**

Calculation: Subtracts the mean from the worst-case value.
Interpretation: This difference directly shows how much the worst-case value deviates from the average.

**_z_score_worst:**

Calculation: Subtracts the mean from the worst-case value and divides the difference by the standard error (plus epsilon).
Interpretation: A higher value indicates that the worst-case value is further away from the mean, measured in units of the feature's standard error.

**In summary:**
The code creates new columns in a DataFrame df2 to store these calculated metrics. 

In [5]:
import re

# Engineered features are written to a copy so the loaded frame `df` stays untouched.
df2 = df.copy()
# to avoid dividing by zero add this value to the denominator
epsilon = 1e-6

for col in df.columns:
    # Anchor the pattern at the END of the name: only genuine "<measurement>_mean"
    # columns qualify. An unanchored search would also match any column that merely
    # contains "_mean" (e.g. an engineered "<measurement>_mean_to_worst_ratio"),
    # and the lookups below would then fail with a KeyError.
    if not re.search(r"_mean$", col):
        continue

    # Measurement name without its "_mean" suffix, e.g. "radius" or "concave points".
    first_part = col[:col.rfind('_')]

    # The three source columns of this measurement, read once and reused below.
    mean_values = df[f"{first_part}_mean"]
    worst_values = df[f"{first_part}_worst"]
    se_values = df[f"{first_part}_se"]

    # This could give insight into how much the mean deviates from the extremity
    df2[f"{first_part}_mean_to_worst_ratio"] = mean_values / (worst_values + epsilon)

    # This ratio can indicate relative variability
    df2[f"{first_part}_se_to_mean_ratio"] = se_values / (mean_values + epsilon)

    # This can highlight how much the worst-case deviates from the average.
    df2[f"{first_part}_worst_mean_diff"] = worst_values - mean_values

    # how many standard errors the worst-case is away from the mean
    df2[f"{first_part}_z_score_worst"] = (worst_values - mean_values) / (se_values + epsilon)



**check for null values**

In [7]:
# Flag every row that has a missing value in at least one column,
# then report how many such rows exist (0 means the frame is complete).
null_row_mask = df2.isna().any(axis="columns")
rows_with_nulls = df2.loc[null_row_mask]
print(len(rows_with_nulls))


0


# Feature selection


In [9]:
# Target vector: the diagnosis label.
y = df2.loc[:, 'diagnosis']

# Feature matrix: everything except the label and the 'id' column.
non_feature_columns = ['diagnosis', 'id']
X = df2.drop(columns=non_feature_columns)

# Print column names, non-null counts and dtypes of the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 70 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   radius_mean                            569 non-null    float64
 1   texture_mean                           569 non-null    float64
 2   perimeter_mean                         569 non-null    float64
 3   area_mean                              569 non-null    float64
 4   smoothness_mean                        569 non-null    float64
 5   compactness_mean                       569 non-null    float64
 6   concavity_mean                         569 non-null    float64
 7   concave points_mean                    569 non-null    float64
 8   symmetry_mean                          569 non-null    float64
 9   fractal_dimension_mean                 569 non-null    float64
 10  radius_se                              569 non-null    float64
 11  textur

##


## Multivariable feature selection

At this stage, I want to filter out columns that do not contribute and reach 30 columns out of 70 

using multivariable selection with the following models: 

Lasso, Ridge, SVM, GradientBoost, RandomForest, XGBoost.

In [11]:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.inspection import permutation_importance

# Fit models and determine if a feature is selected (1) or not (0).
# One seed is shared by every stochastic step so that Restart & Run All
# reproduces the same selection table.
RANDOM_STATE = 42

# Lasso: a feature counts as selected when its coefficient is non-zero.
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Ridge: same non-zero-coefficient criterion.
# NOTE(review): an L2 penalty shrinks coefficients but does not normally set them
# to exactly zero, so this criterion marks (almost) every feature as selected and
# adds little discrimination to the vote — consider a magnitude threshold.
ridge = Ridge(alpha=1.0).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# SVC with an RBF kernel exposes no per-feature coefficients, so permutation
# importance on accuracy is used instead; a feature is selected when shuffling it
# lowers the score on average. The shuffles are seeded for reproducibility.
svc = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=RANDOM_STATE).fit(X, y)
results = permutation_importance(svc, X, y, scoring='accuracy', random_state=RANDOM_STATE)
svc_selected = (results.importances_mean > 0).astype(int)

# Tree ensembles: a feature is selected when its importance is strictly positive.
gb = GradientBoostingClassifier(random_state=RANDOM_STATE).fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

rf = RandomForestClassifier(random_state=RANDOM_STATE).fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# The fitted estimator gets its own name: assigning it to `xgb` would overwrite
# the imported module alias and make this cell fail when it is run a second time.
xgb_clf = xgb.XGBClassifier(random_state=RANDOM_STATE).fit(X, y)
xgb_selected = (xgb_clf.feature_importances_ > 0).astype(int)

# Create a DataFrame to store results: one row per feature, one 0/1 column per model.
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'Ridge': ridge_selected,
    'SVC': svc_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Xgb': xgb_selected,
})

# Sum the number of selections for each feature (0-6 votes).
vote_columns = ['Lasso', 'Ridge', 'SVC', 'GradientBoost', 'RandomForest', 'Xgb']
selection_df['Sum'] = selection_df[vote_columns].sum(axis=1)

# Most-voted features first.
selection_df = selection_df.sort_values(by='Sum', ascending=False)
print(selection_df)



                       Feature  Lasso  Ridge  SVC  GradientBoost  \
13                     area_se      1      1    1              1   
3                    area_mean      1      1    1              1   
21               texture_worst      1      1    1              1   
22             perimeter_worst      1      1    1              1   
44        area_worst_mean_diff      1      1    1              1   
..                         ...    ...    ...  ...            ...   
28              symmetry_worst      0      1    0              1   
29     fractal_dimension_worst      0      1    0              1   
30  radius_mean_to_worst_ratio      0      1    0              0   
65      symmetry_z_score_worst      0      1    0              0   
33        radius_z_score_worst      0      1    0              0   

    RandomForest  Xgb  Sum  
13             1    1    6  
3              1    1    6  
21             1    1    6  
22             1    1    6  
44             1    1    6  
..       

In [12]:

# Vote count per feature, reused for every threshold below.
votes = selection_df['Sum']

# Features selected by at least five of the models.
more_than_5 = selection_df.loc[votes >= 5, 'Feature'].tolist()
print(f"\nMore or equal 5, total columns: {len(more_than_5)}\n")
print(more_than_5)

# Features selected by at least four of the models (includes the group above).
more_than_4 = selection_df.loc[votes >= 4, 'Feature'].tolist()
print(f"\nMore or equal 4, total columns: {len(more_than_4)}\n")
print(more_than_4)

# Features selected by fewer than three of the models.
less_then_3 = selection_df.loc[votes < 3, 'Feature'].tolist()
print(f"\nless then 3, total columns: {len(less_then_3)}\n")
print(less_then_3)



More or equal 5, total columns: 12

['area_se', 'area_mean', 'texture_worst', 'perimeter_worst', 'area_worst_mean_diff', 'area_worst', 'fractal_dimension_z_score_worst', 'perimeter_mean', 'texture_mean', 'texture_worst_mean_diff', 'perimeter_worst_mean_diff', 'perimeter_se']

More or equal 4, total columns: 47

['area_se', 'area_mean', 'texture_worst', 'perimeter_worst', 'area_worst_mean_diff', 'area_worst', 'fractal_dimension_z_score_worst', 'perimeter_mean', 'texture_mean', 'texture_worst_mean_diff', 'perimeter_worst_mean_diff', 'perimeter_se', 'smoothness_mean_to_worst_ratio', 'area_se_to_mean_ratio', 'smoothness_worst_mean_diff', 'area_mean_to_worst_ratio', 'texture_z_score_worst', 'smoothness_se_to_mean_ratio', 'compactness_z_score_worst', 'smoothness_z_score_worst', 'radius_worst_mean_diff', 'concavity_mean_to_worst_ratio', 'concavity_se_to_mean_ratio', 'concavity_worst_mean_diff', 'concavity_z_score_worst', 'concave points_mean_to_worst_ratio', 'concave points_se_to_mean_ratio'

### Multivariable feature selection - results

The models consistently favored the original features.

<u>More or equal 5:</u>
This category contains 12 columns that have a score of 5 or higher, indicating they are considered relatively important.

<u>More or equal 4:</u> This category contains 47 columns with a score of 4 or higher, including those from the previous category.
 
**The goal of reducing the number of columns to 30 has not been achieved using this method.**

##

## Univariable feature selection
Given the inability to reduce the feature set to 30 columns using the previous method,
we will explore a univariate approach where each feature's relationship with the target variable is assessed

### distribution of each column

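Since this method examines each column individually in relation to the target column, we must first assess the distribution of each column. 
As a practical proxy for normality, a column is treated as approximately normal when the absolute value of its skewness is at most 0.5, and as non-normal otherwise. 
Based on the distribution, we will employ the appropriate statistical test. 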

For columns with a <u>**normal distribution, we will use a t-test.**</u>

For columns with a <u>**non-normal distribution, we will use the Wilcoxon Rank-Sum test.**</u>

In [16]:
# Skewness of every feature column; its magnitude decides which test a column gets.
skewness_values = X.skew()
skew_magnitude = skewness_values.abs()

# |skew| <= 0.5: handled with the parametric test.
para_columns = skewness_values.index[skew_magnitude <= 0.5].tolist()
print(f"\ntotal parametric columns:{len(para_columns)}\n")

# |skew| > 0.5: handled with the nonparametric test.
non_para_columns = skewness_values.index[skew_magnitude > 0.5].tolist()
print(f"\ntotal nonparametric columns:{len(non_para_columns)}\n")


total parametric columns:11


total nonparametric columns:59



**Parametric t-test**

In [18]:
from scipy import stats

# Row masks for the two diagnosis groups, built once and reused for every column.
benign_rows = y == 0      # Group 0 (benign)
malignant_rows = y == 1   # Group 1 (malignant)

# Independent two-sample t-test per low-skew feature.
# NOTE(review): ttest_ind uses its default equal-variance setting here; the two
# groups may not satisfy that assumption — consider Welch's variant.
results = []
for column in para_columns:
    values = X[column]
    outcome = stats.ttest_ind(values[benign_rows], values[malignant_rows])
    record = {
        'Feature': column,
        'T-statistic': outcome.statistic,
        'P-value': outcome.pvalue,
    }
    results.append(record)

# One row per tested feature, plus a view ordered from most to least significant.
results_df = pd.DataFrame(results)
sorted_features = results_df.sort_values(by='P-value')

print("T-Test Results sorted by P-value:")
print(sorted_features)

# Features whose group means do not differ at the 5% level (rendered as the cell output).
print("\nP-value) > 0.05: \n ")
results_df.loc[results_df['P-value'] > 0.05]


T-Test Results sorted by P-value:
                                  Feature  T-statistic        P-value
3                    concave points_worst   -31.054555  1.969100e-124
7          concave points_worst_mean_diff   -19.849779   5.681090e-67
6      concave points_mean_to_worst_ratio   -13.836209   1.010847e-37
10  fractal_dimension_mean_to_worst_ratio    13.647337   7.155977e-37
1                           texture_worst   -12.230981   1.078057e-30
2                        smoothness_worst   -11.066747   6.575144e-26
0                         smoothness_mean    -9.146099   1.051850e-18
8            concave points_z_score_worst    -6.716225   4.541915e-11
9            symmetry_mean_to_worst_ratio     5.110660   4.394815e-07
5          smoothness_mean_to_worst_ratio     4.588244   5.506764e-06
4             texture_mean_to_worst_ratio     4.115146   4.443689e-05

P-value) > 0.05: 
 


Unnamed: 0,Feature,T-statistic,P-value


**Nonparametric Wilcoxon Rank-Sum**

In [20]:
from scipy.stats import ranksums

# Row masks for the two diagnosis groups, built once and reused for every column.
benign_rows = y == 0      # Benign group
malignant_rows = y == 1   # Malignant group

# Wilcoxon rank-sum test per skewed feature; one result record per column.
results = []
for column in non_para_columns:
    values = X[column]
    outcome = ranksums(values[benign_rows], values[malignant_rows])
    record = {
        'Feature': column,
        'Statistic': outcome.statistic,
        'P-value': outcome.pvalue,
    }
    results.append(record)

# Collect the records into a frame and show them from most to least significant.
results_df = pd.DataFrame(results)
sorted_features = results_df.sort_values(by='P-value')
print("Wilcoxon Rank-Sum Test Results sorted by P-value:")
print(sorted_features)


Wilcoxon Rank-Sum Test Results sorted by P-value:
                               Feature  Statistic       P-value
20                     perimeter_worst -18.978444  2.570996e-80
19                        radius_worst -18.778554  1.131190e-78
21                          area_worst -18.754029  1.794645e-78
6                  concave points_mean -18.538845  1.003546e-76
39                area_worst_mean_diff -18.516693  1.514588e-76
35           perimeter_worst_mean_diff -18.010904  1.599948e-72
2                       perimeter_mean -17.838703  3.538114e-71
28              radius_worst_mean_diff -17.650153  1.014809e-69
3                            area_mean -17.496148  1.532915e-68
5                       concavity_mean -17.476634  2.158718e-68
0                          radius_mean -17.464240  2.682507e-68
12                             area_se -17.020949  5.743236e-65
23                     concavity_worst -16.819477  1.757032e-63
52     concave points_se_to_mean_ratio  16.293382  1.0

### univariable feature selection - results
<u>engineered features:</u> 

Our univariate analysis showed that the engineered features did not separate the target classes as effectively as the original features. 

In [22]:
# Rebuild the modelling frame from the loaded data: this discards the engineered
# columns added to the previous df2 and removes the 'id' column.
df2 = df.copy().drop(columns=['id'])

# Target vector and feature matrix taken from the reduced frame.
y = df2.loc[:, 'diagnosis']
X = df2.drop(columns='diagnosis')

# Print column names, non-null counts and dtypes of the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   radius_mean              569 non-null    float64
 1   texture_mean             569 non-null    float64
 2   perimeter_mean           569 non-null    float64
 3   area_mean                569 non-null    float64
 4   smoothness_mean          569 non-null    float64
 5   compactness_mean         569 non-null    float64
 6   concavity_mean           569 non-null    float64
 7   concave points_mean      569 non-null    float64
 8   symmetry_mean            569 non-null    float64
 9   fractal_dimension_mean   569 non-null    float64
 10  radius_se                569 non-null    float64
 11  texture_se               569 non-null    float64
 12  perimeter_se             569 non-null    float64
 13  area_se                  569 non-null    float64
 14  smoothness_se            5

# Saving to pickle file

In [24]:
# Print column names, non-null counts and dtypes of the frame that is about to be saved.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int8   
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [44]:
# Persist the final frame (diagnosis + original features, 'id' removed) as a pickle file
# in the current working directory; an existing file of the same name is overwritten.
df2.to_pickle('df_final_features.pkl')

# In summary

removed the 'id' column as it was not relevant for prediction.

removed calculated columns.


**Started with 32 original columns.**

**We are left with 31 columns for model building.** 